Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
206 lines
7.0 KiB
Python
206 lines
7.0 KiB
Python
#!/usr/bin/env python3
|
|
"""Parse ans.xhtml into structured answers.json.
|
|
|
|
Output schema:
|
|
{
|
|
"answers": {
|
|
"1.1": {
|
|
"id": "1.1",
|
|
"anchor": "ch1ans1",
|
|
"chapter": 1,
|
|
"subparts": [
|
|
{"label": null, "items": [
|
|
{"number": 1, "answer": "el", "alternates": []},
|
|
{"number": 2, "answer": "el", "alternates": []},
|
|
...
|
|
]}
|
|
],
|
|
"freeform": false, # true if "Answers will vary"
|
|
"raw": "..." # raw text for fallback
|
|
},
|
|
"2.4": { # multi-part exercise
|
|
"subparts": [
|
|
{"label": "A", "items": [...]},
|
|
{"label": "B", "items": [...]},
|
|
{"label": "C", "items": [...]}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from bs4 import BeautifulSoup, NavigableString
|
|
|
|
ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
|
|
OUT = Path(__file__).resolve().parent / "answers.json"
|
|
|
|
ANSWER_CLASSES = {"answerq", "answerq1", "answerq2", "answerqa"}
|
|
EXERCISE_ID_RE = re.compile(r"^([0-9]+)\.([0-9]+)$")
|
|
SUBPART_LABEL_RE = re.compile(r"^([A-Z])\b")
|
|
NUMBERED_ITEM_RE = re.compile(r"(?:^|\s)(\d+)\.\s+")
|
|
FREEFORM_PATTERNS = [
|
|
re.compile(r"answers? will vary", re.IGNORECASE),
|
|
re.compile(r"answer will vary", re.IGNORECASE),
|
|
]
|
|
OR_TOKEN = "{{OR}}"
|
|
|
|
|
|
def render_with_or(p) -> str:
    """Flatten a <p> element into plain text.

    ``<span class="small">OR</span>`` markers (which separate alternate
    answers in the book) are replaced with the ``OR_TOKEN`` sentinel so a
    later pass can split alternates; pagebreak spans are dropped and
    emphasis tags are unwrapped so only their text survives.
    """
    # Re-parse a string copy so the caller's tree is never mutated.
    local = BeautifulSoup(str(p), "lxml")

    # An "OR" marker between alternates becomes the sentinel token.
    for node in local.find_all("span"):
        node_classes = node.get("class") or []
        if "small" in node_classes and node.get_text(strip=True).upper() == "OR":
            node.replace_with(f" {OR_TOKEN} ")

    # Pagebreak spans carry no answer text; remove them entirely.
    for node in local.find_all("span", attrs={"epub:type": "pagebreak"}):
        node.decompose()

    # Keep the text of emphasis tags, discard the markup.
    for node in local.find_all(["em", "i", "strong", "b"]):
        node.unwrap()

    flat = local.get_text(separator=" ", strip=False)
    return re.sub(r"\s+", " ", flat).strip()
|
|
|
|
|
|
def split_numbered_items(text: str) -> "list[dict]":
    """Split '1. el 2. la 3. el ...' into item dicts.

    Each dict has 'number', 'answer' (the first variant), and
    'alternates' (any further variants separated by OR_TOKEN).
    """
    tokens = list(NUMBERED_ITEM_RE.finditer(text))
    results: "list[dict]" = []
    for idx, token in enumerate(tokens):
        # The body of item idx runs from just past "N. " up to the next
        # numbered token (or the end of the text), minus trailing punctuation.
        tail = tokens[idx + 1].start() if idx + 1 < len(tokens) else len(text)
        segment = text[token.end():tail].strip().rstrip(".,;")
        # OR_TOKEN separates the primary answer from accepted alternates.
        variants = [v.strip() for v in segment.split(OR_TOKEN) if v.strip()]
        if not variants:
            continue
        results.append(
            {
                "number": int(token.group(1)),
                "answer": variants[0],
                "alternates": variants[1:],
            }
        )
    return results
|
|
|
|
|
|
def parse_subpart_label(text: str) -> "tuple[str | None, str]":
    """Peel a leading subpart letter (A, B, C, ...) off *text*.

    The label is recognized only when a digit follows the whitespace
    after it (e.g. "A  1. el ..."), which distinguishes a real subpart
    header from an answer that merely starts with a capital letter.

    Returns (label_or_None, remaining_text).
    """
    head = re.match(r"^([A-Z])\s+(?=\d)", text)
    if head is None:
        return None, text
    return head.group(1), text[head.end():]
|
|
|
|
|
|
def parse_answer_paragraph(p, exercise_id: str) -> "list[dict]":
    """Turn one answer <p> into a single-element list of subpart dicts.

    For p.answerq, the text typically opens with the exercise id, then
    the numbered items. For p.answerqa continuation paragraphs, it opens
    with a subpart letter instead.
    """
    text = render_with_or(p)
    # Drop the leading exercise id when present — it is navigation noise.
    text = re.sub(rf"^{re.escape(exercise_id)}\s*", "", text)

    label, remainder = parse_subpart_label(text)

    # "Answers will vary"-style exercises have no gradeable items.
    if any(pattern.search(remainder) for pattern in FREEFORM_PATTERNS):
        return [{"label": label, "items": [], "freeform": True, "raw": remainder}]

    return [
        {
            "label": label,
            "items": split_numbered_items(remainder),
            "freeform": False,
            "raw": remainder,
        }
    ]
|
|
|
|
|
|
def main() -> None:
    """Parse ans.xhtml into answers.json and print summary statistics.

    Walks the answer key's <h3>/<p> stream in document order, tracking
    the current chapter and current exercise so continuation paragraphs
    (which carry no exercise-id anchor) attach to the right entry.
    """
    src = ROOT / "ans.xhtml"
    soup = BeautifulSoup(src.read_text(encoding="utf-8"), "lxml")
    # Fall back to the whole document if there is no explicit <body>.
    body = soup.find("body") or soup

    answers: dict = {}
    current_chapter = None
    # The exercise that continuation paragraphs should be appended to.
    current_exercise_id: "str | None" = None

    for el in body.find_all(["h3", "p"]):
        classes = set(el.get("class") or [])

        # Chapter boundary: "Chapter N" headings reset the running state.
        if el.name == "h3" and "h3b" in classes:
            text = el.get_text(strip=True)
            m = re.search(r"Chapter\s+(\d+)", text)
            if m:
                current_chapter = int(m.group(1))
                current_exercise_id = None
            continue

        if el.name != "p" or not (classes & ANSWER_CLASSES):
            continue

        # Find the exercise-id anchor (only present on p.answerq, not on
        # continuation paragraphs).
        a = el.find("a", href=True)
        ex_link = None
        if a:
            link_text = a.get_text(strip=True)
            if EXERCISE_ID_RE.match(link_text):
                ex_link = link_text

        if ex_link:
            current_exercise_id = ex_link
            href = a.get("href", "")
            # The entry id (e.g. "ch1ans1") may live in the href fragment
            # or in the anchor's own `id` attribute; prefer the id attr.
            anchor_m = re.search(r"#(ch\d+ans\d+)", href + " " + (a.get("id") or ""))
            anchor = anchor_m.group(1) if anchor_m else (a.get("id") or "")
            entry_id = a.get("id") or anchor

            answers[ex_link] = {
                "id": ex_link,
                "anchor": entry_id,
                "chapter": current_chapter,
                "subparts": [],
                "freeform": False,
                "raw": "",
            }
            new_subparts = parse_answer_paragraph(el, ex_link)
            answers[ex_link]["subparts"].extend(new_subparts)
            answers[ex_link]["raw"] = render_with_or(el)
            answers[ex_link]["freeform"] = any(sp["freeform"] for sp in new_subparts)
        else:
            # Continuation paragraph for the current exercise.
            if current_exercise_id and current_exercise_id in answers:
                more = parse_answer_paragraph(el, current_exercise_id)
                answers[current_exercise_id]["subparts"].extend(more)
                if any(sp["freeform"] for sp in more):
                    answers[current_exercise_id]["freeform"] = True

    out = {"answers": answers}
    # ensure_ascii=False emits raw accented characters, so the encoding
    # must be pinned; the write_text default is locale-dependent.
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

    total = len(answers)
    freeform = sum(1 for v in answers.values() if v["freeform"])
    multipart = sum(1 for v in answers.values() if len(v["subparts"]) > 1)
    total_items = sum(
        len(sp["items"]) for v in answers.values() for sp in v["subparts"]
    )
    with_alternates = sum(
        1 for v in answers.values()
        for sp in v["subparts"] for it in sp["items"]
        if it["alternates"]
    )
    print(f"Exercises with answers: {total}")
    print(f"  freeform: {freeform}")
    print(f"  multi-part (A/B/C): {multipart}")
    print(f"  total numbered items: {total_items}")
    print(f"  items with alternates:{with_alternates}")
    print(f"Wrote {OUT}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|