#!/usr/bin/env python3
"""Parse ans.xhtml into structured answers.json.

Output schema:
{
  "answers": {
    "1.1": {
      "id": "1.1",
      "anchor": "ch1ans1",
      "chapter": 1,
      "subparts": [
        {"label": null, "items": [
          {"number": 1, "answer": "el", "alternates": []},
          {"number": 2, "answer": "el", "alternates": []},
          ...
        ]}
      ],
      "freeform": false,   # true if "Answers will vary"
      "raw": "..."         # raw text for fallback
    },
    "2.4": {               # multi-part exercise
      "subparts": [
        {"label": "A", "items": [...]},
        {"label": "B", "items": [...]},
        {"label": "C", "items": [...]}
      ]
    }
  }
}
"""
import json
import re
from pathlib import Path

from bs4 import BeautifulSoup, NavigableString

# Input directory (resolved relative to this file) and output JSON path.
ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
OUT = Path(__file__).resolve().parent / "answers.json"

# CSS classes of the <p> elements that main() treats as answer paragraphs.
ANSWER_CLASSES = {"answerq", "answerq1", "answerq2", "answerqa"}
# Link text of the form "<digits>.<digits>", e.g. "1.1".
EXERCISE_ID_RE = re.compile(r"^([0-9]+)\.([0-9]+)$")
# A single capital letter standing alone at the start of the text.
SUBPART_LABEL_RE = re.compile(r"^([A-Z])\b")
# An "N." item marker at the start of the text or after whitespace.
NUMBERED_ITEM_RE = re.compile(r"(?:^|\s)(\d+)\.\s+")
FREEFORM_PATTERNS = [
    re.compile(r"answers? will vary", re.IGNORECASE),
    re.compile(r"answer will vary", re.IGNORECASE),
]
# Sentinel substituted for "OR" separator spans in rendered text.
OR_TOKEN = "{{OR}}"


def render_with_or(p) -> str:
    """Flatten a paragraph element to one line of plain text.

    The markup of *p* (only ``str(p)`` is used) is re-parsed, then:

    * every ``<span>`` carrying the class ``small`` whose text is "OR"
      (case-insensitive) is replaced by ``OR_TOKEN`` padded with spaces;
    * ``<span epub:type="pagebreak">`` elements are removed entirely;
    * ``em``/``i``/``strong``/``b`` tags are unwrapped, keeping their text.

    Returns the remaining text with every whitespace run collapsed to a
    single space and the ends trimmed.
    """
    # Work on a private re-parse of the markup so the caller's tree is
    # left untouched by the replacements below.
    fragment = BeautifulSoup(str(p), "lxml")

    for node in fragment.find_all("span"):
        if "small" not in (node.get("class") or []):
            continue
        if node.get_text(strip=True).upper() == "OR":
            node.replace_with(f" {OR_TOKEN} ")

    for marker in fragment.find_all("span", attrs={"epub:type": "pagebreak"}):
        marker.decompose()

    for wrapper in fragment.find_all(["em", "i", "strong", "b"]):
        wrapper.unwrap()

    flat = fragment.get_text(separator=" ", strip=False)
    return re.sub(r"\s+", " ", flat).strip()
def split_numbered_items(text: str) -> "list[dict]":
    """Break a run of numbered answers into one record per item.

    For input like ``'1. el 2. la 3. el'`` the result is a list of dicts
    with the keys ``number`` (int), ``answer`` (first variant) and
    ``alternates`` (remaining variants, split on ``OR_TOKEN``).  Text before
    the first "N." marker is ignored, and markers whose body is empty are
    skipped.
    """
    markers = list(NUMBERED_ITEM_RE.finditer(text))
    # An item's body ends where the next marker begins; the last item runs
    # to the end of the text.
    stops = [nxt.start() for nxt in markers[1:]] + [len(text)]

    records = []
    for marker, stop in zip(markers, stops):
        number = int(marker.group(1))
        # Trailing sentence punctuation is not part of the answer.
        chunk = text[marker.end():stop].strip().rstrip(".,;")
        variants = [
            piece for piece in (raw.strip() for raw in chunk.split(OR_TOKEN))
            if piece
        ]
        if variants:
            first, *others = variants
            records.append({
                "number": number,
                "answer": first,
                "alternates": others,
            })
    return records
def parse_subpart_label(text: str) -> "tuple[str | None, str]":
"""Try to peel a leading subpart label (A, B, C) from the text.
Returns (label_or_None, remaining_text)."""
# Pattern at start: "A " or "A " (lots of whitespace from A into a list of subparts.
For p.answerq, the text typically starts with the exercise id, then items.
For p.answerqa, the text starts with a subpart label letter."""
raw = render_with_or(p)
# Strip the leading exercise id if present
raw = re.sub(rf"^{re.escape(exercise_id)}\s*", "", raw)
label, body = parse_subpart_label(raw)
# Detect freeform
freeform = any(pat.search(body) for pat in FREEFORM_PATTERNS)
if freeform:
return [{"label": label, "items": [], "freeform": True, "raw": body}]
items = split_numbered_items(body)
return [{"label": label, "items": items, "freeform": False, "raw": body}]
def main() -> None:
    """Parse ``ans.xhtml`` under ROOT and write the result to OUT as JSON.

    Every ``<h3>`` and ``<p>`` is visited in document order:

    * an ``h3.h3b`` heading containing "Chapter N" sets the current chapter
      and ends the current exercise;
    * an answer paragraph (class in ``ANSWER_CLASSES``) whose first link
      with an ``href`` has exercise-id text ("1.1") starts a new entry;
    * any other answer paragraph is a continuation whose subparts are
      appended to the current exercise, if there is one.

    Summary counts are printed to stdout after the file is written.
    """
    src = ROOT / "ans.xhtml"
    soup = BeautifulSoup(src.read_text(encoding="utf-8"), "lxml")
    body = soup.find("body")
    if body is None:
        # No <body> in the parsed tree (e.g. an empty input file): scan the
        # whole document instead of failing on None.
        body = soup

    answers: dict = {}
    current_chapter = None
    current_exercise_id: "str | None" = None

    for el in body.find_all(["h3", "p"]):
        classes = set(el.get("class") or [])

        # Chapter boundary
        if el.name == "h3" and "h3b" in classes:
            m = re.search(r"Chapter\s+(\d+)", el.get_text(strip=True))
            if m:
                current_chapter = int(m.group(1))
                current_exercise_id = None
            continue

        if el.name != "p" or not (classes & ANSWER_CLASSES):
            continue

        # The exercise-id link is only present on the paragraph that opens
        # an exercise, not on its continuation paragraphs.
        a = el.find("a", href=True)
        ex_link = None
        if a:
            link_text = a.get_text(strip=True)
            if EXERCISE_ID_RE.match(link_text):
                ex_link = link_text

        if not ex_link:
            # Continuation paragraph for the current exercise
            if current_exercise_id and current_exercise_id in answers:
                more = parse_answer_paragraph(el, current_exercise_id)
                entry = answers[current_exercise_id]
                entry["subparts"].extend(more)
                if any(sp["freeform"] for sp in more):
                    entry["freeform"] = True
            continue

        current_exercise_id = ex_link
        # Prefer the link's own id (e.g. "ch1ans1"); otherwise fall back to
        # a "#chNansM" fragment in its href, or "" when neither exists.
        entry_id = a.get("id") or ""
        if not entry_id:
            anchor_m = re.search(r"#(ch\d+ans\d+)", a.get("href", ""))
            entry_id = anchor_m.group(1) if anchor_m else ""

        new_subparts = parse_answer_paragraph(el, ex_link)
        answers[ex_link] = {
            "id": ex_link,
            "anchor": entry_id,
            "chapter": current_chapter,
            "subparts": list(new_subparts),
            "freeform": any(sp["freeform"] for sp in new_subparts),
            "raw": render_with_or(el),
        }

    out = {"answers": answers}
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding
    # must be pinned instead of inherited from the platform locale.
    OUT.write_text(
        json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    total = len(answers)
    freeform = sum(1 for v in answers.values() if v["freeform"])
    multipart = sum(1 for v in answers.values() if len(v["subparts"]) > 1)
    total_items = sum(
        len(sp["items"]) for v in answers.values() for sp in v["subparts"]
    )
    with_alternates = sum(
        1 for v in answers.values()
        for sp in v["subparts"] for it in sp["items"]
        if it["alternates"]
    )
    print(f"Exercises with answers: {total}")
    print(f" freeform: {freeform}")
    print(f" multi-part (A/B/C): {multipart}")
    print(f" total numbered items: {total_items}")
    print(f" items with alternates:{with_alternates}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()