#!/usr/bin/env python3
"""Parse ans.xhtml into structured answers.json.

Output schema:

    {
      "answers": {
        "1.1": {
          "id": "1.1",
          "anchor": "ch1ans1",
          "chapter": 1,
          "subparts": [
            {"label": null, "items": [
              {"number": 1, "answer": "el", "alternates": []},
              {"number": 2, "answer": "el", "alternates": []},
              ...
            ]}
          ],
          "freeform": false,  # true if "Answers will vary"
          "raw": "..."        # raw text for fallback
        },
        "2.4": {  # multi-part exercise
          "subparts": [
            {"label": "A", "items": [...]},
            {"label": "B", "items": [...]},
            {"label": "C", "items": [...]}
          ]
        }
      }
    }
"""

import json
import re
from pathlib import Path

from bs4 import BeautifulSoup, NavigableString  # NOTE(review): NavigableString is unused here

# Source EPUB content directory and output path, resolved relative to this file.
ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
OUT = Path(__file__).resolve().parent / "answers.json"

# CSS classes that mark answer paragraphs in ans.xhtml.
ANSWER_CLASSES = {"answerq", "answerq1", "answerq2", "answerqa"}
# Exercise ids look like "1.1", "12.3" (chapter.exercise).
EXERCISE_ID_RE = re.compile(r"^([0-9]+)\.([0-9]+)$")
# NOTE(review): currently unused; parse_subpart_label uses its own inline pattern.
SUBPART_LABEL_RE = re.compile(r"^([A-Z])\b")
# "1. el 2. la ..." item markers: a number followed by a dot and whitespace.
NUMBERED_ITEM_RE = re.compile(r"(?:^|\s)(\d+)\.\s+")
# NOTE(review): the second pattern is subsumed by the first ("answers?" already
# matches "answer"); kept for fidelity with the original list.
FREEFORM_PATTERNS = [
    re.compile(r"answers? will vary", re.IGNORECASE),
    re.compile(r"answer will vary", re.IGNORECASE),
]
# Sentinel inserted in place of the "OR" marker spans so alternates can be split later.
OR_TOKEN = "{{OR}}"


def render_with_or(p) -> str:
    """Convert a <p> element to plain text, replacing 'OR' span markers with a sentinel.

    Emphasis tags are unwrapped (text kept), pagebreak spans are dropped
    entirely, and all runs of whitespace are collapsed to single spaces.
    """
    # Re-parse a copy so mutations below don't touch the caller's tree.
    soup = BeautifulSoup(str(p), "lxml")
    # Replace OR marker spans (class="small", text "OR") with the sentinel.
    for span in soup.find_all("span"):
        cls = span.get("class") or []
        if "small" in cls and span.get_text(strip=True).upper() == "OR":
            span.replace_with(f" {OR_TOKEN} ")
    # Drop pagebreak spans (their text is page numbers, not answer content).
    for span in soup.find_all("span", attrs={"epub:type": "pagebreak"}):
        span.decompose()
    # Drop emphasis markup but keep the text.
    for tag in soup.find_all(["em", "i", "strong", "b"]):
        tag.unwrap()
    text = soup.get_text(separator=" ", strip=False)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def split_numbered_items(text: str) -> "list[dict]":
    """Given '1. el 2. la 3. el ...' return [{'number':1,'answer':'el'}, ...].

    Each item's body is split on OR_TOKEN: the first part becomes 'answer',
    the rest become 'alternates'.
    """
    # Find positions of "N." tokens; each item's body runs to the next token.
    matches = list(NUMBERED_ITEM_RE.finditer(text))
    items = []
    for i, m in enumerate(matches):
        num = int(m.group(1))
        start = m.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        body = text[start:end].strip().rstrip(".,;")
        # Split alternates on the OR token.
        parts = [p.strip() for p in body.split(OR_TOKEN) if p.strip()]
        if not parts:
            continue  # empty item (e.g. trailing number with no text)
        items.append({
            "number": num,
            "answer": parts[0],
            "alternates": parts[1:],
        })
    return items


def parse_subpart_label(text: str) -> "tuple[str | None, str]":
    """Try to peel a leading subpart label (A, B, C) from the text.

    Returns (label_or_None, remaining_text).
    """
    # Pattern at start: a single capital letter, whitespace, then a digit
    # (the first numbered item) — avoids eating answers that merely start
    # with a capitalized word.
    m = re.match(r"^([A-Z])\s+(?=\d)", text)
    if m:
        return m.group(1), text[m.end():]
    return None, text


def parse_answer_paragraph(p, exercise_id: str) -> "list[dict]":
    """Convert one <p> element into a list of subpart dicts.

    For p.answerq, the text typically starts with the exercise id, then items.
    For p.answerqa, the text starts with a subpart label letter.
    Each returned dict has keys: label, items, freeform, raw.
    """
    raw = render_with_or(p)
    # Strip the leading exercise id if present.
    raw = re.sub(rf"^{re.escape(exercise_id)}\s*", "", raw)
    label, body = parse_subpart_label(raw)
    # Detect freeform ("Answers will vary") paragraphs — no numbered items.
    freeform = any(pat.search(body) for pat in FREEFORM_PATTERNS)
    if freeform:
        return [{"label": label, "items": [], "freeform": True, "raw": body}]
    items = split_numbered_items(body)
    return [{"label": label, "items": items, "freeform": False, "raw": body}]


def main() -> None:
    """Parse ROOT/ans.xhtml, write OUT (answers.json), and print summary stats."""
    src = ROOT / "ans.xhtml"
    soup = BeautifulSoup(src.read_text(encoding="utf-8"), "lxml")
    body = soup.find("body")

    answers: dict = {}
    current_chapter = None
    current_exercise_id: "str | None" = None

    # Walk headings and paragraphs in document order so chapter boundaries
    # and continuation paragraphs are attributed correctly.
    for el in body.find_all(["h3", "p"]):
        classes = set(el.get("class") or [])
        # Chapter boundary: <h3 class="h3b">Chapter N ...</h3>
        if el.name == "h3" and "h3b" in classes:
            text = el.get_text(strip=True)
            m = re.search(r"Chapter\s+(\d+)", text)
            if m:
                current_chapter = int(m.group(1))
                current_exercise_id = None
            continue
        if el.name != "p" or not (classes & ANSWER_CLASSES):
            continue
        # Find the exercise-id anchor (only present on p.answerq, not on
        # continuation paragraphs).
        a = el.find("a", href=True)
        ex_link = None
        if a:
            link_text = a.get_text(strip=True)
            if EXERCISE_ID_RE.match(link_text):
                ex_link = link_text
        if ex_link:
            # New exercise entry.
            current_exercise_id = ex_link
            anchor = ""
            href = a.get("href", "")
            anchor_m = re.search(r"#(ch\d+ans\d+)", href + " " + (a.get("id") or ""))
            anchor = anchor_m.group(1) if anchor_m else (a.get("id") or "")
            # Use the anchor's `id` attr if it's the entry id (e.g. "ch1ans1").
            entry_id = a.get("id") or anchor
            answers[ex_link] = {
                "id": ex_link,
                "anchor": entry_id,
                "chapter": current_chapter,
                "subparts": [],
                "freeform": False,
                "raw": "",
            }
            new_subparts = parse_answer_paragraph(el, ex_link)
            answers[ex_link]["subparts"].extend(new_subparts)
            answers[ex_link]["raw"] = render_with_or(el)
            answers[ex_link]["freeform"] = any(sp["freeform"] for sp in new_subparts)
        else:
            # Continuation paragraph (e.g. subpart B/C) for the current exercise.
            if current_exercise_id and current_exercise_id in answers:
                more = parse_answer_paragraph(el, current_exercise_id)
                answers[current_exercise_id]["subparts"].extend(more)
                if any(sp["freeform"] for sp in more):
                    answers[current_exercise_id]["freeform"] = True

    out = {"answers": answers}
    # encoding= is required: ensure_ascii=False leaves non-ASCII characters in
    # the JSON, and write_text would otherwise use the locale's encoding
    # (UnicodeEncodeError on e.g. Windows cp1252).
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

    # Summary statistics for a quick sanity check of the parse.
    total = len(answers)
    freeform = sum(1 for v in answers.values() if v["freeform"])
    multipart = sum(1 for v in answers.values() if len(v["subparts"]) > 1)
    total_items = sum(
        len(sp["items"]) for v in answers.values() for sp in v["subparts"]
    )
    with_alternates = sum(
        1
        for v in answers.values()
        for sp in v["subparts"]
        for it in sp["items"]
        if it["alternates"]
    )
    print(f"Exercises with answers: {total}")
    print(f"  freeform: {freeform}")
    print(f"  multi-part (A/B/C): {multipart}")
    print(f"  total numbered items: {total_items}")
    print(f"  items with alternates:{with_alternates}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()