Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
206 lines
7.0 KiB
Python
206 lines
7.0 KiB
Python
#!/usr/bin/env python3
|
|
"""Parse ans.xhtml into structured answers.json.
|
|
|
|
Output schema:
|
|
{
|
|
"answers": {
|
|
"1.1": {
|
|
"id": "1.1",
|
|
"anchor": "ch1ans1",
|
|
"chapter": 1,
|
|
"subparts": [
|
|
{"label": null, "items": [
|
|
{"number": 1, "answer": "el", "alternates": []},
|
|
{"number": 2, "answer": "el", "alternates": []},
|
|
...
|
|
]}
|
|
],
|
|
"freeform": false, # true if "Answers will vary"
|
|
"raw": "..." # raw text for fallback
|
|
},
|
|
"2.4": { # multi-part exercise
|
|
"subparts": [
|
|
{"label": "A", "items": [...]},
|
|
{"label": "B", "items": [...]},
|
|
{"label": "C", "items": [...]}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from bs4 import BeautifulSoup, NavigableString
|
|
|
|
ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
|
|
OUT = Path(__file__).resolve().parent / "answers.json"
|
|
|
|
ANSWER_CLASSES = {"answerq", "answerq1", "answerq2", "answerqa"}
|
|
EXERCISE_ID_RE = re.compile(r"^([0-9]+)\.([0-9]+)$")
|
|
SUBPART_LABEL_RE = re.compile(r"^([A-Z])\b")
|
|
NUMBERED_ITEM_RE = re.compile(r"(?:^|\s)(\d+)\.\s+")
|
|
FREEFORM_PATTERNS = [
|
|
re.compile(r"answers? will vary", re.IGNORECASE),
|
|
re.compile(r"answer will vary", re.IGNORECASE),
|
|
]
|
|
OR_TOKEN = "{{OR}}"
|
|
|
|
|
|
def render_with_or(p) -> str:
    """Flatten a <p> element into plain text.

    ``<span class="small">OR</span>`` markers (which separate alternate
    answers in the book) are replaced with the ``OR_TOKEN`` sentinel so a
    later pass can split alternates; pagebreak spans are dropped and
    emphasis tags are unwrapped so only their text survives.
    """
    # Re-parse a string copy so the caller's tree is never mutated.
    local = BeautifulSoup(str(p), "lxml")

    # An "OR" marker between alternates becomes the sentinel token.
    for node in local.find_all("span"):
        node_classes = node.get("class") or []
        if "small" in node_classes and node.get_text(strip=True).upper() == "OR":
            node.replace_with(f" {OR_TOKEN} ")

    # Pagebreak spans carry no answer text; remove them entirely.
    for node in local.find_all("span", attrs={"epub:type": "pagebreak"}):
        node.decompose()

    # Keep the text of emphasis tags, discard the markup.
    for node in local.find_all(["em", "i", "strong", "b"]):
        node.unwrap()

    flat = local.get_text(separator=" ", strip=False)
    return re.sub(r"\s+", " ", flat).strip()
|
|
|
|
|
|
def split_numbered_items(text: str) -> "list[dict]":
    """Split '1. el 2. la 3. el ...' into item dicts.

    Each dict has 'number', 'answer' (the first variant), and
    'alternates' (any further variants separated by OR_TOKEN).
    """
    tokens = list(NUMBERED_ITEM_RE.finditer(text))
    results: "list[dict]" = []
    for idx, token in enumerate(tokens):
        # The body of item idx runs from just past "N. " up to the next
        # numbered token (or the end of the text), minus trailing punctuation.
        tail = tokens[idx + 1].start() if idx + 1 < len(tokens) else len(text)
        segment = text[token.end():tail].strip().rstrip(".,;")
        # OR_TOKEN separates the primary answer from accepted alternates.
        variants = [v.strip() for v in segment.split(OR_TOKEN) if v.strip()]
        if not variants:
            continue
        results.append(
            {
                "number": int(token.group(1)),
                "answer": variants[0],
                "alternates": variants[1:],
            }
        )
    return results
|
|
|
|
|
|
def parse_subpart_label(text: str) -> "tuple[str | None, str]":
    """Peel a leading subpart letter (A, B, C, ...) off *text*.

    The label is recognized only when a digit follows the whitespace
    after it (e.g. "A  1. el ..."), which distinguishes a real subpart
    header from an answer that merely starts with a capital letter.

    Returns (label_or_None, remaining_text).
    """
    head = re.match(r"^([A-Z])\s+(?=\d)", text)
    if head is None:
        return None, text
    return head.group(1), text[head.end():]
|
|
|
|
|
|
def parse_answer_paragraph(p, exercise_id: str) -> "list[dict]":
    """Turn one answer <p> into a single-element list of subpart dicts.

    For p.answerq, the text typically opens with the exercise id, then
    the numbered items. For p.answerqa continuation paragraphs, it opens
    with a subpart letter instead.
    """
    text = render_with_or(p)
    # Drop the leading exercise id when present — it is navigation noise.
    text = re.sub(rf"^{re.escape(exercise_id)}\s*", "", text)

    label, remainder = parse_subpart_label(text)

    # "Answers will vary"-style exercises have no gradeable items.
    if any(pattern.search(remainder) for pattern in FREEFORM_PATTERNS):
        return [{"label": label, "items": [], "freeform": True, "raw": remainder}]

    return [
        {
            "label": label,
            "items": split_numbered_items(remainder),
            "freeform": False,
            "raw": remainder,
        }
    ]
|
|
|
|
|
|
def main() -> None:
    """Parse ans.xhtml into answers.json and print summary statistics.

    Walks the answer key's <h3>/<p> stream in document order, tracking
    the current chapter and current exercise so continuation paragraphs
    (which carry no exercise-id anchor) attach to the right entry.
    """
    src = ROOT / "ans.xhtml"
    soup = BeautifulSoup(src.read_text(encoding="utf-8"), "lxml")
    # Fall back to the whole document if there is no explicit <body>.
    body = soup.find("body") or soup

    answers: dict = {}
    current_chapter = None
    # The exercise that continuation paragraphs should be appended to.
    current_exercise_id: "str | None" = None

    for el in body.find_all(["h3", "p"]):
        classes = set(el.get("class") or [])

        # Chapter boundary: "Chapter N" headings reset the running state.
        if el.name == "h3" and "h3b" in classes:
            text = el.get_text(strip=True)
            m = re.search(r"Chapter\s+(\d+)", text)
            if m:
                current_chapter = int(m.group(1))
                current_exercise_id = None
            continue

        if el.name != "p" or not (classes & ANSWER_CLASSES):
            continue

        # Find the exercise-id anchor (only present on p.answerq, not on
        # continuation paragraphs).
        a = el.find("a", href=True)
        ex_link = None
        if a:
            link_text = a.get_text(strip=True)
            if EXERCISE_ID_RE.match(link_text):
                ex_link = link_text

        if ex_link:
            current_exercise_id = ex_link
            href = a.get("href", "")
            # The entry id (e.g. "ch1ans1") may live in the href fragment
            # or in the anchor's own `id` attribute; prefer the id attr.
            anchor_m = re.search(r"#(ch\d+ans\d+)", href + " " + (a.get("id") or ""))
            anchor = anchor_m.group(1) if anchor_m else (a.get("id") or "")
            entry_id = a.get("id") or anchor

            answers[ex_link] = {
                "id": ex_link,
                "anchor": entry_id,
                "chapter": current_chapter,
                "subparts": [],
                "freeform": False,
                "raw": "",
            }
            new_subparts = parse_answer_paragraph(el, ex_link)
            answers[ex_link]["subparts"].extend(new_subparts)
            answers[ex_link]["raw"] = render_with_or(el)
            answers[ex_link]["freeform"] = any(sp["freeform"] for sp in new_subparts)
        else:
            # Continuation paragraph for the current exercise.
            if current_exercise_id and current_exercise_id in answers:
                more = parse_answer_paragraph(el, current_exercise_id)
                answers[current_exercise_id]["subparts"].extend(more)
                if any(sp["freeform"] for sp in more):
                    answers[current_exercise_id]["freeform"] = True

    out = {"answers": answers}
    # ensure_ascii=False emits raw accented characters, so the encoding
    # must be pinned; the write_text default is locale-dependent.
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

    total = len(answers)
    freeform = sum(1 for v in answers.values() if v["freeform"])
    multipart = sum(1 for v in answers.values() if len(v["subparts"]) > 1)
    total_items = sum(
        len(sp["items"]) for v in answers.values() for sp in v["subparts"]
    )
    with_alternates = sum(
        1 for v in answers.values()
        for sp in v["subparts"] for it in sp["items"]
        if it["alternates"]
    )
    print(f"Exercises with answers: {total}")
    print(f"  freeform: {freeform}")
    print(f"  multi-part (A/B/C): {multipart}")
    print(f"  total numbered items: {total_items}")
    print(f"  items with alternates:{with_alternates}")
    print(f"Wrote {OUT}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|