Files
Spanish/Conjuga/Scripts/textbook/extract_chapters.py
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

370 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Parse all chapter XHTMLs + appendix into structured chapters.json.
Output schema:
{
"chapters": [
{
"id": "ch1",
"number": 1,
"title": "Nouns, Articles, and Adjectives",
"part": 1, # part 1/2/3 or null
"blocks": [ # ordered content
{"kind": "heading", "level": 3, "text": "..."},
{"kind": "paragraph", "text": "...", "hasItalic": false},
{"kind": "key_vocab_header", "title": "Los colores (The colors)"},
{"kind": "vocab_image", "src": "f0010-03.jpg"},
{
"kind": "exercise",
"id": "1.1",
"ans_anchor": "ch1ans1",
"instruction": "Write the appropriate...",
"image_refs": ["f0005-02.jpg"]
},
{"kind": "image", "src": "...", "alt": "..."}
]
}
]
}
"""
import json
import re
from pathlib import Path
from bs4 import BeautifulSoup
# Content root of the unpacked EPUB. parents[3] climbs textbook/ -> Scripts/
# -> Conjuga/ -> the directory that is expected to contain epub_extract/
# (per the repo path Spanish/Conjuga/Scripts/textbook/, that is Spanish/) —
# TODO confirm the epub_extract/ folder actually lives at that level.
ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
# Output JSON lands next to this script.
OUT = Path(__file__).resolve().parent / "chapters.json"
# Common icon images embedded in headings — ignore when collecting content images
ICON_IMAGES = {"Common01.jpg", "Common02.jpg", "Common03.jpg", "Common04.jpg", "Common05.jpg"}
# Matches "Exercise 1.2" in a heading and captures the dotted id ("1.2").
EXERCISE_ID_RE = re.compile(r"Exercise\s+([0-9]+\.[0-9]+)")
# Matches answer-key anchors like "ch1ans3" (captures chapter no., answer no.).
ANS_REF_RE = re.compile(r"ch(\d+)ans(\d+)")
def clean_text(el) -> str:
    """Return the plain text of *el* with whitespace/punctuation normalized.

    All inline emphasis is dropped: the source nests and adjoins em/strong
    in patterns CommonMark can't reliably parse, which leaked markers into
    the UI. Plain text renders cleanly everywhere.

    Returns "" when el is None.
    """
    if el is None:
        return ""
    # Re-parse the element in isolation so unwrap/decompose below can't
    # mutate the caller's tree.
    soup = BeautifulSoup(str(el), "lxml")
    # Drop ALL inline emphasis outright. (An earlier revision pre-flattened
    # nested <strong><em> pairs first; that was dead work once everything is
    # unwrapped anyway, so it has been removed.)
    for tag in soup.find_all(["em", "i", "strong", "b"]):
        tag.unwrap()
    # Drop pagebreak spans
    for tag in soup.find_all("span", attrs={"epub:type": "pagebreak"}):
        tag.decompose()
    # Replace <br/> with newline
    for br in soup.find_all("br"):
        br.replace_with("\n")
    # Use a separator so adjacent inline tags don't concatenate without spaces
    # (e.g. "<strong><em>Ir</em></strong> and" would otherwise become "Irand").
    text = soup.get_text(separator=" ", strip=False)
    # Collapse runs of whitespace first.
    text = re.sub(r"\s+", " ", text).strip()
    # Strip any stray asterisks that sneak through (e.g. author's literal *).
    text = text.replace("*", "")
    # De-space punctuation
    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
    # Tighten brackets that picked up separator-spaces: "( foo )" -> "(foo)"
    text = re.sub(r"([(\[])\s+", r"\1", text)
    text = re.sub(r"\s+([)\]])", r"\1", text)
    # Collapse any double-spaces
    text = re.sub(r" +", " ", text).strip()
    return text
def is_exercise_header(h) -> bool:
    """True for a heading whose <a> link points into ans.xhtml.

    Exercise headers are h3.h3k in chapters 1-16 and h4.h4 in chapters 17+.
    """
    if h.name not in ("h3", "h4"):
        return False
    anchor = h.find("a", href=True)
    return bool(anchor and "ans.xhtml" in anchor["href"])
def is_key_vocab_header(h) -> bool:
    """True for a 'Key Vocabulary' heading that carries no answer-key link."""
    if h.name not in ("h3", "h4"):
        return False
    links_to_answers = h.find("a", href=lambda v: v and "ans.xhtml" in v)
    return "Key Vocabulary" in h.get_text(strip=True) and not links_to_answers
def extract_image_srcs(parent) -> list:
    """Return the src attributes of images under *parent*, minus icon images."""
    return [
        src
        for img in parent.find_all("img")
        if (src := img.get("src", "")) and Path(src).name not in ICON_IMAGES
    ]
def parse_chapter(path: Path) -> "dict | None":
    """Parse one chapter XHTML file into structured blocks.

    Returns {"id", "number", "title", "blocks"} where blocks is an ordered
    list of dicts with kinds: heading, paragraph, exercise,
    key_vocab_header, vocab_image. Returns None when the file has no <body>.
    """
    html = path.read_text(encoding="utf-8")
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("body")
    if body is None:
        return None
    # Chapter number + title
    number = None
    title = ""
    h2s = body.find_all("h2")
    for h2 in h2s:
        classes = h2.get("class") or []
        # Use a separator so consecutive inline tags don't concatenate
        # (e.g. "<strong><em>Ir</em></strong> and the Future" → "Ir and the Future")
        text_with_sep = re.sub(r"\s+", " ", h2.get_text(" ", strip=True))
        # Strip spaces that were inserted before punctuation
        text_with_sep = re.sub(r"\s+([,.;:!?])", r"\1", text_with_sep).strip()
        if "h2c" in classes and text_with_sep.isdigit():
            number = int(text_with_sep)
        # Chapters 1-16 use h2c1; chapters 17+ use h2-c
        elif ("h2c1" in classes or "h2-c" in classes) and not title:
            title = text_with_sep
    if number is None:
        # Fallback: derive the number from the header's id attribute (ch1 → 1)
        for h2 in h2s:
            id_ = h2.get("id", "")
            m = re.match(r"ch(\d+)", id_)
            if m:
                number = int(m.group(1))
                break
    chapter_id = path.stem  # ch1, ch2, ...
    # Walk section content in document order
    section = body.find("section") or body
    blocks: list = []
    # Exercise block currently collecting content (instruction, prompts,
    # images). Any heading resets it to None.
    pending_instruction = None
    for el in section.descendants:
        if el.name is None:
            continue
        classes = el.get("class") or []
        # We operate only on h2/h3/h4/h5/p elements; nested inline tags are
        # captured when their parent heading/paragraph is processed.
        if el.name not in ("h2", "h3", "h4", "h5", "p"):
            continue
        # Exercise header detection (h3 in ch1-16, h4 in ch17+)
        if is_exercise_header(el):
            a = el.find("a", href=True)
            href = a["href"] if a else ""
            m = EXERCISE_ID_RE.search(el.get_text())
            ex_id = m.group(1) if m else ""
            anchor_m = ANS_REF_RE.search(href)
            ans_anchor = anchor_m.group(0) if anchor_m else ""
            blocks.append({
                "kind": "exercise",
                "id": ex_id,
                "ans_anchor": ans_anchor,
                "instruction": "",
                "image_refs": [],
                "prompts": []
            })
            # Subsequent paragraphs feed this exercise until the next heading.
            pending_instruction = blocks[-1]
            continue
        # Key Vocabulary header
        if is_key_vocab_header(el):
            blocks.append({"kind": "key_vocab_header", "title": "Key Vocabulary"})
            pending_instruction = None
            continue
        # Other headings
        if el.name in ("h2", "h3", "h4", "h5"):
            if el.name == "h2":
                # Skip the chapter-number/chapter-title h2s we already captured
                continue
            txt = clean_text(el)
            if txt:
                blocks.append({
                    "kind": "heading",
                    "level": int(el.name[1]),
                    "text": txt,
                })
            # A heading terminates any exercise that was collecting content.
            pending_instruction = None
            continue
        # Paragraphs — note the branch ORDER below matters: prompt class,
        # then image-only, then first instruction, then extra text.
        if el.name == "p":
            imgs = extract_image_srcs(el)
            text = clean_text(el)
            p_classes = set(classes)
            # Skip pure blank-line class ("nump" = underscore lines under number prompts)
            if p_classes & {"nump", "numpa"} and not text:
                continue
            # Exercise prompt: <p class="number">1. Prompt text</p>
            # Also number1, number2 (continuation numbering), numbera, numbert
            if pending_instruction is not None and p_classes & {"number", "number1", "number2", "numbera", "numbert"}:
                if text:
                    pending_instruction["prompts"].append(text)
                continue
            # Image container for a pending exercise
            if pending_instruction is not None and imgs and not text:
                pending_instruction["image_refs"].extend(imgs)
                continue
            # Instruction line right after the exercise header (first text
            # paragraph wins; later ones fall through to "extra" below)
            if pending_instruction is not None and text and not imgs and not pending_instruction["instruction"]:
                pending_instruction["instruction"] = text
                continue
            # While in pending-exercise state, extra text paragraphs are word
            # banks / context ("from the following list:" etc) — keep pending alive.
            if pending_instruction is not None and text and not imgs:
                pending_instruction.setdefault("extra", []).append(text)
                continue
            # Paragraphs that contain an image belong to vocab/key-vocab callouts
            if imgs and not text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                continue
            # Mixed paragraph: image with caption
            if imgs and text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                blocks.append({"kind": "paragraph", "text": text})
                continue
            # Plain paragraph — outside any exercise
            if text:
                blocks.append({"kind": "paragraph", "text": text})
    return {
        "id": chapter_id,
        "number": number,
        "title": title,
        "blocks": blocks,
    }
def assign_parts(chapters: list, part_files: "dict[int, list[int]]") -> None:
    """Annotate each chapter dict in place with its part number.

    Chapters not listed in *part_files* get part=None (unless they already
    carry a "part" key).
    """
    # Invert part -> [chapter numbers] into chapter number -> part for O(1)
    # lookups; a chapter listed under several parts keeps the last one, the
    # same outcome as iterating the mapping in order.
    part_of = {num: part for part, nums in part_files.items() for num in nums}
    for ch in chapters:
        part = part_of.get(ch["number"])
        if part is not None:
            ch["part"] = part
        else:
            ch.setdefault("part", None)
def _group_chapters_by_part(hrefs) -> "dict[int, list[int]]":
    """Group ch<N>.xhtml hrefs under the most recently seen part<N>.xhtml."""
    memberships: "dict[int, list[int]]" = {}
    current_part: "int | None" = None
    for href in hrefs:
        m_part = re.match(r"part(\d+)\.xhtml", href)
        m_ch = re.match(r"ch(\d+)\.xhtml", href)
        if m_part:
            current_part = int(m_part.group(1))
            memberships.setdefault(current_part, [])
        elif m_ch and current_part is not None:
            memberships[current_part].append(int(m_ch.group(1)))
    return memberships


def read_part_memberships() -> "dict[int, list[int]]":
    """Derive part→chapter grouping from the OPF, preferring spine order.

    Uses the <spine> reading order when present; falls back to manifest
    order otherwise (they tend to match for this book). Returns {} when
    no .opf file is found under ROOT.
    """
    opf = next(ROOT.glob("*.opf"), None)
    if opf is None:
        return {}
    soup = BeautifulSoup(opf.read_text(encoding="utf-8"), "xml")
    items = soup.find_all("item")
    spine = soup.find("spine")
    if spine is not None:
        # Map manifest ids to hrefs once, then resolve the spine's itemrefs.
        # (A per-itemref soup.find() search here would be quadratic.)
        href_by_id = {item.get("id"): item.get("href", "") for item in items}
        hrefs = [
            href_by_id[ref.get("idref")]
            for ref in spine.find_all("itemref")
            if ref.get("idref") in href_by_id
        ]
    else:
        # No spine: fall back to manifest order.
        hrefs = [item.get("href", "") for item in items]
    return _group_chapters_by_part(hrefs)
def main() -> None:
    """Parse every chapter file, attach part numbers, write chapters.json."""
    # Pair each file with its parsed chapter number, skipping any glob hit
    # whose stem doesn't start with "ch<digits>" — the previous sort key
    # called .group(1) on a possibly-None match and could crash on a stray
    # file like "chaos.xhtml".
    numbered = []
    for path in ROOT.glob("ch*.xhtml"):
        m = re.match(r"ch(\d+)", path.stem)
        if m:
            numbered.append((int(m.group(1)), path))
    chapter_files = [path for _, path in sorted(numbered)]
    chapters = []
    for path in chapter_files:
        ch = parse_chapter(path)
        if ch:
            chapters.append(ch)
    part_memberships = read_part_memberships()
    assign_parts(chapters, part_memberships)
    out = {
        "chapters": chapters,
        "part_memberships": part_memberships,
    }
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2))
    _print_summary(chapters, part_memberships)


def _print_summary(chapters: list, part_memberships: dict) -> None:
    """Print block-count statistics as a quick sanity check of the output."""

    def count(pred) -> int:
        # Count blocks across all chapters matching *pred*.
        return sum(1 for ch in chapters for b in ch["blocks"] if pred(b))

    ex_total = count(lambda b: b["kind"] == "exercise")
    ex_with_prompts = count(lambda b: b["kind"] == "exercise" and b["prompts"])
    ex_with_images = count(lambda b: b["kind"] == "exercise" and b["image_refs"])
    ex_empty = count(
        lambda b: b["kind"] == "exercise" and not b["prompts"] and not b["image_refs"]
    )
    para_total = count(lambda b: b["kind"] == "paragraph")
    vocab_img_total = count(lambda b: b["kind"] == "vocab_image")
    print(f"Chapters: {len(chapters)}")
    print(f"Exercises total: {ex_total}")
    print(f" with text prompts: {ex_with_prompts}")
    print(f" with image prompts: {ex_with_images}")
    print(f" empty: {ex_empty}")
    print(f"Paragraphs: {para_total}")
    print(f"Vocab images: {vocab_img_total}")
    print(f"Parts: {part_memberships}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()