#!/usr/bin/env python3
"""Parse all chapter XHTMLs + appendix into structured chapters.json.

Output schema:

    {
      "chapters": [
        {
          "id": "ch1",
          "number": 1,
          "title": "Nouns, Articles, and Adjectives",
          "part": 1,                      # part 1/2/3 or null
          "blocks": [                     # ordered content
            {"kind": "heading", "level": 3, "text": "..."},
            {"kind": "paragraph", "text": "...", "hasItalic": false},
            {"kind": "key_vocab_header", "title": "Los colores (The colors)"},
            {"kind": "vocab_image", "src": "f0010-03.jpg"},
            {
              "kind": "exercise",
              "id": "1.1",
              "ans_anchor": "ch1ans1",
              "instruction": "Write the appropriate...",
              "image_refs": ["f0005-02.jpg"]
            },
            {"kind": "image", "src": "...", "alt": "..."}
          ]
        }
      ]
    }
"""
import json
import re
from pathlib import Path

from bs4 import BeautifulSoup

ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
OUT = Path(__file__).resolve().parent / "chapters.json"

# Common icon images embedded in headings — ignore when collecting content images.
ICON_IMAGES = {
    "Common01.jpg",
    "Common02.jpg",
    "Common03.jpg",
    "Common04.jpg",
    "Common05.jpg",
}

EXERCISE_ID_RE = re.compile(r"Exercise\s+([0-9]+\.[0-9]+)")
ANS_REF_RE = re.compile(r"ch(\d+)ans(\d+)")


def clean_text(el) -> str:
    """Extract plain text from an element, stripping inline markup.

    All inline emphasis (em/i/strong/b) is dropped: the source has nested and
    sibling emphasis patterns that CommonMark can't reliably parse, causing
    markers to leak into the UI. Plain text renders cleanly everywhere.
    Pagebreak spans are removed, whitespace is normalized, and spacing around
    punctuation/brackets introduced by the tag separator is tightened.
    """
    if el is None:
        return ""
    soup = BeautifulSoup(str(el), "lxml")
    # Flatten nested emphasis first so no overlapping markers can survive:
    # unwrap em/i inside strong/b, and strong/b inside em/i.
    for tag in soup.find_all(["strong", "b"]):
        for inner in tag.find_all(["em", "i"]):
            inner.unwrap()
    for tag in soup.find_all(["em", "i"]):
        for inner in tag.find_all(["strong", "b"]):
            inner.unwrap()
    # Then drop ALL remaining inline emphasis wrappers.
    for tag in soup.find_all(["em", "i", "strong", "b"]):
        tag.unwrap()
    # Drop pagebreak spans entirely (they carry page numbers, not content).
    for tag in soup.find_all("span", attrs={"epub:type": "pagebreak"}):
        tag.decompose()
    # Replace <br> with newline (collapsed to a space below, but keeps words apart).
    for br in soup.find_all("br"):
        br.replace_with("\n")
    # Use a separator so adjacent inline tags don't concatenate without spaces
    # (e.g. "Ir and" would otherwise become "Irand").
    text = soup.get_text(separator=" ", strip=False)
    # Collapse runs of whitespace first.
    text = re.sub(r"\s+", " ", text).strip()
    # Strip any stray asterisks that sneak through (e.g. author's literal *).
    text = text.replace("*", "")
    # De-space punctuation: "word ," -> "word,".
    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
    # Tighten brackets that picked up separator-spaces: "( foo )" -> "(foo)".
    text = re.sub(r"([(\[])\s+", r"\1", text)
    text = re.sub(r"\s+([)\]])", r"\1", text)
    # Collapse any double-spaces created by the fixes above.
    text = re.sub(r" +", " ", text).strip()
    return text


def is_exercise_header(h) -> bool:
    """Heading with an Exercise N.N link.

    Chapters 1-16 use h3.h3k; chapters 17+ use h4.h4. The marker is an
    anchor pointing into ans.xhtml (the answer key).
    """
    if h.name not in ("h3", "h4"):
        return False
    a = h.find("a", href=True)
    if a and "ans.xhtml" in a["href"]:
        return True
    return False


def is_key_vocab_header(h) -> bool:
    """Heading with 'Key Vocabulary' text (no anchor link to answers)."""
    if h.name not in ("h3", "h4"):
        return False
    text = h.get_text(strip=True)
    if "Key Vocabulary" in text and not h.find(
        "a", href=lambda v: v and "ans.xhtml" in v
    ):
        return True
    return False


def extract_image_srcs(parent) -> list:
    """Return list of image src attributes, skipping icon images."""
    srcs = []
    for img in parent.find_all("img"):
        src = img.get("src", "")
        if not src or Path(src).name in ICON_IMAGES:
            continue
        srcs.append(src)
    return srcs


def _chapter_meta(body) -> "tuple[int | None, str]":
    """Extract (number, title) from the chapter's h2 headings.

    The number lives in an h2.h2c whose text is all digits; the title in
    h2.h2c1 (chapters 1-16) or h2.h2-c (chapters 17+). Falls back to the
    h2's id attribute ("ch1" -> 1) when no numeric h2 is found.
    """
    number = None
    title = ""
    h2s = body.find_all("h2")
    for h2 in h2s:
        classes = h2.get("class") or []
        # Use a separator so consecutive inline tags don't concatenate
        # (e.g. "Ir and the Future" → "Ir and the Future"),
        # then strip the spaces that got inserted before punctuation.
        text = re.sub(r"\s+", " ", h2.get_text(" ", strip=True))
        text = re.sub(r"\s+([,.;:!?])", r"\1", text).strip()
        if "h2c" in classes and text.isdigit():
            number = int(text)
        # Chapters 1–16 use h2c1; chapters 17+ use h2-c.
        elif ("h2c1" in classes or "h2-c" in classes) and not title:
            title = text
    if number is None:
        # Try id on chapter header (ch1 → 1).
        for h2 in h2s:
            m = re.match(r"ch(\d+)", h2.get("id", ""))
            if m:
                number = int(m.group(1))
                break
    return number, title


def parse_chapter(path: Path) -> "dict | None":
    """Parse one chapter file into structured blocks.

    Returns None when the document has no <body>. Walks the main section in
    document order, classifying h2/h3/h4/h5/p elements into the block kinds
    described in the module docstring. While a "pending" exercise is open
    (just after its header), subsequent paragraphs are folded into it as
    instruction / prompts / image_refs / extra context.
    """
    html = path.read_text(encoding="utf-8")
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("body")
    if body is None:
        return None

    number, title = _chapter_meta(body)
    chapter_id = path.stem  # ch1, ch2, ...

    # Walk section content in document order.
    section = body.find("section") or body
    blocks: list = []
    pending_instruction = None  # the open exercise dict, if any

    for el in section.descendants:
        if el.name is None:
            continue  # NavigableString — text nodes are read via parents
        classes = el.get("class") or []
        # We operate only on direct h2/h3/h4/h5/p elements; nested tags are
        # already captured via parent processing.
        if el.name not in ("h2", "h3", "h4", "h5", "p"):
            continue

        # Exercise header detection (h3 in ch1-16, h4 in ch17+).
        if is_exercise_header(el):
            a = el.find("a", href=True)
            href = a["href"] if a else ""
            m = EXERCISE_ID_RE.search(el.get_text())
            ex_id = m.group(1) if m else ""
            anchor_m = ANS_REF_RE.search(href)
            ans_anchor = anchor_m.group(0) if anchor_m else ""
            blocks.append({
                "kind": "exercise",
                "id": ex_id,
                "ans_anchor": ans_anchor,
                "instruction": "",
                "image_refs": [],
                "prompts": [],
            })
            pending_instruction = blocks[-1]
            continue

        # Key Vocabulary header.
        if is_key_vocab_header(el):
            blocks.append({"kind": "key_vocab_header", "title": "Key Vocabulary"})
            pending_instruction = None
            continue

        # Other headings.
        if el.name in ("h2", "h3", "h4", "h5"):
            if el.name == "h2":
                # Skip the chapter-number/chapter-title h2s we already captured.
                continue
            txt = clean_text(el)
            if txt:
                blocks.append({
                    "kind": "heading",
                    "level": int(el.name[1]),
                    "text": txt,
                })
                pending_instruction = None
            continue

        # Paragraphs.
        if el.name == "p":
            imgs = extract_image_srcs(el)
            text = clean_text(el)
            p_classes = set(classes)
            # Skip pure blank-line class ("nump" = underscore lines under
            # number prompts).
            if p_classes & {"nump", "numpa"} and not text:
                continue
            # Exercise prompt: numbered list items under an open exercise.
            # Also number1/number2 (continuation numbering), numbera, numbert.
            if pending_instruction is not None and p_classes & {
                "number", "number1", "number2", "numbera", "numbert"
            }:
                if text:
                    pending_instruction["prompts"].append(text)
                continue
            # Image container for a pending exercise.
            if pending_instruction is not None and imgs and not text:
                pending_instruction["image_refs"].extend(imgs)
                continue
            # Instruction line right after the exercise header.
            if (
                pending_instruction is not None
                and text
                and not imgs
                and not pending_instruction["instruction"]
            ):
                pending_instruction["instruction"] = text
                continue
            # While in pending-exercise state, extra text paragraphs are word
            # banks / context ("from the following list:" etc) — keep pending
            # alive.
            if pending_instruction is not None and text and not imgs:
                pending_instruction.setdefault("extra", []).append(text)
                continue
            # Paragraphs that contain an image belong to vocab/key-vocab
            # callouts.
            if imgs and not text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                continue
            # Mixed paragraph: image with caption.
            if imgs and text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                blocks.append({"kind": "paragraph", "text": text})
                continue
            # Plain paragraph — outside any exercise.
            if text:
                blocks.append({"kind": "paragraph", "text": text})

    return {
        "id": chapter_id,
        "number": number,
        "title": title,
        "blocks": blocks,
    }


def assign_parts(chapters: list, part_files: "dict[int, list[int]]") -> None:
    """Annotate chapters in place with part number based on TOC membership.

    Chapters not listed in any part get part=None.
    """
    for part_num, chapter_nums in part_files.items():
        for ch in chapters:
            if ch["number"] in chapter_nums:
                ch["part"] = part_num
    for ch in chapters:
        ch.setdefault("part", None)


def _memberships_from_hrefs(hrefs: "list[str]") -> "dict[int, list[int]]":
    """Group chapter numbers under the most recently seen partN file."""
    memberships: dict = {}
    current_part: "int | None" = None
    for href in hrefs:
        m_part = re.match(r"part(\d+)\.xhtml", href)
        m_ch = re.match(r"ch(\d+)\.xhtml", href)
        if m_part:
            current_part = int(m_part.group(1))
            memberships.setdefault(current_part, [])
        elif m_ch and current_part is not None:
            memberships[current_part].append(int(m_ch.group(1)))
    return memberships


def read_part_memberships() -> "dict[int, list[int]]":
    """Derive part→chapter grouping from the OPF spine order.

    The spine is the authoritative reading order; the manifest merely tends
    to match it for this book, so it is only a fallback when no <spine>
    element exists. Returns {} when no OPF file is found.
    """
    opf = next(ROOT.glob("*.opf"), None)
    if opf is None:
        return {}
    soup = BeautifulSoup(opf.read_text(encoding="utf-8"), "xml")
    spine = soup.find("spine")
    if spine is not None:
        # Resolve each itemref's idref to its manifest item's href.
        hrefs = []
        for ref in spine.find_all("itemref"):
            item = soup.find("item", attrs={"id": ref.get("idref")})
            if item is not None:
                hrefs.append(item.get("href", ""))
    else:
        # No spine: fall back to manifest order.
        hrefs = [item.get("href", "") for item in soup.find_all("item")]
    return _memberships_from_hrefs(hrefs)


def main() -> None:
    """Parse all chapters, assign parts, write chapters.json, print summary."""
    # Collect (number, path) pairs; skip any ch*.xhtml whose stem has no
    # numeric suffix rather than crashing on a failed regex match in a sort key.
    numbered = []
    for p in ROOT.glob("ch*.xhtml"):
        m = re.match(r"ch(\d+)", p.stem)
        if m:
            numbered.append((int(m.group(1)), p))
    numbered.sort()

    chapters = []
    for _num, path in numbered:
        ch = parse_chapter(path)
        if ch:
            chapters.append(ch)

    part_memberships = read_part_memberships()
    assign_parts(chapters, part_memberships)

    out = {
        "chapters": chapters,
        "part_memberships": part_memberships,
    }
    # Explicit UTF-8: ensure_ascii=False emits accented characters, which the
    # locale default encoding may not be able to represent.
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

    # Summary.
    ex_total = sum(
        1 for ch in chapters for b in ch["blocks"] if b["kind"] == "exercise"
    )
    ex_with_prompts = sum(
        1 for ch in chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["prompts"]
    )
    ex_with_images = sum(
        1 for ch in chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["image_refs"]
    )
    ex_empty = sum(
        1 for ch in chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and not b["prompts"] and not b["image_refs"]
    )
    para_total = sum(
        1 for ch in chapters for b in ch["blocks"] if b["kind"] == "paragraph"
    )
    vocab_img_total = sum(
        1 for ch in chapters for b in ch["blocks"] if b["kind"] == "vocab_image"
    )
    print(f"Chapters: {len(chapters)}")
    print(f"Exercises total: {ex_total}")
    print(f"  with text prompts: {ex_with_prompts}")
    print(f"  with image prompts: {ex_with_images}")
    print(f"  empty: {ex_empty}")
    print(f"Paragraphs: {para_total}")
    print(f"Vocab images: {vocab_img_total}")
    print(f"Parts: {part_memberships}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()