#!/usr/bin/env python3 """Render book.json + ocr.json into a static HTML review page. The HTML surfaces low-confidence OCR results in red, and shows the parsed exercise prompts/answers next to the original image. Designed for rapid visual diffing against the source book. """ import html import json from pathlib import Path HERE = Path(__file__).resolve().parent BOOK = HERE / "book.json" OCR = HERE / "ocr.json" OUT_HTML = HERE / "review.html" EPUB_IMAGES = Path(HERE).parents[2] / "epub_extract" / "OEBPS" IMAGE_REL = EPUB_IMAGES.relative_to(HERE.parent) if False else EPUB_IMAGES def load(p: Path) -> dict: return json.loads(p.read_text(encoding="utf-8")) def esc(s: str) -> str: return html.escape(s or "") def img_tag(src: str) -> str: full = (EPUB_IMAGES / src).resolve() return f' {esc(src)}

' def render() -> None: book = load(BOOK) ocr = load(OCR) if OCR.exists() else {} out: list = [] out.append(""" Book review """) out.append(f"

{esc(book['courseName'])} — review

") out.append(f"

{book['totalChapters']} chapters · {book['totalExercises']} exercises · {book['totalVocabTables']} vocab tables · {book['totalVocabCards']} auto-derived cards

") for ch in book["chapters"]: part = ch.get("part") part_str = f" (Part {part})" if part else "" out.append(f"

Chapter {ch['number']}: {esc(ch['title'])}{esc(part_str)}

") for b in ch["blocks"]: kind = b["kind"] if kind == "heading": level = b["level"] out.append(f"{esc(b['text'])}") elif kind == "paragraph": out.append(f"

{esc(b['text'])}

") elif kind == "key_vocab_header": out.append(f"

★ Key Vocabulary

") elif kind == "vocab_table": src = b["sourceImage"] conf = b["ocrConfidence"] conf_class = "lowconf" if conf < 0.85 else "" out.append(f"

") out.append(f"

vocab {esc(src)} · confidence {conf:.2f} · {b['cardCount']} card(s)

") out.append(img_tag(src)) out.append("

") for line in b.get("ocrLines", []): out.append(f"

{esc(line)}

") out.append("

") # Show derived pairs (if any). We don't have them inline in book.json, # but we can recompute from ocrLines using the same function. out.append("

") elif kind == "exercise": out.append(f"

") out.append(f"Exercise {esc(b['id'])} — {esc(b['instruction'])}") if b.get("extra"): for e in b["extra"]: out.append(f"

{esc(e)}

") if b.get("ocrLines"): out.append(f"

OCR lines from image

") for line in b["ocrLines"]: out.append(f"

{esc(line)}

") out.append("

") if b.get("prompts"): out.append("

Parsed prompts:

") for p in b["prompts"]: out.append(f"

• {esc(p)}

") if b.get("answerItems"): out.append("

Answer key:

") for a in b["answerItems"]: label_str = f"{a['label']}. " if a.get("label") else "" alts = ", ".join(a["alternates"]) alt_str = f" (also: {esc(alts)})" if alts else "" out.append(f"

{esc(label_str)}{a['number']}. {esc(a['answer'])}{alt_str}

") if b.get("freeform"): out.append("

(Freeform — answers will vary)

") for img_src in b.get("image_refs", []): out.append(img_tag(img_src)) out.append("

") out.append("") OUT_HTML.write_text("\n".join(out), encoding="utf-8") print(f"Wrote {OUT_HTML}") if __name__ == "__main__": render()