Spanish/Conjuga/Scripts/textbook/build_review.py

#!/usr/bin/env python3
"""Render book.json + ocr.json into a static HTML review page.

The HTML surfaces low-confidence OCR results in red, and shows the parsed
exercise prompts/answers next to the original image. Designed for rapid
visual diffing against the source book.
"""

import html
import json
from pathlib import Path

# Directory containing this script; the JSON inputs and the HTML output sit beside it.
HERE = Path(__file__).resolve().parent
BOOK = HERE / "book.json"
OCR = HERE / "ocr.json"
OUT_HTML = HERE / "review.html"
# Root of the extracted EPUB content, located three directory levels above HERE.
EPUB_IMAGES = HERE.parents[2] / "epub_extract" / "OEBPS"
# Kept as a plain alias of EPUB_IMAGES: img_tag() links images by absolute
# path, so no relative form is computed.
IMAGE_REL = EPUB_IMAGES


def load(p: Path) -> dict:
    """Read the UTF-8 encoded JSON file at *p* and return the decoded value."""
    with p.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def esc(s: str) -> str:
    """HTML-escape *s*; any falsy value (``None``, ``""``) becomes ``""``."""
    if not s:
        return ""
    return html.escape(s)


def img_tag(src: str) -> str:
    """Return an ``<img>`` tag that links to *src* inside the extracted EPUB.

    *src* is joined onto ``EPUB_IMAGES`` and resolved to an absolute path.
    The link is produced with ``Path.as_uri()`` rather than by pasting the
    path after ``file://``, so the URL is percent-encoded: spaces, ``#``,
    ``?``, ``%`` and quote characters in the path can no longer truncate the
    link or terminate the ``src`` attribute early. The ``alt`` text is the
    HTML-escaped *src* as given.
    """
    full = (EPUB_IMAGES / src).resolve()
    return f'<img src="{full.as_uri()}" alt="{esc(src)}" class="src"/>'


# Static document head (doctype, title and inline stylesheet) emitted first.
_PAGE_HEAD = """<!DOCTYPE html>
<html><head><meta charset='utf-8'><title>Book review</title>
<style>
body { font-family: -apple-system, system-ui, sans-serif; margin: 2em; max-width: 1000px; color: #222; }
h1 { color: #c44; }
h2.chapter { background: #eee; padding: 0.5em; border-left: 4px solid #c44; }
h3.heading { color: #555; }
.para { margin: 0.5em 0; }
.vocab-table { background: #fafff0; padding: 0.5em; margin: 0.5em 0; border: 1px solid #bda; border-radius: 6px; }
.ocr-line { font-family: ui-monospace, monospace; font-size: 12px; }
.lowconf { color: #c44; background: #fee; }
.exercise { background: #fff8e8; padding: 0.5em; margin: 0.75em 0; border: 1px solid #cb9; border-radius: 6px; }
.prompt { font-family: ui-monospace, monospace; font-size: 13px; margin: 2px 0; }
.answer { color: #080; font-family: ui-monospace, monospace; font-size: 13px; }
img.src { max-width: 520px; border: 1px solid #ccc; margin: 4px 0; }
.kv { color: #04a; font-weight: bold; }
summary { cursor: pointer; font-weight: bold; color: #666; }
.card-pair { font-family: ui-monospace, monospace; font-size: 12px; }
.card-es { color: #04a; }
.card-en { color: #555; }
.counts { color: #888; font-size: 12px; }
</style></head><body>"""


def _vocab_table_lines(block: dict) -> list:
    """Return the HTML lines for one ``vocab_table`` block."""
    src = block["sourceImage"]
    conf = block["ocrConfidence"]
    # Tables whose OCR confidence is below 0.85 get the red "lowconf" styling.
    conf_class = "lowconf" if conf < 0.85 else ""
    lines = [
        "<div class='vocab-table'>",
        f"<details><summary>vocab {esc(src)} · confidence {conf:.2f} · {block['cardCount']} card(s)</summary>",
        img_tag(src),
        "<div>",
    ]
    lines.extend(
        f"<div class='ocr-line {conf_class}'>{esc(line)}</div>"
        for line in block.get("ocrLines", [])
    )
    lines.append("</div>")
    # Only the raw OCR lines are shown; derived card pairs are not rendered here.
    lines.append("</details></div>")
    return lines


def _answer_line(item: dict) -> str:
    """Return the HTML line for one answer-key entry of an exercise."""
    label_str = f"{item['label']}. " if item.get("label") else ""
    alts = ", ".join(item["alternates"])
    alt_str = f"  <span style='color:#999'>(also: {esc(alts)})</span>" if alts else ""
    return f"<div class='answer'>{esc(label_str)}{item['number']}. {esc(item['answer'])}{alt_str}</div>"


def _exercise_lines(block: dict) -> list:
    """Return the HTML lines for one ``exercise`` block."""
    lines = [
        "<div class='exercise'>",
        f"<b>Exercise {esc(block['id'])}</b> — <i>{esc(block['instruction'])}</i>",
    ]
    # Each optional section is emitted only when its key is present and non-empty.
    lines.extend(f"<div class='para'>{esc(extra)}</div>" for extra in block.get("extra") or [])
    if block.get("ocrLines"):
        lines.append("<details><summary>OCR lines from image</summary>")
        lines.extend(f"<div class='ocr-line'>{esc(line)}</div>" for line in block["ocrLines"])
        lines.append("</details>")
    if block.get("prompts"):
        lines.append("<div><b>Parsed prompts:</b></div>")
        lines.extend(f"<div class='prompt'>• {esc(prompt)}</div>" for prompt in block["prompts"])
    if block.get("answerItems"):
        lines.append("<div><b>Answer key:</b></div>")
        lines.extend(_answer_line(item) for item in block["answerItems"])
    if block.get("freeform"):
        lines.append("<div style='color:#c44'>(Freeform — answers will vary)</div>")
    lines.extend(img_tag(ref) for ref in block.get("image_refs", []))
    lines.append("</div>")
    return lines


def _block_lines(block: dict) -> list:
    """Return the HTML lines for one chapter block, dispatching on its ``kind``."""
    kind = block["kind"]
    if kind == "heading":
        level = block["level"]
        return [f"<h{level} class='heading'>{esc(block['text'])}</h{level}>"]
    if kind == "paragraph":
        return [f"<p class='para'>{esc(block['text'])}</p>"]
    if kind == "key_vocab_header":
        return ["<p class='kv'>★ Key Vocabulary</p>"]
    if kind == "vocab_table":
        return _vocab_table_lines(block)
    if kind == "exercise":
        return _exercise_lines(block)
    # Any other kind produces no output.
    return []


def render() -> None:
    """Build the review page from ``BOOK`` and write it to ``OUT_HTML``.

    Emits the page head, a summary line with the book totals, then for each
    chapter a header followed by the HTML for each of its blocks. The lines
    are joined with newlines, written as UTF-8, and the output path is
    printed.

    Raises:
        KeyError: if a key this renderer indexes directly (for example
            ``courseName``, ``chapters``, ``blocks`` or ``kind``) is missing.
    """
    book = load(BOOK)

    out: list = [_PAGE_HEAD]
    out.append(f"<h1>{esc(book['courseName'])} — review</h1>")
    out.append(f"<p>{book['totalChapters']} chapters · {book['totalExercises']} exercises · {book['totalVocabTables']} vocab tables · {book['totalVocabCards']} auto-derived cards</p>")

    for ch in book["chapters"]:
        part = ch.get("part")
        part_str = f" (Part {part})" if part else ""
        out.append(f"<h2 class='chapter'>Chapter {ch['number']}: {esc(ch['title'])}{esc(part_str)}</h2>")
        for b in ch["blocks"]:
            out.extend(_block_lines(b))

    out.append("</body></html>")
    OUT_HTML.write_text("\n".join(out), encoding="utf-8")
    print(f"Wrote {OUT_HTML}")


# Script entry point: build the review page only when run directly, not on import.
if __name__ == "__main__":
    render()