Files
Spanish/Conjuga/Scripts/textbook/build_review.py
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

127 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""Render book.json + ocr.json into a static HTML review page.
The HTML surfaces low-confidence OCR results in red, and shows the parsed
exercise prompts/answers next to the original image. Designed for rapid
visual diffing against the source book.
"""
import html
import json
from pathlib import Path
# Directory that contains this script; the JSON inputs and the HTML output
# below all live beside it.
HERE = Path(__file__).resolve().parent
# Parsed book data (chapters, blocks, totals) that the review page is built from.
BOOK = HERE / "book.json"
# OCR results file that sits next to the book data.
OCR = HERE / "ocr.json"
# Destination of the generated review page.
OUT_HTML = HERE / "review.html"
# Root directory that image references are resolved against:
# HERE.parents[2] (three directory levels above HERE) / epub_extract / OEBPS.
EPUB_IMAGES = HERE.parents[2] / "epub_extract" / "OEBPS"
# Alias of EPUB_IMAGES. The former `relative_to(...) if False else ...`
# expression could only ever evaluate to EPUB_IMAGES, so the dead branch is
# dropped; the name is kept so any existing reference keeps working.
IMAGE_REL = EPUB_IMAGES
def load(p: Path) -> dict:
    """Read the UTF-8 encoded JSON file at *p* and return the decoded value."""
    with p.open(encoding="utf-8") as handle:
        return json.load(handle)
def esc(s: str) -> str:
    """HTML-escape *s*; any falsy value (``None``, ``""``) yields ``""``."""
    text = s if s else ""
    return html.escape(text)
def img_tag(src: str) -> str:
    """Return an ``<img>`` element for the image *src* under ``EPUB_IMAGES``.

    *src* is joined onto ``EPUB_IMAGES`` and resolved to an absolute path.
    The ``src`` attribute is built with ``Path.as_uri()`` so the result is a
    well-formed ``file://`` URL: characters such as spaces, quotes, ``#`` or
    non-ASCII letters are percent-encoded instead of being pasted raw into
    the attribute, where they could break the URL or the markup. The ``alt``
    text is the HTML-escaped original *src*.
    """
    full = (EPUB_IMAGES / src).resolve()
    return f'<img src="{full.as_uri()}" alt="{esc(src)}" class="src"/>'
# Document preamble: doctype, <head>, and the inline stylesheet of the page.
_PAGE_HEAD = """<!DOCTYPE html>
<html><head><meta charset='utf-8'><title>Book review</title>
<style>
body { font-family: -apple-system, system-ui, sans-serif; margin: 2em; max-width: 1000px; color: #222; }
h1 { color: #c44; }
h2.chapter { background: #eee; padding: 0.5em; border-left: 4px solid #c44; }
h3.heading { color: #555; }
.para { margin: 0.5em 0; }
.vocab-table { background: #fafff0; padding: 0.5em; margin: 0.5em 0; border: 1px solid #bda; border-radius: 6px; }
.ocr-line { font-family: ui-monospace, monospace; font-size: 12px; }
.lowconf { color: #c44; background: #fee; }
.exercise { background: #fff8e8; padding: 0.5em; margin: 0.75em 0; border: 1px solid #cb9; border-radius: 6px; }
.prompt { font-family: ui-monospace, monospace; font-size: 13px; margin: 2px 0; }
.answer { color: #080; font-family: ui-monospace, monospace; font-size: 13px; }
img.src { max-width: 520px; border: 1px solid #ccc; margin: 4px 0; }
.kv { color: #04a; font-weight: bold; }
summary { cursor: pointer; font-weight: bold; color: #666; }
.card-pair { font-family: ui-monospace, monospace; font-size: 12px; }
.card-es { color: #04a; }
.card-en { color: #555; }
.counts { color: #888; font-size: 12px; }
</style></head><body>"""

# A vocab table whose OCR confidence is below this value gets the `lowconf`
# CSS class on each of its OCR lines.
_LOW_CONFIDENCE = 0.85


def _render_vocab_table(b: dict) -> list:
    """Return the HTML lines for one ``vocab_table`` block."""
    src = b["sourceImage"]
    conf = b["ocrConfidence"]
    conf_class = "lowconf" if conf < _LOW_CONFIDENCE else ""
    lines = [
        "<div class='vocab-table'>",
        f"<details><summary>vocab {esc(src)} · confidence {conf:.2f} · {b['cardCount']} card(s)</summary>",
        img_tag(src),
        "<div>",
    ]
    lines.extend(
        f"<div class='ocr-line {conf_class}'>{esc(line)}</div>"
        for line in b.get("ocrLines", [])
    )
    lines.append("</div>")
    # TODO: show the derived card pairs. They are not stored inline in
    # book.json, so they would have to be recomputed from ocrLines.
    lines.append("</details></div>")
    return lines


def _render_exercise(b: dict) -> list:
    """Return the HTML lines for one ``exercise`` block."""
    lines = [
        "<div class='exercise'>",
        f"<b>Exercise {esc(b['id'])}</b> — <i>{esc(b['instruction'])}</i>",
    ]
    lines.extend(f"<div class='para'>{esc(e)}</div>" for e in b.get("extra") or [])
    if b.get("ocrLines"):
        lines.append("<details><summary>OCR lines from image</summary>")
        lines.extend(f"<div class='ocr-line'>{esc(line)}</div>" for line in b["ocrLines"])
        lines.append("</details>")
    if b.get("prompts"):
        lines.append("<div><b>Parsed prompts:</b></div>")
        lines.extend(f"<div class='prompt'>• {esc(p)}</div>" for p in b["prompts"])
    if b.get("answerItems"):
        lines.append("<div><b>Answer key:</b></div>")
        for a in b["answerItems"]:
            label_str = f"{a['label']}. " if a.get("label") else ""
            alts = ", ".join(a["alternates"])
            alt_str = f" <span style='color:#999'>(also: {esc(alts)})</span>" if alts else ""
            lines.append(f"<div class='answer'>{esc(label_str)}{a['number']}. {esc(a['answer'])}{alt_str}</div>")
    if b.get("freeform"):
        lines.append("<div style='color:#c44'>(Freeform — answers will vary)</div>")
    # NOTE(review): "image_refs" is snake_case while every other key read here
    # is camelCase — confirm this matches what the book.json producer emits.
    for img_src in b.get("image_refs", []):
        lines.append(img_tag(img_src))
    lines.append("</div>")
    return lines


def _render_block(b: dict) -> list:
    """Return the HTML lines for a single chapter block, chosen by ``b["kind"]``."""
    kind = b["kind"]
    if kind == "heading":
        level = b["level"]
        return [f"<h{level} class='heading'>{esc(b['text'])}</h{level}>"]
    if kind == "paragraph":
        return [f"<p class='para'>{esc(b['text'])}</p>"]
    if kind == "key_vocab_header":
        return ["<p class='kv'>★ Key Vocabulary</p>"]
    if kind == "vocab_table":
        return _render_vocab_table(b)
    if kind == "exercise":
        return _render_exercise(b)
    # Any other kind produces no output.
    return []


def render() -> None:
    """Build the static HTML review page from ``BOOK`` and write it to ``OUT_HTML``.

    The page contains a title and totals line, then for every chapter a
    heading followed by its blocks (headings, paragraphs, vocab tables with
    their OCR lines, and exercises with prompts and answer key). Text taken
    from the book data is passed through ``esc`` before being embedded.

    Only ``BOOK`` is read: the OCR lines shown on the page come from the
    ``ocrLines`` entries stored on the blocks themselves, so the separate
    OCR file is not loaded.

    Raises:
        KeyError: if a key that is indexed directly (for example
            ``courseName``, ``chapters``, ``blocks``, ``kind``) is missing.
    """
    book = load(BOOK)
    out: list = [_PAGE_HEAD]
    out.append(f"<h1>{esc(book['courseName'])} — review</h1>")
    out.append(f"<p>{book['totalChapters']} chapters · {book['totalExercises']} exercises · {book['totalVocabTables']} vocab tables · {book['totalVocabCards']} auto-derived cards</p>")
    for ch in book["chapters"]:
        part = ch.get("part")
        part_str = f" (Part {part})" if part else ""
        out.append(f"<h2 class='chapter'>Chapter {ch['number']}: {esc(ch['title'])}{esc(part_str)}</h2>")
        for b in ch["blocks"]:
            out.extend(_render_block(b))
    out.append("</body></html>")
    # One list element per line keeps the output diff-friendly.
    OUT_HTML.write_text("\n".join(out), encoding="utf-8")
    print(f"Wrote {OUT_HTML}")
# Script entry point: build the review page only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    render()