Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
374
Conjuga/Scripts/textbook/build_book.py
Normal file
374
Conjuga/Scripts/textbook/build_book.py
Normal file
@@ -0,0 +1,374 @@
|
||||
#!/usr/bin/env python3
"""Merge chapters.json + answers.json + ocr.json → book.json (single source).

Also emits vocab_cards.json: flashcards derived from vocab_image blocks where
OCR text parses as a clean two-column (Spanish ↔ English) table.
"""

import json
import re
import sys
from pathlib import Path

# All inputs/outputs live next to this script.
HERE = Path(__file__).resolve().parent
CHAPTERS_JSON = HERE / "chapters.json"
ANSWERS_JSON = HERE / "answers.json"
OCR_JSON = HERE / "ocr.json"
OUT_BOOK = HERE / "book.json"
OUT_VOCAB = HERE / "vocab_cards.json"

COURSE_NAME = "Complete Spanish Step-by-Step"

# Heuristic: parseable "Spanish | English" vocab rows.
# OCR usually produces "word — translation" or "word translation" separated
# by 2+ spaces. We detect rows that contain both Spanish and English words.
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their", "your", "some"}
# English-only words that would never appear as Spanish.
# NOTE: entries must be lowercase — every consumer (classify_line,
# looks_english) lowercases the word before testing membership, so the
# previous uppercase "I" entry could never match. Fixed to "i".
ENGLISH_ONLY_WORDS = {"the", "he", "she", "it", "we", "they", "i", "is", "are", "was", "were",
                      "been", "have", "has", "had", "will", "would", "should", "could"}
# Column/pair separators: a run of 2+ spaces/tabs, or a spaced dash variant.
SEP_RE = re.compile(r"[ \t]{2,}|\s[—–−-]\s")
|
||||
|
||||
|
||||
def classify_line(line: str) -> str:
    """Return 'es', 'en', or 'unknown' for the dominant language of a vocab line."""
    stripped = line.strip()
    if not stripped:
        return "unknown"
    # Any Spanish accent/punctuation mark settles it immediately.
    if SPANISH_ACCENT_RE.search(stripped):
        return "es"
    lead = stripped.split()[0].lower().strip(",.;:")
    if lead in SPANISH_ARTICLES:
        return "es"
    # English sentence starters and English-only function words both mean 'en'.
    if lead in ENGLISH_STARTERS or lead in ENGLISH_ONLY_WORDS:
        return "en"
    return "unknown"
|
||||
|
||||
|
||||
def looks_english(word: str) -> bool:
    """Legacy helper — kept for try_split_row below."""
    w = word.lower().strip()
    # Empty, accented, or Spanish-article words are definitely not English.
    if not w or SPANISH_ACCENT_RE.search(w) or w in SPANISH_ARTICLES:
        return False
    # Known English function words are definitely English.
    if w in ENGLISH_STARTERS or w in ENGLISH_ONLY_WORDS:
        return True
    # Fallback: plain unaccented latin text with common punctuation.
    return re.match(r"^[a-z][a-z\s'/()\-,.]*$", w) is not None
|
||||
|
||||
|
||||
def try_split_row(line: str) -> "tuple[str, str] | None":
    """Split a line into (spanish, english) if it looks like a vocab entry."""
    text = line.strip()
    if len(text) < 3:
        # Too short to hold two words plus a separator (also covers empty).
        return None
    # Try explicit separators first.
    pieces = [part.strip() for part in SEP_RE.split(text) if part.strip()]
    if len(pieces) != 2:
        return None
    es, en = pieces
    # Accept only when the right side looks English and the left side doesn't.
    if looks_english(en) and not looks_english(es.split()[0]):
        return (es, en)
    return None
|
||||
|
||||
|
||||
def load(p: Path) -> dict:
    """Read *p* as UTF-8 text and return the parsed JSON object."""
    with p.open(encoding="utf-8") as fh:
        return json.load(fh)
|
||||
|
||||
|
||||
def build_vocab_cards_for_block(block: dict, ocr_entry: dict, chapter: dict, context_title: str, idx: int) -> list:
    """Given a vocab_image block + its OCR lines, derive flashcards.

    Vision OCR reads top-to-bottom, left-to-right; a two-column vocab table
    produces Spanish lines first, then English lines. We split the list in
    half when one side is predominantly Spanish and the other English.
    Per-line '—' separators are also supported as a fallback.

    Args:
        block: a "vocab_image" block dict; only ``block["src"]`` is read here.
        ocr_entry: OCR result dict for the image (``lines`` key), or None/empty
            when OCR is missing — then no cards are produced.
        chapter: chapter dict; ``number`` and ``title`` go onto each card.
        context_title: nearest section heading, recorded on each card.
        idx: block index within the chapter — currently unused (kept for the
            caller's signature; NOTE(review): could be dropped or used for
            card ordering).

    Returns:
        A list of card dicts (front/back/chapter/chapterTitle/section/
        sourceImage). Empty when OCR is absent or no pairing strategy applies.
    """
    cards = []
    if not ocr_entry:
        return cards
    lines = [l.strip() for l in ocr_entry.get("lines", []) if l.strip()]
    if not lines:
        return cards

    def card(front: str, back: str) -> dict:
        # Closure over block/chapter/context_title: one card record.
        return {
            "front": front,
            "back": back,
            "chapter": chapter["number"],
            "chapterTitle": chapter["title"],
            "section": context_title,
            "sourceImage": block["src"],
        }

    # Attempt 1: explicit inline separator (e.g. "la casa — the house").
    # All-or-nothing: a single unsplittable line abandons this strategy,
    # so mixed layouts fall through to the column heuristic below.
    inline = []
    all_inline = True
    for line in lines:
        pair = try_split_row(line)
        if pair:
            inline.append(pair)
        else:
            all_inline = False
            break
    if all_inline and inline:
        for es, en in inline:
            cards.append(card(es, en))
        return cards

    # Attempt 2: block-alternating layout.
    # Vision OCR reads columns top-to-bottom, so a 2-col table rendered across
    # 2 visual columns produces runs like: [ES...ES][EN...EN][ES...ES][EN...EN]
    # We classify each line, smooth "unknown" using neighbors, then pair
    # same-sized consecutive ES/EN blocks.
    classes = [classify_line(l) for l in lines]

    # Pass 1: fill unknowns using nearest non-unknown neighbor (forward)
    last_known = "unknown"
    forward = []
    for c in classes:
        if c != "unknown":
            last_known = c
        forward.append(last_known)
    # Pass 2: backfill leading unknowns (backward)
    last_known = "unknown"
    backward = [""] * len(classes)
    for i in range(len(classes) - 1, -1, -1):
        if classes[i] != "unknown":
            last_known = classes[i]
        backward[i] = last_known
    # Merge: prefer forward unless still unknown
    resolved = []
    for f, b in zip(forward, backward):
        if f != "unknown":
            resolved.append(f)
        elif b != "unknown":
            resolved.append(b)
        else:
            resolved.append("unknown")

    # Group consecutive same-lang lines into (lang, [lines...]) runs.
    blocks: list = []
    cur_lang: "str | None" = None
    cur_block: list = []
    for line, lang in zip(lines, resolved):
        if lang != cur_lang:
            if cur_block and cur_lang is not None:
                blocks.append((cur_lang, cur_block))
            cur_block = [line]
            cur_lang = lang
        else:
            cur_block.append(line)
    if cur_block and cur_lang is not None:
        blocks.append((cur_lang, cur_block))

    # Walk blocks pairing ES then EN of equal length. Unequal-length or
    # same-language adjacent runs advance by one and are left unpaired.
    i = 0
    while i < len(blocks) - 1:
        lang_a, lines_a = blocks[i]
        lang_b, lines_b = blocks[i + 1]
        if lang_a == "es" and lang_b == "en" and len(lines_a) == len(lines_b):
            for es, en in zip(lines_a, lines_b):
                cards.append(card(es, en))
            i += 2
            continue
        # If reversed order (some pages have EN column on left), try that too
        if lang_a == "en" and lang_b == "es" and len(lines_a) == len(lines_b):
            for es, en in zip(lines_b, lines_a):
                cards.append(card(es, en))
            i += 2
            continue
        i += 1

    return cards
|
||||
|
||||
|
||||
def clean_instruction(text: str) -> str:
    """Strip emphasis markers from a parsed instruction and trim whitespace."""
    # The XHTML parser emitted * and ** for emphasis; flatten every run.
    return re.sub(r"\*+", "", text).strip()
|
||||
|
||||
|
||||
def merge() -> None:
    """Merge parsed chapters, answer key, and OCR output into book.json.

    Reads CHAPTERS_JSON and ANSWERS_JSON (both required), plus OCR_JSON
    (optional — falls back to empty OCR data when missing). Writes OUT_BOOK
    (the full book structure) and OUT_VOCAB (auto-derived vocab flashcards
    grouped per chapter), then prints a summary and validation counts.
    """
    chapters_data = load(CHAPTERS_JSON)
    answers_data = load(ANSWERS_JSON)
    try:
        ocr_data = load(OCR_JSON)
    except FileNotFoundError:
        # OCR is optional: exercises/vocab tables then carry empty OCR lines.
        print("ocr.json not found — proceeding with empty OCR data")
        ocr_data = {}

    answers = answers_data["answers"]
    chapters = chapters_data["chapters"]
    parts = chapters_data.get("part_memberships", {})

    book_chapters = []
    all_vocab_cards = []
    # Image paths referenced by blocks but absent from ocr.json, for reporting.
    missing_ocr = set()
    current_section_title = ""

    for ch in chapters:
        out_blocks = []
        # Section context starts as the chapter title until a heading appears.
        current_section_title = ch["title"]

        for bi, block in enumerate(ch["blocks"]):
            k = block["kind"]

            if k == "heading":
                # Headings update the section context used on vocab cards.
                current_section_title = block["text"]
                out_blocks.append(block)
                continue

            if k == "paragraph":
                out_blocks.append(block)
                continue

            if k == "key_vocab_header":
                out_blocks.append(block)
                continue

            if k == "vocab_image":
                # Convert the image block into a vocab_table block, deriving
                # flashcards from its OCR text when available.
                ocr_entry = ocr_data.get(block["src"])
                if ocr_entry is None:
                    missing_ocr.add(block["src"])
                derived = build_vocab_cards_for_block(
                    block, ocr_entry, ch, current_section_title, bi
                )
                all_vocab_cards.extend(derived)
                out_blocks.append({
                    "kind": "vocab_table",
                    "sourceImage": block["src"],
                    "ocrLines": ocr_entry.get("lines", []) if ocr_entry else [],
                    "ocrConfidence": ocr_entry.get("confidence", 0.0) if ocr_entry else 0.0,
                    "cardCount": len(derived),
                })
                continue

            if k == "exercise":
                # Join the exercise with its answer-key entry (by id) and any
                # OCR text from images the exercise references.
                ans = answers.get(block["id"])
                image_ocr_lines = []
                for src in block.get("image_refs", []):
                    e = ocr_data.get(src)
                    if e is None:
                        missing_ocr.add(src)
                        continue
                    image_ocr_lines.extend(e.get("lines", []))

                # Build the final prompt list. If we have text prompts from
                # XHTML, prefer them. Otherwise, attempt to use OCR lines.
                prompts = [p for p in block.get("prompts", []) if p.strip()]
                extras = [e for e in block.get("extra", []) if e.strip()]
                if not prompts and image_ocr_lines:
                    # Extract numbered lines from OCR (look for "1. ..." pattern)
                    for line in image_ocr_lines:
                        m = re.match(r"^(\d+)[.)]\s*(.+)", line.strip())
                        if m:
                            prompts.append(f"{m.group(1)}. {m.group(2)}")

                # Cross-reference prompts with answers: flatten the answer
                # key's subparts into a single labeled item list.
                sub = ans["subparts"] if ans else []
                answer_items = []
                for sp in sub:
                    for it in sp["items"]:
                        answer_items.append({
                            "label": sp["label"],
                            "number": it["number"],
                            "answer": it["answer"],
                            "alternates": it["alternates"],
                        })

                out_blocks.append({
                    "kind": "exercise",
                    "id": block["id"],
                    "ansAnchor": block.get("ans_anchor", ""),
                    "instruction": clean_instruction(block.get("instruction", "")),
                    "extra": extras,
                    "prompts": prompts,
                    "ocrLines": image_ocr_lines,
                    "freeform": ans["freeform"] if ans else False,
                    "answerItems": answer_items,
                    "answerRaw": ans["raw"] if ans else "",
                    "answerSubparts": sub,
                })
                continue

            # Unknown block kinds pass through unchanged.
            out_blocks.append(block)

        book_chapters.append({
            "id": ch["id"],
            "number": ch["number"],
            "title": ch["title"],
            "part": ch.get("part"),
            "blocks": out_blocks,
        })

    book = {
        "courseName": COURSE_NAME,
        "totalChapters": len(book_chapters),
        "totalExercises": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "exercise"
        ),
        "totalVocabTables": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "vocab_table"
        ),
        "totalVocabCards": len(all_vocab_cards),
        "parts": parts,
        "chapters": book_chapters,
    }
    # NOTE(review): write_text uses the platform default encoding here;
    # consider encoding="utf-8" since ensure_ascii=False emits non-ASCII.
    OUT_BOOK.write_text(json.dumps(book, ensure_ascii=False))

    # Vocab cards as a separate file (grouped per chapter so they can be seeded
    # as CourseDecks in the existing schema).
    vocab_by_chapter: dict = {}
    for card in all_vocab_cards:
        vocab_by_chapter.setdefault(card["chapter"], []).append(card)
    OUT_VOCAB.write_text(json.dumps({
        "courseName": COURSE_NAME,
        "chapters": [
            {
                "chapter": ch_num,
                "cards": cards,
            }
            for ch_num, cards in sorted(vocab_by_chapter.items())
        ],
    }, ensure_ascii=False, indent=2))

    # Summary
    print(f"Wrote {OUT_BOOK}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Chapters: {book['totalChapters']}")
    print(f"Exercises: {book['totalExercises']}")
    print(f"Vocab tables: {book['totalVocabTables']}")
    print(f"Vocab cards (auto): {book['totalVocabCards']}")
    if missing_ocr:
        print(f"Missing OCR for {len(missing_ocr)} images (first 5): {sorted(list(missing_ocr))[:5]}")

    # Validation
    total_exercises = book["totalExercises"]
    exercises_with_prompts = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and (b["prompts"] or b["extra"])
    )
    exercises_with_answers = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["answerItems"]
    )
    exercises_freeform = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["freeform"]
    )
    print(f"Exercises with prompts: {exercises_with_prompts}/{total_exercises}")
    print(f"Exercises with answers: {exercises_with_answers}/{total_exercises}")
    print(f"Freeform exercises: {exercises_freeform}")
|
||||
|
||||
|
||||
# Script entry point: run the merge only when executed directly, so the
# module can be imported (e.g. by tests) without side effects.
if __name__ == "__main__":
    merge()
|
||||
Reference in New Issue
Block a user