#!/usr/bin/env python3
"""Merge chapters.json + answers.json + ocr.json → book.json (single source).

Also emits vocab_cards.json: flashcards derived from vocab_image blocks where
OCR text parses as a clean two-column (Spanish ↔ English) table.
"""
import json
import re
import sys
from pathlib import Path

HERE = Path(__file__).resolve().parent
CHAPTERS_JSON = HERE / "chapters.json"
ANSWERS_JSON = HERE / "answers.json"
OCR_JSON = HERE / "ocr.json"
OUT_BOOK = HERE / "book.json"
OUT_VOCAB = HERE / "vocab_cards.json"

COURSE_NAME = "Complete Spanish Step-by-Step"

# Heuristic: parseable "Spanish | English" vocab rows.
# OCR usually produces "word — translation" or "word translation" separated
# by 2+ spaces. We detect rows that contain both Spanish and English words.
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our",
                    "their", "your", "some"}
# English-only words that would never appear as Spanish.
# NOTE: entries MUST be lowercase — both call sites lower-case the word
# before the membership test, so the former uppercase "I" entry never matched.
ENGLISH_ONLY_WORDS = {"the", "he", "she", "it", "we", "they", "i", "is",
                      "are", "was", "were", "been", "have", "has", "had",
                      "will", "would", "should", "could"}

# 2+ spaces/tabs, or a dash-like char (em/en/minus/hyphen) flanked by spaces.
SEP_RE = re.compile(r"[ \t]{2,}|\s[—–−-]\s")
# Numbered OCR prompt lines: "1. ..." or "1) ..." (hoisted out of the loop).
NUMBERED_PROMPT_RE = re.compile(r"^(\d+)[.)]\s*(.+)")


def classify_line(line: str) -> str:
    """Return 'es', 'en', or 'unknown' for the dominant language of a vocab line."""
    line = line.strip()
    if not line:
        return "unknown"
    # Accent = definitely Spanish
    if SPANISH_ACCENT_RE.search(line):
        return "es"
    first = line.split()[0].lower().strip(",.;:")
    if first in SPANISH_ARTICLES:
        return "es"
    if first in ENGLISH_STARTERS:
        return "en"
    # Check if the leading word is an English-only function word
    if first in ENGLISH_ONLY_WORDS:
        return "en"
    return "unknown"


def looks_english(word: str) -> bool:
    """Legacy helper — kept for try_split_row below."""
    w = word.lower().strip()
    if not w:
        return False
    if SPANISH_ACCENT_RE.search(w):
        return False
    if w in SPANISH_ARTICLES:
        return False
    if w in ENGLISH_STARTERS or w in ENGLISH_ONLY_WORDS:
        return True
    # Fallback: plain ASCII lowercase phrase with no Spanish markers.
    return bool(re.match(r"^[a-z][a-z\s'/()\-,.]*$", w))


def try_split_row(line: str) -> "tuple[str, str] | None":
    """Split a line into (spanish, english) if it looks like a vocab entry."""
    line = line.strip()
    if not line or len(line) < 3:
        return None
    # Try explicit separators first
    parts = SEP_RE.split(line)
    parts = [p.strip() for p in parts if p.strip()]
    if len(parts) == 2:
        spanish, english = parts
        if looks_english(english) and not looks_english(spanish.split()[0]):
            return (spanish, english)
    return None


def load(p: Path) -> dict:
    """Read a UTF-8 JSON file into a dict."""
    return json.loads(p.read_text(encoding="utf-8"))


def build_vocab_cards_for_block(block: dict, ocr_entry: dict,
                                chapter: dict, context_title: str,
                                idx: int) -> list:
    """Given a vocab_image block + its OCR lines, derive flashcards.

    Vision OCR reads top-to-bottom, left-to-right; a two-column vocab table
    produces Spanish lines first, then English lines. We split the list in
    half when one side is predominantly Spanish and the other English.
    Per-line '—' separators are also supported as a fallback.

    ``idx`` is the block's index within the chapter; currently unused but
    kept for interface stability.
    """
    cards = []
    if not ocr_entry:
        return cards
    lines = [l.strip() for l in ocr_entry.get("lines", []) if l.strip()]
    if not lines:
        return cards

    def card(front: str, back: str) -> dict:
        return {
            "front": front,
            "back": back,
            "chapter": chapter["number"],
            "chapterTitle": chapter["title"],
            "section": context_title,
            "sourceImage": block["src"],
        }

    # Attempt 1: explicit inline separator (e.g. "la casa — the house")
    inline = []
    all_inline = True
    for line in lines:
        pair = try_split_row(line)
        if pair:
            inline.append(pair)
        else:
            all_inline = False
            break
    if all_inline and inline:
        for es, en in inline:
            cards.append(card(es, en))
        return cards

    # Attempt 2: block-alternating layout.
    # Vision OCR reads columns top-to-bottom, so a 2-col table rendered across
    # 2 visual columns produces runs like: [ES...ES][EN...EN][ES...ES][EN...EN]
    # We classify each line, smooth "unknown" using neighbors, then pair
    # same-sized consecutive ES/EN blocks.
    classes = [classify_line(l) for l in lines]

    # Pass 1: fill unknowns using nearest non-unknown neighbor (forward)
    last_known = "unknown"
    forward = []
    for c in classes:
        if c != "unknown":
            last_known = c
        forward.append(last_known)
    # Pass 2: backfill leading unknowns (backward)
    last_known = "unknown"
    backward = [""] * len(classes)
    for i in range(len(classes) - 1, -1, -1):
        if classes[i] != "unknown":
            last_known = classes[i]
        backward[i] = last_known
    # Merge: prefer forward unless still unknown
    resolved = []
    for f, b in zip(forward, backward):
        if f != "unknown":
            resolved.append(f)
        elif b != "unknown":
            resolved.append(b)
        else:
            resolved.append("unknown")

    # Group consecutive same-lang lines
    blocks: list = []
    cur_lang: "str | None" = None
    cur_block: list = []
    for line, lang in zip(lines, resolved):
        if lang != cur_lang:
            if cur_block and cur_lang is not None:
                blocks.append((cur_lang, cur_block))
            cur_block = [line]
            cur_lang = lang
        else:
            cur_block.append(line)
    if cur_block and cur_lang is not None:
        blocks.append((cur_lang, cur_block))

    # Walk blocks pairing ES then EN of equal length
    i = 0
    while i < len(blocks) - 1:
        lang_a, lines_a = blocks[i]
        lang_b, lines_b = blocks[i + 1]
        if lang_a == "es" and lang_b == "en" and len(lines_a) == len(lines_b):
            for es, en in zip(lines_a, lines_b):
                cards.append(card(es, en))
            i += 2
            continue
        # If reversed order (some pages have EN column on left), try that too
        if lang_a == "en" and lang_b == "es" and len(lines_a) == len(lines_b):
            for es, en in zip(lines_b, lines_a):
                cards.append(card(es, en))
            i += 2
            continue
        i += 1
    return cards


def clean_instruction(text: str) -> str:
    """Remove all emphasis markers from a parsed instruction.

    Our XHTML parser emitted * and ** for emphasis; flatten every run of
    asterisks (not just leading/trailing ones) and strip whitespace.
    """
    t = re.sub(r"\*+", "", text)
    return t.strip()


def merge() -> None:
    """Merge the three JSON inputs into book.json + vocab_cards.json.

    Reads chapters.json / answers.json / ocr.json (OCR optional), enriches
    vocab_image and exercise blocks with OCR text and answer data, writes
    both output files, and prints a summary + validation counts.
    """
    chapters_data = load(CHAPTERS_JSON)
    answers_data = load(ANSWERS_JSON)
    try:
        ocr_data = load(OCR_JSON)
    except FileNotFoundError:
        # OCR is optional: vocab tables/cards simply come out empty.
        print("ocr.json not found — proceeding with empty OCR data")
        ocr_data = {}

    answers = answers_data["answers"]
    chapters = chapters_data["chapters"]
    parts = chapters_data.get("part_memberships", {})

    book_chapters = []
    all_vocab_cards = []
    missing_ocr = set()
    current_section_title = ""

    for ch in chapters:
        out_blocks = []
        current_section_title = ch["title"]
        for bi, block in enumerate(ch["blocks"]):
            k = block["kind"]
            if k == "heading":
                # Headings update the section context used for vocab cards.
                current_section_title = block["text"]
                out_blocks.append(block)
                continue
            if k == "paragraph":
                out_blocks.append(block)
                continue
            if k == "key_vocab_header":
                out_blocks.append(block)
                continue
            if k == "vocab_image":
                ocr_entry = ocr_data.get(block["src"])
                if ocr_entry is None:
                    missing_ocr.add(block["src"])
                derived = build_vocab_cards_for_block(
                    block, ocr_entry, ch, current_section_title, bi
                )
                all_vocab_cards.extend(derived)
                out_blocks.append({
                    "kind": "vocab_table",
                    "sourceImage": block["src"],
                    "ocrLines": ocr_entry.get("lines", []) if ocr_entry else [],
                    "ocrConfidence": ocr_entry.get("confidence", 0.0) if ocr_entry else 0.0,
                    "cardCount": len(derived),
                })
                continue
            if k == "exercise":
                ans = answers.get(block["id"])
                image_ocr_lines = []
                for src in block.get("image_refs", []):
                    e = ocr_data.get(src)
                    if e is None:
                        missing_ocr.add(src)
                        continue
                    image_ocr_lines.extend(e.get("lines", []))

                # Build the final prompt list. If we have text prompts from
                # XHTML, prefer them. Otherwise, attempt to use OCR lines.
                prompts = [p for p in block.get("prompts", []) if p.strip()]
                extras = [e for e in block.get("extra", []) if e.strip()]
                if not prompts and image_ocr_lines:
                    # Extract numbered lines from OCR (look for "1. ..." pattern)
                    for line in image_ocr_lines:
                        m = NUMBERED_PROMPT_RE.match(line.strip())
                        if m:
                            prompts.append(f"{m.group(1)}. {m.group(2)}")

                # Cross-reference prompts with answers
                sub = ans["subparts"] if ans else []
                answer_items = []
                for sp in sub:
                    for it in sp["items"]:
                        answer_items.append({
                            "label": sp["label"],
                            "number": it["number"],
                            "answer": it["answer"],
                            "alternates": it["alternates"],
                        })

                out_blocks.append({
                    "kind": "exercise",
                    "id": block["id"],
                    "ansAnchor": block.get("ans_anchor", ""),
                    "instruction": clean_instruction(block.get("instruction", "")),
                    "extra": extras,
                    "prompts": prompts,
                    "ocrLines": image_ocr_lines,
                    "freeform": ans["freeform"] if ans else False,
                    "answerItems": answer_items,
                    "answerRaw": ans["raw"] if ans else "",
                    "answerSubparts": sub,
                })
                continue
            # Unknown block kinds pass through untouched.
            out_blocks.append(block)

        book_chapters.append({
            "id": ch["id"],
            "number": ch["number"],
            "title": ch["title"],
            "part": ch.get("part"),
            "blocks": out_blocks,
        })

    book = {
        "courseName": COURSE_NAME,
        "totalChapters": len(book_chapters),
        "totalExercises": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "exercise"
        ),
        "totalVocabTables": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "vocab_table"
        ),
        "totalVocabCards": len(all_vocab_cards),
        "parts": parts,
        "chapters": book_chapters,
    }
    # ensure_ascii=False emits raw accented characters, so the encoding must
    # be pinned to UTF-8 (write_text otherwise uses the locale encoding).
    OUT_BOOK.write_text(json.dumps(book, ensure_ascii=False), encoding="utf-8")

    # Vocab cards as a separate file (grouped per chapter so they can be seeded
    # as CourseDecks in the existing schema).
    vocab_by_chapter: dict = {}
    for card in all_vocab_cards:
        vocab_by_chapter.setdefault(card["chapter"], []).append(card)
    OUT_VOCAB.write_text(json.dumps({
        "courseName": COURSE_NAME,
        "chapters": [
            {
                "chapter": ch_num,
                "cards": cards,
            }
            for ch_num, cards in sorted(vocab_by_chapter.items())
        ],
    }, ensure_ascii=False, indent=2), encoding="utf-8")

    # Summary
    print(f"Wrote {OUT_BOOK}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Chapters: {book['totalChapters']}")
    print(f"Exercises: {book['totalExercises']}")
    print(f"Vocab tables: {book['totalVocabTables']}")
    print(f"Vocab cards (auto): {book['totalVocabCards']}")
    if missing_ocr:
        print(f"Missing OCR for {len(missing_ocr)} images (first 5): {sorted(list(missing_ocr))[:5]}")

    # Validation
    total_exercises = book["totalExercises"]
    exercises_with_prompts = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and (b["prompts"] or b["extra"])
    )
    exercises_with_answers = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["answerItems"]
    )
    exercises_freeform = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["freeform"]
    )
    print(f"Exercises with prompts: {exercises_with_prompts}/{total_exercises}")
    print(f"Exercises with answers: {exercises_with_answers}/{total_exercises}")
    print(f"Freeform exercises: {exercises_freeform}")


if __name__ == "__main__":
    merge()