Render textbook vocab as paired Spanish→English grid
Previously the chapter reader showed vocab tables as a flat list of OCR lines — because Vision reads columns top-to-bottom, the Spanish column appeared as one block followed by the English column, making pairings illegible. Now every vocab table renders as a 2-column grid with Spanish on the left and English on the right. Supporting changes: - New ocr_all_vocab.swift: bounding-box OCR over all 931 vocab images, cluster lines into rows by Y-coordinate, split rows by largest X-gap, detect 2- / 3- / 4-column layouts automatically. ~2800 pairs extracted this pass vs ~1100 from the old block-alternation heuristic. - merge_pdf_into_book.py now prefers bounding-box pairs when present, falls back to the heuristic, embeds the resulting pairs as vocab_table.cards in book.json. - DataLoader passes cards through to TextbookBlock on seed. - TextbookChapterView renders cards via SwiftUI Grid (2 cols). - fix_vocab.py quarantine rule relaxed — only mis-pairs where both sides are clearly the same language are removed. "unknown" sides stay (bbox pipeline already oriented them correctly). Textbook card count jumps from 1044 → 3118 active pairs. textbookDataVersion bumped to 9. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -173,14 +173,17 @@ def main() -> None:
|
||||
kept_cards.append(card)
|
||||
continue
|
||||
|
||||
# Quarantine obvious mis-pairs: both sides same language OR language mismatch
|
||||
# Quarantine only clear mis-pairs: both sides EXPLICITLY the wrong
|
||||
# language (both Spanish or both English). "unknown" sides stay —
|
||||
# the bounding-box pipeline already handled orientation correctly
|
||||
# and many valid pairs lack the article/accent markers we classify on.
|
||||
fes, fen = language_score(card["front"])
|
||||
bes, ben = language_score(card["back"])
|
||||
front_lang = "es" if fes > fen else ("en" if fen > fes else "unknown")
|
||||
back_lang = "es" if bes > ben else ("en" if ben > bes else "unknown")
|
||||
# A good card has front=es, back=en. Anything else when the card is
|
||||
# flagged is almost always a column-pairing error.
|
||||
if front_lang != "es" or back_lang != "en":
|
||||
bothSameLang = (front_lang == "es" and back_lang == "es") or (front_lang == "en" and back_lang == "en")
|
||||
reversed_pair = front_lang == "en" and back_lang == "es"
|
||||
if bothSameLang or reversed_pair:
|
||||
quarantined_cards.append({
|
||||
"chapter": ch["chapter"],
|
||||
"front": card["front"],
|
||||
|
||||
@@ -33,6 +33,7 @@ CHAPTERS_JSON = HERE / "chapters.json"
|
||||
ANSWERS_JSON = HERE / "answers.json"
|
||||
OCR_JSON = HERE / "ocr.json"
|
||||
PDF_OCR_JSON = HERE / "pdf_ocr.json"
|
||||
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json" # bounding-box pairs (preferred)
|
||||
OUT_BOOK = HERE / "book.json"
|
||||
OUT_VOCAB = HERE / "vocab_cards.json"
|
||||
|
||||
@@ -222,7 +223,9 @@ def main() -> None:
|
||||
epub_ocr = load(OCR_JSON)
|
||||
pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
|
||||
pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
|
||||
paired_vocab = load(PAIRED_VOCAB_JSON) if PAIRED_VOCAB_JSON.exists() else {}
|
||||
print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
|
||||
print(f"Loaded bounding-box pairs for {len(paired_vocab)} vocab images")
|
||||
|
||||
# Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
|
||||
narrative_set = set()
|
||||
@@ -279,19 +282,48 @@ def main() -> None:
|
||||
if repairs > 0:
|
||||
merged_pages += 1
|
||||
|
||||
derived = build_vocab_cards_for_block(
|
||||
# Prefer bounding-box pairs (from paired_vocab.json) when
|
||||
# present. Fall back to the block-alternation heuristic.
|
||||
bbox = paired_vocab.get(src, {})
|
||||
bbox_pairs = bbox.get("pairs", []) if isinstance(bbox, dict) else []
|
||||
heuristic = build_vocab_cards_for_block(
|
||||
{"src": src},
|
||||
{"lines": merged_lines, "confidence": merged_conf},
|
||||
ch, current_section_title, bi
|
||||
)
|
||||
all_vocab_cards.extend(derived)
|
||||
|
||||
if bbox_pairs:
|
||||
cards_for_block = [
|
||||
{"front": p["es"], "back": p["en"]}
|
||||
for p in bbox_pairs
|
||||
if p.get("es") and p.get("en")
|
||||
]
|
||||
# Also feed the flashcard deck
|
||||
for p in bbox_pairs:
|
||||
if p.get("es") and p.get("en"):
|
||||
all_vocab_cards.append({
|
||||
"front": p["es"],
|
||||
"back": p["en"],
|
||||
"chapter": ch["number"],
|
||||
"chapterTitle": ch["title"],
|
||||
"section": current_section_title,
|
||||
"sourceImage": src,
|
||||
})
|
||||
pair_source = "bbox"
|
||||
else:
|
||||
cards_for_block = [{"front": c["front"], "back": c["back"]} for c in heuristic]
|
||||
all_vocab_cards.extend(heuristic)
|
||||
pair_source = "heuristic"
|
||||
|
||||
out_blocks.append({
|
||||
"kind": "vocab_table",
|
||||
"sourceImage": src,
|
||||
"ocrLines": merged_lines,
|
||||
"ocrConfidence": merged_conf,
|
||||
"cardCount": len(derived),
|
||||
"source": "pdf-repaired" if repairs > 0 else ("epub" if epub_lines else "pdf"),
|
||||
"cardCount": len(cards_for_block),
|
||||
"cards": cards_for_block,
|
||||
"columnCount": bbox.get("columnCount", 2) if isinstance(bbox, dict) else 2,
|
||||
"source": pair_source,
|
||||
"bookPage": book_page,
|
||||
"repairs": repairs,
|
||||
})
|
||||
|
||||
232
Conjuga/Scripts/textbook/ocr_all_vocab.swift
Normal file
232
Conjuga/Scripts/textbook/ocr_all_vocab.swift
Normal file
@@ -0,0 +1,232 @@
|
||||
#!/usr/bin/env swift
// Bounding-box OCR over every vocab image, producing Spanish→English pairs.
// Much higher accuracy than the flat-OCR block-alternation heuristic because
// we use each recognized line's position on the page: rows are clustered by
// Y-coordinate and cells within a row are split by the biggest X gap.
//
// Usage: swift ocr_all_vocab.swift <image_list.json> <oebps_dir> <output.json>

import Foundation
import Vision
import AppKit

// Exactly three positional arguments are required; print usage and bail otherwise.
let arguments = CommandLine.arguments
guard arguments.count >= 4 else {
    print("Usage: swift ocr_all_vocab.swift <image_list.json> <oebps_dir> <output.json>")
    exit(1)
}

let imageListURL = URL(fileURLWithPath: arguments[1])
let oebpsDir = URL(fileURLWithPath: arguments[2])
let outputURL = URL(fileURLWithPath: arguments[3])

// The image list is a plain JSON array of file names, resolved against <oebps_dir>.
guard let listData = try? Data(contentsOf: imageListURL),
      let imageNames = try? JSONDecoder().decode([String].self, from: listData) else {
    print("Could not load image list at \(imageListURL.path)")
    exit(1)
}
print("Processing \(imageNames.count) images...")
|
||||
|
||||
// A single line of text recognized by Vision, positioned by the center of its
// bounding box in normalized (0…1) image coordinates.
struct RecognizedLine {
    // Recognized text, whitespace-trimmed.
    let text: String
    // Horizontal center of the bounding box (0 = left edge, 1 = right edge).
    let cx: Double
    // Vertical center, flipped at recognition time (1 - y) so 0 = top of the image
    // — Vision's native origin is bottom-left.
    let cy: Double
    // Vision's confidence for the top candidate.
    let confidence: Double
}

// One Spanish→English vocabulary pair extracted from a table row.
struct Pair: Encodable {
    var es: String
    var en: String
    // Average Vision confidence across the source row's lines.
    var confidence: Double
}

// Per-image OCR outcome, serialized into the output JSON keyed by image name.
struct ImageResult: Encodable {
    var pairs: [Pair]
    // Detected table layout: 2, 3, or 4 columns.
    var columnCount: Int
    // "bbox-row-split" for processed images, "empty" when OCR found no lines.
    var strategy: String
    // Total recognized lines in the image (before row grouping).
    var lineCount: Int
}
|
||||
|
||||
// Heuristic language markers used to tag an OCR cell as Spanish or English.
// Uppercase accent variants are listed for completeness, though classification
// lowercases its input before checking.
let spanishAccents = Set<Character>(["á","é","í","ó","ú","ñ","ü","Á","É","Í","Ó","Ú","Ñ","Ü","¿","¡"])
let spanishArticles: Set<String> = ["el","la","los","las","un","una","unos","unas"]
let englishStarters: Set<String> = ["the","a","an","to","my","his","her","our","their","your"]
let englishOnly: Set<String> = ["the","he","she","it","we","they","is","are","was","were","been","have","has","had","will","would"]

/// Best-effort language tag for a cell of text: "es", "en", or "?" when no
/// marker matches. Spanish accented characters win immediately; otherwise the
/// first word (punctuation-trimmed) is checked against the marker sets.
func classify(_ s: String) -> String {
    let lowered = s.lowercased()
    if lowered.contains(where: spanishAccents.contains) { return "es" }
    var firstWord = ""
    if let token = lowered.split(separator: " ").first {
        firstWord = String(token).trimmingCharacters(in: .punctuationCharacters)
    }
    if spanishArticles.contains(firstWord) { return "es" }
    if englishStarters.contains(firstWord) || englishOnly.contains(firstWord) { return "en" }
    return "?"
}
|
||||
|
||||
/// Run accurate Vision OCR on one image and return every recognized line with
/// its normalized center coordinates. `cy` is flipped (1 - y) so that smaller
/// values mean closer to the top of the page.
/// Note: a failed `perform` is swallowed (`try?`) and simply yields no lines.
func recognize(_ cgImage: CGImage) -> [RecognizedLine] {
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    if #available(macOS 13.0, *) { request.automaticallyDetectsLanguage = true }
    try? VNImageRequestHandler(cgImage: cgImage, options: [:]).perform([request])
    return (request.results ?? []).compactMap { observation in
        guard let candidate = observation.topCandidates(1).first else { return nil }
        let text = candidate.string.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }
        let box = observation.boundingBox
        return RecognizedLine(
            text: text,
            cx: Double(box.origin.x + box.width / 2),
            cy: Double(1.0 - (box.origin.y + box.height / 2)),
            confidence: Double(candidate.confidence)
        )
    }
}
|
||||
|
||||
/// Split one row's lines into `desiredCells` cells by cutting at the
/// (desiredCells - 1) widest horizontal gaps between adjacent line centers.
/// `desiredCells` = 2 for a 2-column table, 4 for two side-by-side pairs, etc.
func splitRow(_ lines: [RecognizedLine], into desiredCells: Int) -> [String] {
    // Too few lines to form the requested cells: collapse everything
    // left-to-right into a single cell.
    if lines.count < desiredCells {
        return [lines.map(\.text).joined(separator: " ")]
    }
    let byX = lines.sorted { $0.cx < $1.cx }
    // Rank every adjacent gap by width, keep the largest (desiredCells - 1)
    // as cut points, then restore left-to-right order.
    let cutPoints = (1..<byX.count)
        .map { (idx: $0, gap: byX[$0].cx - byX[$0 - 1].cx) }
        .sorted { $0.gap > $1.gap }
        .prefix(desiredCells - 1)
        .map(\.idx)
        .sorted()
    // Slice between consecutive cut points; the array count acts as a final sentinel.
    var cells: [String] = []
    var lower = 0
    for cut in cutPoints + [byX.count] {
        let cellText = byX[lower..<cut]
            .map(\.text)
            .joined(separator: " ")
            .trimmingCharacters(in: .whitespaces)
        cells.append(cellText)
        lower = cut
    }
    return cells
}
|
||||
|
||||
/// Cluster recognized lines into visual rows: sort by cy, then start a new row
/// whenever the vertical distance to the previously added line exceeds `tol`.
/// Rows come back in top-to-bottom order (cy was flipped at recognition time).
/// NOTE(review): the gap is measured against the row's most recent line, not
/// its first, so a slow vertical drift can chain lines into one row — confirm
/// this is acceptable for the source tables.
func groupRows(_ lines: [RecognizedLine], tol: Double = 0.025) -> [[RecognizedLine]] {
    var rows: [[RecognizedLine]] = []
    var row: [RecognizedLine] = []
    for line in lines.sorted(by: { $0.cy < $1.cy }) {
        if let previous = row.last, abs(line.cy - previous.cy) > tol {
            rows.append(row)
            row = [line]
        } else {
            row.append(line)
        }
    }
    if !row.isEmpty { rows.append(row) }
    return rows
}
|
||||
|
||||
/// Estimate the table's column count from the distribution of line centers.
/// X-coordinates are binned into 10%-wide buckets; any bucket holding at
/// least max(2, 10% of all lines) counts as a column "peak".
func detectColumnCount(_ lines: [RecognizedLine]) -> Int {
    guard !lines.isEmpty else { return 2 }
    let step = 0.10
    let bucketCount = Int(1.0 / step) + 1
    var histogram = [Int](repeating: 0, count: bucketCount)
    for line in lines {
        let index = min(max(0, Int(line.cx / step)), bucketCount - 1)
        histogram[index] += 1
    }
    let threshold = max(2, lines.count / 10)
    var peaks = 0
    for count in histogram where count >= threshold { peaks += 1 }
    // Most tables are 2-col (2 peaks). 4 peaks = two ES/EN pairs side by side.
    // 0–1 peaks (decorative/roman layouts) are treated as 2 columns.
    if peaks <= 2 { return 2 }
    if peaks == 3 { return 3 }
    return 4
}
|
||||
|
||||
/// Turn one row's cells into Spanish→English pairs based on the detected
/// layout. For 2 columns: [es, en]. For 4: [es1, en1, es2, en2] (two pairs).
/// For 3 (rare): [es, en, en-alternate] — the two English sides are merged.
/// Anything else falls back to pairing the first cell against the rest.
func cellsToPairs(_ cells: [String], columnCount: Int) -> [(String, String)] {
    if columnCount == 2 && cells.count >= 2 {
        return [(cells[0], cells[1])]
    }
    if columnCount == 3 && cells.count >= 3 {
        // Keep both English renderings, joined with " / ".
        return [(cells[0], cells[1] + " / " + cells[2])]
    }
    if columnCount == 4 && cells.count >= 4 {
        return [(cells[0], cells[1]), (cells[2], cells[3])]
    }
    // Fallback: pair the first cell with everything to its right.
    guard cells.count >= 2 else { return [] }
    return [(cells[0], cells.dropFirst().joined(separator: " "))]
}
|
||||
|
||||
/// Return the pair oriented Spanish-first. Only an unambiguous reversal
/// (left classifies as English AND right as Spanish) triggers a swap; any
/// "?" classification leaves the pair untouched.
func orientPair(_ pair: (String, String)) -> (String, String) {
    guard classify(pair.0) == "en", classify(pair.1) == "es" else { return pair }
    return (pair.1, pair.0)
}
|
||||
|
||||
// Results keyed by image filename; `processed` drives the progress read-out.
var results: [String: ImageResult] = [:]
var processed = 0
let startTime = Date()

for name in imageNames {
    processed += 1
    let url = oebpsDir.appendingPathComponent(name)
    // Decode via NSImage → TIFF → bitmap rep to obtain a CGImage Vision accepts.
    // NOTE(review): images that fail to decode are skipped silently and get no
    // entry in `results`, so they are absent from the "unpaired" count below.
    guard let nsImg = NSImage(contentsOf: url),
          let tiff = nsImg.tiffRepresentation,
          let rep = NSBitmapImageRep(data: tiff),
          let cg = rep.cgImage else {
        continue
    }
    let lines = recognize(cg)
    if lines.isEmpty {
        results[name] = ImageResult(pairs: [], columnCount: 2, strategy: "empty", lineCount: 0)
        continue
    }

    let columnCount = detectColumnCount(lines)
    let rows = groupRows(lines, tol: 0.025)
    var pairs: [Pair] = []
    for row in rows {
        // Single-line rows (headers, section titles) cannot form a pair.
        guard row.count >= 2 else { continue }
        let cells = splitRow(row, into: columnCount)
        let rawPairs = cellsToPairs(cells, columnCount: columnCount)
        // Hoisted: the average confidence is a property of the row, so compute
        // it once instead of once per emitted pair.
        let avgConf = row.reduce(0.0) { $0 + $1.confidence } / Double(row.count)
        for p in rawPairs {
            let (es, en) = orientPair(p)
            if es.isEmpty || en.isEmpty { continue }
            pairs.append(Pair(es: es, en: en, confidence: avgConf))
        }
    }
    results[name] = ImageResult(
        pairs: pairs,
        columnCount: columnCount,
        strategy: "bbox-row-split",
        lineCount: lines.count
    )

    // Progress line every 50 images (and once at the very end).
    if processed % 50 == 0 || processed == imageNames.count {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(processed) / max(elapsed, 0.001)
        let eta = Double(imageNames.count - processed) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f img/s eta %.0fs", processed, imageNames.count, rate, eta))
    }
}
|
||||
|
||||
// Persist every per-image result as JSON; sorted keys keep diffs stable
// between runs. A failed encode/write traps here (top-level `try`).
let enc = JSONEncoder()
enc.outputFormatting = [.sortedKeys]
try enc.encode(results).write(to: outputURL)

// Summary: total extracted pairs plus how many images yielded none.
let totalPairs = results.values.map(\.pairs.count).reduce(0, +)
let emptyTables = results.values.reduce(into: 0) { if $1.pairs.isEmpty { $0 += 1 } }
print("Wrote \(results.count) results, \(totalPairs) total pairs, \(emptyTables) unpaired")
|
||||
Reference in New Issue
Block a user