Render textbook vocab as paired Spanish→English grid

Previously the chapter reader showed vocab tables as a flat list of OCR
lines — because Vision reads columns top-to-bottom, the Spanish column
appeared as one block followed by the English column, making pairings
illegible.

Now every vocab table renders as a 2-column grid with Spanish on the
left and English on the right. Supporting changes:

- New ocr_all_vocab.swift: bounding-box OCR over all 931 vocab images,
  cluster lines into rows by Y-coordinate, split rows by largest X-gap,
  detect 2- / 3- / 4-column layouts automatically. ~2800 pairs extracted
  this pass vs ~1100 from the old block-alternation heuristic.
- merge_pdf_into_book.py now prefers bounding-box pairs when present,
  falls back to the heuristic, embeds the resulting pairs as
  vocab_table.cards in book.json.
- DataLoader passes cards through to TextbookBlock on seed.
- TextbookChapterView renders cards via SwiftUI Grid (2 cols).
- fix_vocab.py quarantine rule relaxed — only mis-pairs where both
  sides are clearly the same language are removed. "unknown" sides
  stay (bbox pipeline already oriented them correctly).

Textbook card count jumps from 1044 → 3118 active pairs.
textbookDataVersion bumped to 9.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-04-19 15:58:41 -05:00
parent cd491bd695
commit 5f90a01314
9 changed files with 17619 additions and 1148 deletions

View File

@@ -0,0 +1,232 @@
#!/usr/bin/env swift
// Bounding-box OCR over every vocab image, producing Spanish→English pairs.
// Much higher accuracy than the flat-OCR block-alternation heuristic because
// we use each recognized line's position on the page: rows are clustered by
// Y-coordinate and cells within a row are split by the biggest X gap.
//
// Usage: swift ocr_all_vocab.swift <image_list.json> <oebps_dir> <output.json>
import Foundation
import Vision
import AppKit
// MARK: - Command-line input

// Three positional arguments are required after the script name: the image-list
// JSON, the directory the image names are resolved against, and the output path.
if CommandLine.arguments.count < 4 {
    print("Usage: swift ocr_all_vocab.swift <image_list.json> <oebps_dir> <output.json>")
    exit(1)
}

let imageListURL = URL(fileURLWithPath: CommandLine.arguments[1])
let oebpsDir = URL(fileURLWithPath: CommandLine.arguments[2])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[3])

// The image list must decode as a JSON array of strings; a read failure and a
// decode failure are reported with the same message.
guard
    let listData = try? Data(contentsOf: imageListURL),
    let imageNames = try? JSONDecoder().decode([String].self, from: listData)
else {
    print("Could not load image list at \(imageListURL.path)")
    exit(1)
}

print("Processing \(imageNames.count) images...")
/// One recognized line of text plus the position and confidence recorded for it.
/// Instances are built by `recognize` and consumed by the row/column helpers.
struct RecognizedLine {
/// Recognized text; `recognize` trims surrounding whitespace before storing it.
let text: String
/// Horizontal center of the line's bounding box (`origin.x + width / 2`).
let cx: Double
/// `1 - (origin.y + height / 2)` of the bounding box, as computed in `recognize`.
/// `groupRows` sorts ascending on this value to produce its top-to-bottom order.
let cy: Double
/// Confidence of the top recognition candidate for this line.
let confidence: Double
}
/// One extracted vocabulary pair. Encoded to JSON, so the property names below
/// are also the keys written to the output file.
struct Pair: Encodable {
/// Spanish side of the pair (after `orientPair` has had a chance to swap sides).
var es: String
/// English side of the pair.
var en: String
/// Mean recognition confidence of all lines in the row the pair came from.
var confidence: Double
}
/// Per-image output record, keyed by image name in the output JSON.
struct ImageResult: Encodable {
/// Pairs extracted from the image; empty when no row produced a usable pair.
var pairs: [Pair]
/// Column count used for the image (2 is written when no text was recognized).
var columnCount: Int
/// How the result was produced: the script writes "empty" when no lines were
/// recognized and "bbox-row-split" otherwise.
var strategy: String
/// Number of recognized lines in the image (0 for the "empty" strategy).
var lineCount: Int
}
// MARK: - Language heuristics

/// Characters whose presence anywhere in a string marks it as Spanish.
let spanishAccents = Set<Character>(["á","é","í","ó","ú","ñ","ü","Á","É","Í","Ó","Ú","Ñ","Ü","¿","¡"])
/// Leading words that mark a string as Spanish.
let spanishArticles: Set<String> = ["el","la","los","las","un","una","unos","unas"]
/// Leading words that mark a string as English (articles, "to", possessives).
let englishStarters: Set<String> = ["the","a","an","to","my","his","her","our","their","your"]
/// Further leading words that mark a string as English (pronouns, auxiliaries).
let englishOnly: Set<String> = ["the","he","she","it","we","they","is","are","was","were","been","have","has","had","will","would"]

/// Guesses the language of a cell from cheap lexical cues.
///
/// The checks run in this order, on the lowercased text:
/// 1. any character from `spanishAccents` → `"es"`;
/// 2. first space-separated word (punctuation trimmed) in `spanishArticles` → `"es"`;
/// 3. that word in `englishStarters` or `englishOnly` → `"en"`.
///
/// - Parameter s: The cell text to classify.
/// - Returns: `"es"`, `"en"`, or `"?"` when no cue matched.
func classify(_ s: String) -> String {
    let lowered = s.lowercased()

    let hasSpanishMark = lowered.contains { spanishAccents.contains($0) }
    if hasSpanishMark {
        return "es"
    }

    // Only the first word is inspected; an empty string yields an empty word,
    // which is in none of the sets.
    var leadingWord = ""
    if let head = lowered.split(separator: " ").first {
        leadingWord = String(head).trimmingCharacters(in: .punctuationCharacters)
    }

    if spanishArticles.contains(leadingWord) {
        return "es"
    }
    let looksEnglish = englishStarters.contains(leadingWord) || englishOnly.contains(leadingWord)
    return looksEnglish ? "en" : "?"
}
/// Runs Vision text recognition on one image and returns its non-empty lines.
///
/// For every observation the top candidate is taken, its text is trimmed of
/// surrounding whitespace, and the center of the observation's bounding box is
/// recorded. The vertical center is stored as `1 - center` so that ascending `cy`
/// matches the top-to-bottom order `groupRows` relies on.
///
/// A failed recognition pass is reported on standard error and yields an empty
/// array, so one bad image cannot abort the whole batch but is no longer
/// indistinguishable from an image that genuinely contains no text.
///
/// - Parameter cgImage: The image to scan.
/// - Returns: Recognized lines in the order Vision reported them; empty when
///   nothing was recognized or recognition failed.
func recognize(_ cgImage: CGImage) -> [RecognizedLine] {
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    let req = VNRecognizeTextRequest()
    req.recognitionLevel = .accurate
    // NOTE(review): "es" may not be a language identifier Vision accepts — confirm
    // against VNRecognizeTextRequest's supported recognition languages.
    req.recognitionLanguages = ["es-ES", "es", "en-US"]
    req.usesLanguageCorrection = true
    if #available(macOS 13.0, *) { req.automaticallyDetectsLanguage = true }

    do {
        try handler.perform([req])
    } catch {
        // Best effort: keep going with the next image, but leave a trace.
        FileHandle.standardError.write(Data("warning: text recognition failed: \(error)\n".utf8))
        return []
    }

    return (req.results ?? []).compactMap { obs -> RecognizedLine? in
        guard let top = obs.topCandidates(1).first else { return nil }
        let text = top.string.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }
        let box = obs.boundingBox
        return RecognizedLine(
            text: text,
            cx: Double(box.origin.x + box.width / 2),
            // Flip the vertical axis so smaller values sort first in `groupRows`.
            cy: Double(1.0 - (box.origin.y + box.height / 2)),
            confidence: Double(top.confidence)
        )
    }
}
/// Splits one visual row into text cells, ordered left to right.
///
/// The lines are sorted by horizontal center and cut at the largest
/// center-to-center gaps. At most `desiredCells` cells are produced; when the row
/// has fewer lines than that, each line becomes its own cell instead of the whole
/// row collapsing into a single cell (which made callers drop the row entirely).
///
/// - Parameters:
///   - lines: The lines belonging to one row, in any order.
///   - desiredCells: Target cell count, e.g. 2 for a 2-column table, 4 for two
///     pairs side by side. Values below 1 are treated as 1.
/// - Returns: `min(desiredCells, lines.count)` cells (at least one) with their
///   texts joined by single spaces, or an empty array when `lines` is empty.
func splitRow(_ lines: [RecognizedLine], into desiredCells: Int) -> [String] {
    guard !lines.isEmpty else { return [] }

    let sorted = lines.sorted { $0.cx < $1.cx }
    // Cannot cut a row into more cells than it has lines, nor into fewer than one.
    let cellCount = max(1, min(desiredCells, sorted.count))
    guard cellCount > 1 else {
        return [sorted.map(\.text).joined(separator: " ")]
    }

    // Gap i is the distance between line i and the line to its left.
    let gaps = (1..<sorted.count).map { (idx: $0, gap: sorted[$0].cx - sorted[$0 - 1].cx) }
    // Keep the (cellCount - 1) widest gaps, then restore left-to-right order.
    let splitAt = gaps
        .sorted { $0.gap > $1.gap }
        .prefix(cellCount - 1)
        .map(\.idx)
        .sorted()

    var cells: [[RecognizedLine]] = []
    var start = 0
    for boundary in splitAt {
        cells.append(Array(sorted[start..<boundary]))
        start = boundary
    }
    cells.append(Array(sorted[start...]))

    return cells.map {
        $0.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces)
    }
}
/// Groups lines into rows by vertical proximity.
///
/// Lines are visited in ascending `cy`. A line joins the current row unless its
/// `cy` differs from the previously visited line's by more than `tol`, in which
/// case it opens a new row. The comparison is always against the immediately
/// preceding line, not against the row's first line.
///
/// - Parameters:
///   - lines: Lines to cluster, in any order.
///   - tol: Largest vertical distance between consecutive lines of one row.
/// - Returns: Rows in top-to-bottom order; empty when `lines` is empty.
func groupRows(_ lines: [RecognizedLine], tol: Double = 0.025) -> [[RecognizedLine]] {
    var rows: [[RecognizedLine]] = []
    for line in lines.sorted(by: { $0.cy < $1.cy }) {
        // The very first line has no predecessor and always opens a row.
        let opensNewRow = rows.last?.last.map { abs(line.cy - $0.cy) > tol } ?? true
        if opensNewRow {
            rows.append([line])
        } else {
            rows[rows.count - 1].append(line)
        }
    }
    return rows
}
/// Estimates how many columns an image's table has from where its lines sit.
///
/// Every line's horizontal center is dropped into one of eleven buckets, each
/// 10% of the width wide (out-of-range values are clamped to the first or last
/// bucket). A bucket counts as busy when it holds at least
/// `max(2, lines.count / 10)` lines. Busy buckets are counted individually, so
/// two adjacent busy buckets count as two.
///
/// - Parameter lines: All recognized lines of one image.
/// - Returns: 2 for zero to two busy buckets (and for empty input), 3 for exactly
///   three, 4 for four or more.
func detectColumnCount(_ lines: [RecognizedLine]) -> Int {
    if lines.isEmpty { return 2 }

    let step = 0.10
    let bucketCount = Int(1.0 / step) + 1
    let histogram = lines.reduce(into: [Int](repeating: 0, count: bucketCount)) { counts, line in
        let slot = min(max(0, Int(line.cx / step)), bucketCount - 1)
        counts[slot] += 1
    }

    let minimumHits = max(2, lines.count / 10)
    let busyBuckets = histogram.reduce(0) { $0 + ($1 >= minimumHits ? 1 : 0) }

    // Most tables are 2-column; two ES/EN pairs side by side show up as 4.
    // Layouts with a single busy bucket are treated as 2-column as well.
    if busyBuckets <= 2 { return 2 }
    return busyBuckets == 3 ? 3 : 4
}
/// Turns the cells of one row into Spanish→English pairs.
///
/// - 2 columns, at least 2 cells: `[es, en]` → one pair from the first two cells.
/// - 3 columns, at least 3 cells: `[es, en, en-alternate]` → one pair whose
///   English side is the two English cells joined with `" / "`.
/// - 4 columns, at least 4 cells: `[es1, en1, es2, en2]` → two pairs.
/// - Anything else with at least 2 cells: the first cell paired with the
///   remaining cells joined by spaces.
///
/// - Parameters:
///   - cells: The row's cells, left to right.
///   - columnCount: Column count detected for the image.
/// - Returns: Zero, one, or two `(left, right)` pairs; empty when fewer than two
///   cells are available.
func cellsToPairs(_ cells: [String], columnCount: Int) -> [(String, String)] {
    if columnCount == 2, cells.count >= 2 {
        return [(cells[0], cells[1])]
    }
    if columnCount == 3, cells.count >= 3 {
        // Keep both English variants rather than discarding one.
        return [(cells[0], "\(cells[1]) / \(cells[2])")]
    }
    if columnCount == 4, cells.count >= 4 {
        return [(cells[0], cells[1]), (cells[2], cells[3])]
    }

    // Fallback for rows that do not match the detected layout.
    guard cells.count >= 2 else { return [] }
    return [(cells[0], cells[1...].joined(separator: " "))]
}
/// Puts the Spanish side first when a pair is clearly reversed.
///
/// Both sides are run through `classify`. The pair is flipped only when the left
/// side comes back `"en"` and the right side `"es"`; every other combination,
/// including any `"?"`, is returned untouched.
///
/// - Parameter pair: A `(left, right)` pair as read from the row.
/// - Returns: The pair, swapped if it was English-then-Spanish.
func orientPair(_ pair: (String, String)) -> (String, String) {
    let leftLanguage = classify(pair.0)
    let rightLanguage = classify(pair.1)
    guard leftLanguage == "en", rightLanguage == "es" else {
        return pair
    }
    return (pair.1, pair.0)
}
// MARK: - Main

/// Builds the oriented pairs for one image from its recognized lines.
/// Rows with fewer than two lines cannot form a pair and are skipped; pairs with
/// an empty side are dropped. Every pair of a row carries that row's mean
/// recognition confidence.
private func extractPairs(from lines: [RecognizedLine], columnCount: Int) -> [Pair] {
    var pairs: [Pair] = []
    for row in groupRows(lines, tol: 0.025) where row.count >= 2 {
        // Same for every pair in the row, so compute it once.
        let rowConfidence = row.reduce(0.0) { $0 + $1.confidence } / Double(row.count)
        let cells = splitRow(row, into: columnCount)
        for rawPair in cellsToPairs(cells, columnCount: columnCount) {
            let (es, en) = orientPair(rawPair)
            guard !es.isEmpty, !en.isEmpty else { continue }
            pairs.append(Pair(es: es, en: en, confidence: rowConfidence))
        }
    }
    return pairs
}

var results: [String: ImageResult] = [:]
var processed = 0
var unreadable = 0
let startTime = Date()

for name in imageNames {
    processed += 1

    // Drain AppKit/Vision temporaries per image instead of letting them pile up
    // for the whole batch. Early exits inside the pool use `return`, so the
    // progress report below runs for every image.
    autoreleasepool {
        let url = oebpsDir.appendingPathComponent(name)
        guard let nsImg = NSImage(contentsOf: url),
              let tiff = nsImg.tiffRepresentation,
              let rep = NSBitmapImageRep(data: tiff),
              let cg = rep.cgImage else {
            // No result entry is written for an unreadable image (unchanged), but
            // the skip is now visible.
            unreadable += 1
            FileHandle.standardError.write(Data("warning: could not load image \(url.path); skipping\n".utf8))
            return
        }

        let lines = recognize(cg)
        guard !lines.isEmpty else {
            results[name] = ImageResult(pairs: [], columnCount: 2, strategy: "empty", lineCount: 0)
            return
        }

        let columnCount = detectColumnCount(lines)
        results[name] = ImageResult(
            pairs: extractPairs(from: lines, columnCount: columnCount),
            columnCount: columnCount,
            strategy: "bbox-row-split",
            lineCount: lines.count
        )
    }

    if processed % 50 == 0 || processed == imageNames.count {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(processed) / max(elapsed, 0.001)
        let eta = Double(imageNames.count - processed) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f img/s eta %.0fs", processed, imageNames.count, rate, eta))
    }
}

// Sorted keys keep the output stable between runs.
let enc = JSONEncoder()
enc.outputFormatting = [.sortedKeys]
do {
    try enc.encode(results).write(to: outputURL)
} catch {
    FileHandle.standardError.write(Data("error: could not write \(outputURL.path): \(error)\n".utf8))
    exit(1)
}

let totalPairs = results.values.reduce(0) { $0 + $1.pairs.count }
let emptyTables = results.values.filter { $0.pairs.isEmpty }.count
print("Wrote \(results.count) results, \(totalPairs) total pairs, \(emptyTables) unpaired")
if unreadable > 0 {
    FileHandle.standardError.write(Data("warning: \(unreadable) image(s) could not be loaded\n".utf8))
}