#!/usr/bin/env swift
//
// Bounding-box OCR over every vocab image, producing Spanish→English pairs.
// Much higher accuracy than the flat-OCR block-alternation heuristic because
// we use each recognized line's position on the page: rows are clustered by
// Y-coordinate and cells within a row are split by the biggest X gap.
//
// Usage: swift ocr_all_vocab.swift <image-list.json> <oebps-dir> <output.json>

import Foundation
import Vision
import AppKit

// MARK: - Command-line arguments

guard CommandLine.arguments.count >= 4 else {
    print("Usage: swift ocr_all_vocab.swift <image-list.json> <oebps-dir> <output.json>")
    exit(1)
}

let imageListURL = URL(fileURLWithPath: CommandLine.arguments[1])
let oebpsDir = URL(fileURLWithPath: CommandLine.arguments[2])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[3])

guard let listData = try? Data(contentsOf: imageListURL),
      let imageNames = try? JSONDecoder().decode([String].self, from: listData) else {
    print("Could not load image list at \(imageListURL.path)")
    exit(1)
}

print("Processing \(imageNames.count) images...")

// MARK: - Models

/// One OCR-recognized text line with its normalized page position.
struct RecognizedLine {
    let text: String
    /// Center X in normalized page coordinates (0 = left edge, 1 = right edge).
    let cx: Double
    /// Center Y flipped to top-down reading order (0 = top, 1 = bottom).
    let cy: Double
    let confidence: Double
}

/// A single Spanish→English vocabulary pair with the row's OCR confidence.
struct Pair: Encodable {
    var es: String
    var en: String
    var confidence: Double
}

/// Per-image OCR outcome written into the output JSON.
struct ImageResult: Encodable {
    var pairs: [Pair]
    var columnCount: Int
    var strategy: String
    var lineCount: Int
}

// MARK: - Language classification

// Declared as Set<Character> so `classify` can test individual characters of a
// string directly. (A plain `Set([...])` here infers Set<String>, and
// `contains(_: Character)` would not type-check against it.)
let spanishAccents: Set<Character> = ["á","é","í","ó","ú","ñ","ü","Á","É","Í","Ó","Ú","Ñ","Ü","¿","¡"]
let spanishArticles: Set = ["el","la","los","las","un","una","unos","unas"]
let englishStarters: Set = ["the","a","an","to","my","his","her","our","their","your"]
let englishOnly: Set = ["the","he","she","it","we","they","is","are","was","were","been","have","has","had","will","would"]

/// Best-effort language guess for a cell's text.
/// - Returns: "es", "en", or "?" when no signal is found.
func classify(_ s: String) -> String {
    let lower = s.lowercased()
    // Any Spanish-only character is a strong signal.
    if lower.contains(where: { spanishAccents.contains($0) }) { return "es" }
    // Otherwise look at the first word, minus surrounding punctuation.
    let first = lower.split(separator: " ").first.map(String.init)?
        .trimmingCharacters(in: .punctuationCharacters) ?? ""
    if spanishArticles.contains(first) { return "es" }
    if englishStarters.contains(first) || englishOnly.contains(first) { return "en" }
    return "?"
}

// MARK: - OCR

/// Run Vision text recognition on one image and return each recognized line
/// with its center position in normalized coordinates.
func recognize(_ cgImage: CGImage) -> [RecognizedLine] {
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    let req = VNRecognizeTextRequest()
    req.recognitionLevel = .accurate
    req.recognitionLanguages = ["es-ES", "es", "en-US"]
    req.usesLanguageCorrection = true
    if #available(macOS 13.0, *) {
        req.automaticallyDetectsLanguage = true
    }
    // Best-effort: a failed perform simply yields no results for this image.
    try? handler.perform([req])
    var out: [RecognizedLine] = []
    for obs in req.results ?? [] {
        guard let top = obs.topCandidates(1).first else { continue }
        let s = top.string.trimmingCharacters(in: .whitespaces)
        if s.isEmpty { continue }
        let bb = obs.boundingBox
        out.append(RecognizedLine(
            text: s,
            cx: Double(bb.origin.x + bb.width / 2),
            // Vision's origin is bottom-left; flip Y so cy grows downward.
            cy: Double(1.0 - (bb.origin.y + bb.height / 2)),
            confidence: Double(top.confidence)
        ))
    }
    return out
}

// MARK: - Row/cell geometry

/// Split a row's lines into cells by cutting at the largest X gaps.
/// `desiredCells` = 2 for 2-col, 4 for 2-pair, etc.
/// NOTE(review): the gap-finding interior and tail of this function were
/// reconstructed from the surviving guard branch and call sites — confirm
/// against the original source if available.
func splitRow(_ lines: [RecognizedLine], into desiredCells: Int) -> [String] {
    guard lines.count >= desiredCells else {
        // Merge into fewer cells: just concatenate left-to-right.
        return [lines.map(\.text).joined(separator: " ")]
    }
    let sorted = lines.sorted { $0.cx < $1.cx }
    // Measure the horizontal gap before each line (index i = gap between i-1 and i).
    var gaps: [(idx: Int, gap: Double)] = []
    for i in 1..<sorted.count {
        gaps.append((idx: i, gap: sorted[i].cx - sorted[i - 1].cx))
    }
    // Cut at the (desiredCells - 1) widest gaps, left-to-right.
    let splitAt = gaps
        .sorted { $0.gap > $1.gap }
        .prefix(desiredCells - 1)
        .map(\.idx)
        .sorted()
    var cells: [[RecognizedLine]] = []
    var start = 0
    for s in splitAt {
        cells.append(Array(sorted[start..<s]))
        start = s
    }
    cells.append(Array(sorted[start...]))
    return cells.map { $0.map(\.text).joined(separator: " ") }
}

/// Cluster lines into visual rows: after sorting by Y, a line starts a new row
/// when it sits more than `tol` below the previous line's center.
/// NOTE(review): this signature was reconstructed from the call site
/// (`groupRows(lines, tol: 0.025)`) and the surviving body.
func groupRows(_ lines: [RecognizedLine], tol: Double) -> [[RecognizedLine]] {
    let sorted = lines.sorted { $0.cy < $1.cy }
    var rows: [[RecognizedLine]] = []
    var current: [RecognizedLine] = []
    for l in sorted {
        // Compared against the row's LAST line, so a row can drift downward
        // across several closely-spaced lines.
        if let last = current.last, abs(l.cy - last.cy) > tol {
            rows.append(current)
            current = [l]
        } else {
            current.append(l)
        }
    }
    if !current.isEmpty { rows.append(current) }
    return rows
}

/// Detect likely column count: look at how many x-cluster peaks exist across all rows.
/// Clusters X-coords from all lines into buckets of 10% width.
func detectColumnCount(_ lines: [RecognizedLine]) -> Int {
    guard !lines.isEmpty else { return 2 }
    let step = 0.10
    var buckets = [Int](repeating: 0, count: Int(1.0 / step) + 1)
    for l in lines {
        let b = min(max(0, Int(l.cx / step)), buckets.count - 1)
        buckets[b] += 1
    }
    // A peak = a bucket with count > 10% of total lines.
    let threshold = max(2, lines.count / 10)
    let peaks = buckets.filter { $0 >= threshold }.count
    // Most tables are 2-col (peaks = 2). Some 4-col (2 ES/EN pairs side by side → peaks = 4).
    // Roman/decorative layouts may show 1 peak; treat as 2.
    switch peaks {
    case 0, 1, 2: return 2
    case 3: return 3
    default: return 4
    }
}

/// Merge label-less cells into Spanish→English pairs.
/// `cells` is a row's cells (length = columnCount). For N=2, [es, en]. For N=4,
/// [es1, en1, es2, en2] (two pairs). For N=3, [es, en_short, en_long] (rare, merge).
func cellsToPairs(_ cells: [String], columnCount: Int) -> [(String, String)] {
    switch columnCount {
    case 2 where cells.count >= 2:
        return [(cells[0], cells[1])]
    case 3 where cells.count >= 3:
        // 3-col source: es | en | en-alternate. Keep all three by merging EN sides.
        return [(cells[0], [cells[1], cells[2]].joined(separator: " / "))]
    case 4 where cells.count >= 4:
        return [(cells[0], cells[1]), (cells[2], cells[3])]
    default:
        // Fallback: first cell is Spanish, everything else becomes English.
        if cells.count >= 2 {
            return [(cells[0], cells.dropFirst().joined(separator: " "))]
        }
        return []
    }
}

/// Swap pair if orientation is backwards (English on left, Spanish on right).
/// Swap a (left, right) pair when classification says it is reversed
/// (English on the left, Spanish on the right); otherwise return it as-is.
func orientPair(_ pair: (String, String)) -> (String, String) {
    let (a, b) = pair
    let ca = classify(a), cb = classify(b)
    if ca == "en" && cb == "es" { return (b, a) }
    return pair
}

// MARK: - Main loop

var results: [String: ImageResult] = [:]
var processed = 0
let startTime = Date()

for name in imageNames {
    processed += 1
    let url = oebpsDir.appendingPathComponent(name)
    // Decode via TIFF → bitmap rep to obtain a CGImage; skip unreadable files.
    guard let nsImg = NSImage(contentsOf: url),
          let tiff = nsImg.tiffRepresentation,
          let rep = NSBitmapImageRep(data: tiff),
          let cg = rep.cgImage else { continue }

    let lines = recognize(cg)
    if lines.isEmpty {
        results[name] = ImageResult(pairs: [], columnCount: 2, strategy: "empty", lineCount: 0)
        continue
    }

    let columnCount = detectColumnCount(lines)
    let rows = groupRows(lines, tol: 0.025)
    var pairs: [Pair] = []
    for row in rows {
        guard row.count >= 2 else { continue }
        let cells = splitRow(row, into: columnCount)
        let rawPairs = cellsToPairs(cells, columnCount: columnCount)
        // Row confidence is loop-invariant: compute it once per row, not per pair.
        // (row.count >= 2 here, so the division is safe.)
        let avgConf = row.reduce(0.0) { $0 + $1.confidence } / Double(row.count)
        for p in rawPairs {
            let (es, en) = orientPair(p)
            if es.isEmpty || en.isEmpty { continue }
            pairs.append(Pair(es: es, en: en, confidence: avgConf))
        }
    }

    results[name] = ImageResult(
        pairs: pairs,
        columnCount: columnCount,
        strategy: "bbox-row-split",
        lineCount: lines.count
    )

    // Progress report every 50 images (and on the final one).
    if processed % 50 == 0 || processed == imageNames.count {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(processed) / max(elapsed, 0.001)
        let eta = Double(imageNames.count - processed) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f img/s eta %.0fs", processed, imageNames.count, rate, eta))
    }
}

// MARK: - Output

let enc = JSONEncoder()
enc.outputFormatting = [.sortedKeys]
do {
    try enc.encode(results).write(to: outputURL)
} catch {
    // Fail loudly with a readable message instead of an uncaught-error crash.
    print("Failed to write \(outputURL.path): \(error)")
    exit(1)
}

let totalPairs = results.values.reduce(0) { $0 + $1.pairs.count }
let emptyTables = results.values.filter { $0.pairs.isEmpty }.count
print("Wrote \(results.count) results, \(totalPairs) total pairs, \(emptyTables) unpaired")