Spanish/Conjuga/Scripts/textbook/repair_quarantined.swift

#!/usr/bin/env swift
// Re-OCR the images referenced in quarantined_cards.json using Vision with
// bounding-box info, then pair lines by column position (left = Spanish,
// right = English) instead of by document read order.
//
// Output: repaired_cards.json —
//   {"byImage": {"f0142-02.jpg": {"pairs": [{"es": ..., "en": ..., "confidence": ...}, ...],
//                                 "lineCount": ..., "strategy": ...}},
//    "totalPairs": ...}

import Foundation
import Vision
import AppKit

// Three positional arguments are required: the quarantine JSON, the directory
// holding the referenced images, and the path of the JSON file to write.
if CommandLine.arguments.count < 4 {
    print("Usage: swift repair_quarantined.swift <quarantined.json> <epub_oebps_dir> <output.json>")
    exit(1)
}

let quarantinedURL = URL(fileURLWithPath: CommandLine.arguments[1])
let imageDir = URL(fileURLWithPath: CommandLine.arguments[2])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[3])

// Read the quarantine file and extract its "cards" array. An unreadable file,
// invalid JSON, or an unexpected top-level shape all end the run the same way.
let quarantineData = try? Data(contentsOf: quarantinedURL)
let quarantineJSON = quarantineData.flatMap { try? JSONSerialization.jsonObject(with: $0) } as? [String: Any]
guard let cards = quarantineJSON?["cards"] as? [[String: Any]] else {
    print("Could not load \(quarantinedURL.path)")
    exit(1)
}

// Several cards can point at the same image; each image only needs one OCR pass.
// Cards without a string "sourceImage" are ignored.
var uniqueImages = Set(cards.compactMap { $0["sourceImage"] as? String })
print("Unique images to re-OCR: \(uniqueImages.count)")

/// One line of text recognized in an image, reduced to what the pairing step needs:
/// the text, the center of its bounding box, and the recognition confidence.
struct RecognizedLine {
    /// Recognized text of the line.
    let text: String
    /// Horizontal center of the line's bounding box, normalized to 0...1.
    let cx: CGFloat
    /// Vertical center of the line's bounding box, normalized to 0...1 and
    /// measured from the top of the image.
    let cy: CGFloat
    /// Confidence reported for the recognized text.
    let confidence: Float
}

/// A Spanish/English pair recovered from one row of an image; encoded into the output JSON.
struct Pair: Encodable {
    /// Spanish side of the pair.
    var es: String
    /// English side of the pair.
    var en: String
    /// Mean recognition confidence of the lines that were merged into this pair.
    var confidence: Double
}

/// Everything recorded for a single image in the output JSON.
struct ImageResult: Encodable {
    /// Pairs recovered from the image, in the order `pairByPosition` produced them.
    var pairs: [Pair]
    /// Number of lines recognized in the image, whether or not they ended up in a pair.
    var lineCount: Int
    /// Strategy label returned by `pairByPosition` ("row-pair", "no-rows" or "empty").
    var strategy: String
}

/// Guesses the language of a recognized line from cheap lexical cues.
///
/// The checks run in this order:
/// 1. Any Spanish-only character (accented vowel, ñ, ü, ¿, ¡) means Spanish.
/// 2. Otherwise the first word decides: a Spanish article means Spanish, a common
///    English starter means English.
///
/// Words are split on any whitespace and stripped of surrounding punctuation, and
/// tokens without a letter (list numbers, bullets) are skipped, so `"(the house)"`
/// and `"1. the house"` are both judged by `"the"`.
///
/// - Parameter s: Text of one line or of one side of a row.
/// - Returns: `"es"`, `"en"`, or `"?"` when no cue applies.
func classify(_ s: String) -> String {
    let lower = s.lowercased()

    let accentChars: Set<Character> = ["á", "é", "í", "ó", "ú", "ñ", "ü", "¿", "¡"]
    if lower.contains(where: { accentChars.contains($0) }) { return "es" }

    // Trim each token to the span between its first and last letter; a token with
    // no letter at all carries no language information and is dropped.
    let words: [String] = lower
        .split(whereSeparator: { $0.isWhitespace })
        .compactMap { (token: Substring) -> String? in
            guard let start = token.firstIndex(where: { $0.isLetter }),
                  let end = token.lastIndex(where: { $0.isLetter }) else { return nil }
            return String(token[start...end])
        }
    guard let first = words.first else { return "?" }

    let esArticles: Set<String> = ["el", "la", "los", "las", "un", "una", "unos", "unas"]
    let enStarters: Set<String> = ["the", "a", "an", "to", "my", "his", "her", "our", "their"]
    if esArticles.contains(first) { return "es" }
    // "a" is an English article but also a Spanish preposition. Followed by a
    // Spanish article ("a la derecha", "a un amigo") it is the preposition.
    if first == "a", words.count > 1, esArticles.contains(words[1]) { return "es" }
    if enStarters.contains(first) { return "en" }
    return "?"
}

/// Runs Vision text recognition on an image and returns one entry per recognized line.
///
/// Each entry holds the top candidate's text (trimmed of surrounding whitespace), the
/// center of the observation's bounding box in normalized coordinates measured from
/// the top-left, and the candidate's confidence. Observations without a candidate or
/// with blank text are dropped.
///
/// - Parameter cgImage: Image to scan.
/// - Returns: The recognized lines in the order Vision reported them. The array is
///   empty when nothing was recognized or when the request failed; a failure is
///   printed so it can be told apart from an image without text.
func recognizeLines(cgImage: CGImage) -> [RecognizedLine] {
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    if #available(macOS 13.0, *) {
        request.automaticallyDetectsLanguage = true
    }

    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    do {
        try handler.perform([request])
    } catch {
        // Stay best-effort (the caller simply sees zero lines), but report the reason
        // instead of discarding it.
        print("Vision text recognition failed: \(error)")
        return []
    }

    return (request.results ?? []).compactMap { observation -> RecognizedLine? in
        guard let candidate = observation.topCandidates(1).first else { return nil }
        let text = candidate.string.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }

        // Vision's boundingBox is normalized with its origin at the lower-left;
        // flip Y so that smaller values are nearer the top of the image.
        let box = observation.boundingBox
        let centerX = box.origin.x + box.width / 2
        let centerYFromTop = 1.0 - (box.origin.y + box.height / 2)
        return RecognizedLine(
            text: text,
            cx: centerX,
            cy: centerYFromTop,
            confidence: candidate.confidence
        )
    }
}

/// Pairs recognized lines by layout instead of read order.
///
/// Lines are grouped into rows by vertical position, then each row is split at its
/// widest horizontal gap: everything left of the gap is taken as Spanish, everything
/// right of it as English. The two sides are swapped only when `classify` positively
/// identifies the left side as English and the right side as Spanish.
///
/// Rows that contain a single line are skipped because there is nothing to pair
/// them with.
///
/// - Parameters:
///   - lines: Recognized lines with normalized centers (`cy` measured from the top).
///   - rowTolerance: Largest vertical distance, as a fraction of the image height,
///     between two consecutive lines (ordered top to bottom) that still places them
///     in the same row. This is a fixed threshold, not derived from the line spacing;
///     pass a smaller value for images with tightly packed rows.
/// - Returns: The pairs in top-to-bottom row order, plus a strategy label: `"empty"`
///   when `lines` is empty, `"no-rows"` when no row yielded a pair, `"row-pair"`
///   otherwise.
func pairByPosition(_ lines: [RecognizedLine], rowTolerance: CGFloat = 0.015) -> ([Pair], String) {
    guard !lines.isEmpty else { return ([], "empty") }

    // Walk the lines top to bottom. A line joins the current row while it lies within
    // `rowTolerance` of the line just before it; otherwise it opens a new row.
    var rows: [[RecognizedLine]] = []
    for line in lines.sorted(by: { $0.cy < $1.cy }) {
        if let previous = rows.last?.last, abs(line.cy - previous.cy) <= rowTolerance {
            rows[rows.count - 1].append(line)
        } else {
            rows.append([line])
        }
    }

    var pairs: [Pair] = []
    for row in rows where row.count >= 2 {
        let byX = row.sorted { $0.cx < $1.cx }

        // The column boundary is the widest gap between horizontally adjacent line
        // centers (the leftmost such gap on a tie).
        var widestGap: CGFloat = 0
        var splitIndex = 1
        for index in byX.indices.dropFirst() {
            let gap = byX[index].cx - byX[index - 1].cx
            if gap > widestGap {
                widestGap = gap
                splitIndex = index
            }
        }

        let leftText = byX[..<splitIndex].map(\.text).joined(separator: " ")
            .trimmingCharacters(in: .whitespaces)
        let rightText = byX[splitIndex...].map(\.text).joined(separator: " ")
            .trimmingCharacters(in: .whitespaces)
        guard !leftText.isEmpty, !rightText.isEmpty else { continue }

        // Trust the layout unless both sides clearly say the columns are reversed.
        let isReversed = classify(leftText) == "en" && classify(rightText) == "es"
        let (es, en) = isReversed ? (rightText, leftText) : (leftText, rightText)

        let meanConfidence = byX.reduce(Float(0)) { $0 + $1.confidence } / Float(byX.count)
        pairs.append(Pair(es: es, en: en, confidence: Double(meanConfidence)))
    }

    return (pairs, pairs.isEmpty ? "no-rows" : "row-pair")
}

// OCR results keyed by image file name. Images that cannot be decoded get no entry.
var results: [String: ImageResult] = [:]

// Sorted so the progress log is in a stable order from run to run.
for name in uniqueImages.sorted() {
    // This script has no run loop draining an autorelease pool, so give every image
    // its own pool; otherwise the autoreleased temporaries produced while decoding
    // and recognizing each image would pile up across the whole batch.
    autoreleasepool {
        let url = imageDir.appendingPathComponent(name)
        guard let img = NSImage(contentsOf: url),
              let tiff = img.tiffRepresentation,
              let rep = NSBitmapImageRep(data: tiff),
              let cg = rep.cgImage else {
            print("\(name): could not load")
            return
        }
        let lines = recognizeLines(cgImage: cg)
        let (pairs, strategy) = pairByPosition(lines)
        results[name] = ImageResult(pairs: pairs, lineCount: lines.count, strategy: strategy)
        print("\(name): \(lines.count) lines -> \(pairs.count) pairs via \(strategy)")
    }
}

/// Top-level object of the output JSON file.
struct Output: Encodable {
    /// Per-image results, keyed by image name.
    var byImage: [String: ImageResult]
    /// Total number of pairs across all images.
    var totalPairs: Int
}
// Collect every per-image result together with the overall pair count.
let output = Output(
    byImage: results,
    totalPairs: results.values.reduce(0) { $0 + $1.pairs.count }
)

// Sorted keys keep the file stable between runs, which makes diffs meaningful.
let enc = JSONEncoder()
enc.outputFormatting = [.prettyPrinted, .sortedKeys]
do {
    try enc.encode(output).write(to: outputURL)
} catch {
    // Report the failure and exit non-zero, like the other failure paths in this
    // script, instead of trapping on an uncaught top-level error.
    print("Could not write \(outputURL.path): \(error)")
    exit(1)
}
print("Wrote \(output.totalPairs) repaired pairs to \(outputURL.path)")