Files
Spanish/Conjuga/Scripts/textbook/ocr_images.swift
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

111 lines
3.7 KiB
Swift

#!/usr/bin/env swift
// OCR every JPG in the given input directory using the macOS Vision framework.
// Output: JSON map of { "<filename>": { "lines": [...], "confidence": Double } }
//
// Usage: swift ocr_images.swift <input_dir> <output_json>
// Example: swift ocr_images.swift ../../../epub_extract/OEBPS ocr.json
import Foundation
import Vision
import AppKit
// ── Argument parsing ─────────────────────────────────────────────────
// Fix: the usage/error text now goes to stderr so it never pollutes
// stdout (the progress lines below intentionally remain on stdout and
// may be piped/teed by run_pipeline.sh).
guard CommandLine.arguments.count >= 3 else {
    FileHandle.standardError.write(Data("Usage: swift ocr_images.swift <input_dir> <output_json>\n".utf8))
    exit(1)
}
// Directory scanned recursively for images, and destination of the JSON map.
let inputDir = URL(fileURLWithPath: CommandLine.arguments[1])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])
// ── Discover candidate images ────────────────────────────────────────
// Skip images that are icons/inline markers, not real content.
let skipSubstrings = ["Common", "cover", "title"]
// Recognized raster formats produced by the EPUB extraction step.
let imageExtensions: Set<String> = ["jpg", "jpeg", "png"]
let fileManager = FileManager.default
guard let enumerator = fileManager.enumerator(at: inputDir, includingPropertiesForKeys: nil) else {
    print("Could not enumerate \(inputDir.path)")
    exit(1)
}
var jpgs: [URL] = []
for case let url as URL in enumerator {
    let name = url.lastPathComponent
    // Fix: match extensions case-insensitively. The previous
    // `hasSuffix(".jpg")`-style checks silently skipped files such as
    // "IMG01.JPG" or "Page3.PNG", which are common on macOS volumes.
    guard imageExtensions.contains(url.pathExtension.lowercased()) else { continue }
    if skipSubstrings.contains(where: { name.contains($0) }) { continue }
    jpgs.append(url)
}
// Deterministic processing order => stable progress output and stable
// (sorted-key) JSON across runs.
jpgs.sort { $0.lastPathComponent < $1.lastPathComponent }
print("Found \(jpgs.count) images to OCR")
/// OCR output for a single image: the recognized text lines plus the mean
/// top-candidate confidence across those lines (0 when no text was found).
/// Encoded verbatim into the output JSON map keyed by filename.
struct OCRResult: Encodable {
    let lines: [String]
    let confidence: Double
}
// ── OCR loop ─────────────────────────────────────────────────────────
// Filename -> OCR result for every successfully processed image.
var results: [String: OCRResult] = [:]
let total = jpgs.count
var processed = 0
let startTime = Date()
for url in jpgs {
    processed += 1
    let name = url.lastPathComponent
    // Fix: wrap each iteration in autoreleasepool so the autoreleased
    // TIFF / bitmap buffers created below are released per image rather
    // than accumulating for the whole run — without this, batches of
    // hundreds of images grow peak memory unboundedly.
    autoreleasepool {
        guard let nsImage = NSImage(contentsOf: url),
              let tiffData = nsImage.tiffRepresentation,
              let bitmap = NSBitmapImageRep(data: tiffData),
              let cgImage = bitmap.cgImage else {
            print("\(processed)/\(total) \(name) — could not load")
            return  // was `continue`; skips only this image
        }
        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate
        request.recognitionLanguages = ["es-ES", "es", "en-US"]
        request.usesLanguageCorrection = true
        // For the 2020 book, automaticallyDetectsLanguage helps with mixed content.
        if #available(macOS 13.0, *) {
            request.automaticallyDetectsLanguage = true
        }
        do {
            try handler.perform([request])
            let observations = request.results ?? []
            var lines: [String] = []
            var totalConfidence: Float = 0
            var count = 0
            // Keep only the best candidate per observed text region and drop
            // whitespace-only strings so they don't skew the average.
            for obs in observations {
                if let top = obs.topCandidates(1).first {
                    let s = top.string.trimmingCharacters(in: .whitespaces)
                    if !s.isEmpty {
                        lines.append(s)
                        totalConfidence += top.confidence
                        count += 1
                    }
                }
            }
            // Mean confidence over kept lines; 0.0 when nothing was recognized.
            let avg = count > 0 ? Double(totalConfidence) / Double(count) : 0.0
            results[name] = OCRResult(lines: lines, confidence: avg)
        } catch {
            print("\(processed)/\(total) \(name) — error: \(error)")
        }
    }
    // Lightweight progress + ETA line every 50 images (and at the end).
    if processed % 50 == 0 || processed == total {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(processed) / max(elapsed, 0.001)
        let remaining = Double(total - processed) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f img/s eta %.0fs", processed, total, rate, remaining))
    }
}
// ── Serialize and write results ──────────────────────────────────────
let encoder = JSONEncoder()
// sortedKeys keeps the output diffable/stable across runs.
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
do {
    let data = try encoder.encode(results)
    // Fix: .atomic writes to a temp file then renames, so an interrupted
    // run can never leave a truncated JSON file for the downstream
    // pipeline stages to choke on.
    try data.write(to: outputURL, options: .atomic)
    print("Wrote \(results.count) OCR entries to \(outputURL.path)")
} catch {
    // Fix: failure diagnostics go to stderr; exit code already signals failure.
    FileHandle.standardError.write(Data("Error writing output: \(error)\n".utf8))
    exit(1)
}