#!/usr/bin/env swift
//
// OCR every image (jpg/jpeg/png) in the given input directory using the macOS
// Vision framework.
// Output: JSON map of { "<image filename>": { "lines": [...], "confidence": Double } }
//
// Usage:   swift ocr_images.swift <input_dir> <output.json>
// Example: swift ocr_images.swift ../../../epub_extract/OEBPS ocr.json

import Foundation
import Vision
import AppKit

// MARK: - Arguments

guard CommandLine.arguments.count >= 3 else {
    // BUGFIX: the usage message had lost its placeholders.
    print("Usage: swift ocr_images.swift <input_dir> <output.json>")
    exit(1)
}
let inputDir = URL(fileURLWithPath: CommandLine.arguments[1])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])

// Skip images that are icons/inline markers — not real content.
let skipSubstrings = ["Common", "cover", "title"]

// MARK: - Collect images

let fileManager = FileManager.default
guard let enumerator = fileManager.enumerator(at: inputDir, includingPropertiesForKeys: nil) else {
    print("Could not enumerate \(inputDir.path)")
    exit(1)
}

// Accepted image extensions, compared case-insensitively.
let imageExtensions: Set<String> = ["jpg", "jpeg", "png"]

var jpgs: [URL] = []
for case let url as URL in enumerator {
    let name = url.lastPathComponent
    // BUGFIX: the suffix check was case-sensitive and silently skipped
    // ".JPG"/".JPEG"/".PNG" files; compare the path extension lowercased.
    guard imageExtensions.contains(url.pathExtension.lowercased()) else { continue }
    if skipSubstrings.contains(where: { name.contains($0) }) { continue }
    jpgs.append(url)
}
jpgs.sort { $0.lastPathComponent < $1.lastPathComponent }
print("Found \(jpgs.count) images to OCR")

/// One OCR result per image: the recognized text lines plus the mean
/// confidence of the top candidate over all non-empty lines.
struct OCRResult: Encodable {
    var lines: [String]
    var confidence: Double
}

var results: [String: OCRResult] = [:]
let total = jpgs.count
var processed = 0
let startTime = Date()

for url in jpgs {
    processed += 1
    let name = url.lastPathComponent

    // Decode the image into a CGImage via an NSImage → TIFF → bitmap round-trip.
    guard let nsImage = NSImage(contentsOf: url),
          let tiffData = nsImage.tiffRepresentation,
          let bitmap = NSBitmapImageRep(data: tiffData),
          let cgImage = bitmap.cgImage
    else {
        print("\(processed)/\(total) \(name) — could not load")
        continue
    }

    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    // For the 2020 book, automaticallyDetectsLanguage helps with mixed content.
    if #available(macOS 13.0, *) {
        request.automaticallyDetectsLanguage = true
    }

    do {
        try handler.perform([request])
        let observations = request.results ?? []
        var lines: [String] = []
        var totalConfidence: Float = 0
        var count = 0
        for obs in observations {
            if let top = obs.topCandidates(1).first {
                let s = top.string.trimmingCharacters(in: .whitespaces)
                if !s.isEmpty {
                    lines.append(s)
                    totalConfidence += top.confidence
                    count += 1
                }
            }
        }
        // Mean confidence over non-empty lines; 0 when nothing was recognized.
        let avg = count > 0 ? Double(totalConfidence) / Double(count) : 0.0
        results[name] = OCRResult(lines: lines, confidence: avg)
    } catch {
        print("\(processed)/\(total) \(name) — error: \(error)")
    }

    // Progress report every 50 images and once at the end.
    if processed % 50 == 0 || processed == total {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(processed) / max(elapsed, 0.001)
        let remaining = Double(total - processed) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f img/s eta %.0fs", processed, total, rate, remaining))
    }
}

// MARK: - Write output

let encoder = JSONEncoder()
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
do {
    let data = try encoder.encode(results)
    try data.write(to: outputURL)
    print("Wrote \(results.count) OCR entries to \(outputURL.path)")
} catch {
    print("Error writing output: \(error)")
    exit(1)
}