#!/usr/bin/env swift
// Rasterize each page of a PDF at high DPI and OCR it with Vision.
// Output: { "<pageIndex>": { "lines": [...], "confidence": Double, "bookPage": Int? } }
// where <pageIndex> is the 0-based page index as a string. "bookPage" is
// omitted when no page number was detected.
//
// Usage:   swift ocr_pdf.swift <input.pdf> <output.json> [dpi]
// Example: swift ocr_pdf.swift "book.pdf" pdf_ocr.json 240

import Foundation
import Vision
import AppKit
import Quartz

// MARK: - Types

/// OCR output for a single PDF page.
struct PageResult: Encodable {
    /// Recognized text lines, in the order Vision returned them.
    var lines: [String]
    /// Mean confidence of the recognized lines; 0.0 when nothing was recognized.
    var confidence: Double
    /// Printed page number guessed from the first/last lines, if any.
    var bookPage: Int?
}

/// Reasons a page can fail before OCR even runs.
enum PageError: Error, CustomStringConvertible {
    case missingPage
    case renderFailed

    var description: String {
        switch self {
        case .missingPage: return "page could not be loaded"
        case .renderFailed: return "page could not be rasterized"
        }
    }
}

// MARK: - Helpers

/// Writes a diagnostic line to stderr so stdout carries only progress output.
func printError(_ message: String) {
    FileHandle.standardError.write(Data((message + "\n").utf8))
}

/// Tries to detect the printed book page number: a short numeric line in the
/// first 3 or last 3 entries (typical page-number placement).
/// Returns the first candidate that parses as an integer in 1...1000.
func detectBookPage(in lines: [String]) -> Int? {
    let candidates = Array(lines.prefix(3)) + Array(lines.suffix(3))
    for candidate in candidates {
        let trimmed = candidate.trimmingCharacters(in: .whitespaces)
        if let number = Int(trimmed), number >= 1 && number <= 1000 {
            return number
        }
    }
    return nil
}

/// Rasterizes `page` onto a white RGB bitmap at `scale` pixels per PDF point.
/// Returns nil when the page has no area or the bitmap cannot be created.
///
/// NOTE(review): the original rendering code was lost when this file was
/// mangled; this is a reconstruction. Confirm box choice and colour space
/// against the pre-mangling version.
func renderPage(_ page: PDFPage, scale: CGFloat) -> CGImage? {
    let bounds = page.bounds(for: .mediaBox)
    // bounds(for:) is unrotated, but drawing applies the page rotation, so a
    // page rotated by 90/270 degrees needs a bitmap with swapped sides.
    // NOTE(review): verify on a rotated sample page.
    let isSideways = abs(page.rotation) % 180 == 90
    let pointWidth = isSideways ? bounds.height : bounds.width
    let pointHeight = isSideways ? bounds.width : bounds.height
    let pixelWidth = Int((pointWidth * scale).rounded())
    let pixelHeight = Int((pointHeight * scale).rounded())

    guard pixelWidth > 0, pixelHeight > 0,
          let context = CGContext(
              data: nil,
              width: pixelWidth,
              height: pixelHeight,
              bitsPerComponent: 8,
              bytesPerRow: 0,
              space: CGColorSpaceCreateDeviceRGB(),
              bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue)
    else {
        return nil
    }

    // PDF pages are transparent by default; OCR needs an opaque white background.
    context.setFillColor(red: 1, green: 1, blue: 1, alpha: 1)
    context.fill(CGRect(x: 0, y: 0, width: pixelWidth, height: pixelHeight))
    context.scaleBy(x: scale, y: scale)
    page.draw(with: .mediaBox, to: context)
    return context.makeImage()
}

/// Renders page `index` of `document` and runs Vision text recognition on it.
/// - Throws: `PageError` when the page cannot be loaded or rasterized, or any
///   error raised by Vision while performing the request.
///
/// NOTE(review): the request configuration (accurate level, language
/// correction on) is reconstructed, not recovered from the original file.
func ocrPage(at index: Int, in document: PDFDocument, scale: CGFloat) throws -> PageResult {
    guard let page = document.page(at: index) else { throw PageError.missingPage }
    guard let image = renderPage(page, scale: scale) else { throw PageError.renderFailed }

    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.usesLanguageCorrection = true
    try VNImageRequestHandler(cgImage: image, options: [:]).perform([request])

    var lines: [String] = []
    var totalConfidence: Float = 0
    var count = 0
    for observation in request.results ?? [] {
        guard let best = observation.topCandidates(1).first else { continue }
        lines.append(best.string)
        totalConfidence += best.confidence
        count += 1
    }
    let avg = count > 0 ? Double(totalConfidence) / Double(count) : 0.0

    return PageResult(lines: lines, confidence: avg, bookPage: detectBookPage(in: lines))
}

// MARK: - Arguments

guard CommandLine.arguments.count >= 3 else {
    printError("Usage: swift ocr_pdf.swift <input.pdf> <output.json> [dpi]")
    exit(1)
}

let pdfURL = URL(fileURLWithPath: CommandLine.arguments[1])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])

let defaultDPI: CGFloat = 240.0
let dpi: CGFloat = {
    guard CommandLine.arguments.count >= 4 else { return defaultDPI }
    let raw = CommandLine.arguments[3]
    // A zero, negative or non-numeric DPI would yield an empty bitmap, so
    // fall back to the default and say so instead of failing on every page.
    guard let value = Double(raw), value.isFinite, value > 0 else {
        printError("Invalid DPI '\(raw)'; using \(defaultDPI).")
        return defaultDPI
    }
    return CGFloat(value)
}()

guard let pdfDoc = PDFDocument(url: pdfURL) else {
    printError("Could not open PDF at \(pdfURL.path)")
    exit(1)
}

let pageCount = pdfDoc.pageCount
print("PDF has \(pageCount) pages. Rendering at \(dpi) DPI.")

// MARK: - OCR loop

var results: [String: PageResult] = [:]
var failedPages = 0
let startTime = Date()

// Render at scale = dpi / 72 (PDF user space is 72 points per inch).
let scale: CGFloat = dpi / 72.0

for i in 0..<pageCount {
    // Each page allocates a large bitmap plus autoreleased framework objects;
    // draining a pool per page keeps memory flat in a script with no run loop.
    autoreleasepool {
        do {
            results[String(i)] = try ocrPage(at: i, in: pdfDoc, scale: scale)
        } catch {
            failedPages += 1
            printError("\(i): \(error)")
        }
    }

    if (i + 1) % 25 == 0 || (i + 1) == pageCount {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(i + 1) / max(elapsed, 0.001)
        let remaining = Double(pageCount - (i + 1)) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f pg/s eta %.0fs", i + 1, pageCount, rate, remaining))
    }
}

if failedPages > 0 {
    printError("\(failedPages) page(s) failed and were omitted from the output.")
}

// MARK: - Output

let encoder = JSONEncoder()
encoder.outputFormatting = [.sortedKeys]
do {
    let data = try encoder.encode(results)
    // Atomic write: an interrupted run never leaves a truncated JSON file.
    try data.write(to: outputURL, options: .atomic)
    print("Wrote \(results.count) pages to \(outputURL.path)")
} catch {
    printError("Error writing output: \(error)")
    exit(1)
}