#!/usr/bin/env swift
// Spanish/Conjuga/Scripts/textbook/ocr_pdf.swift
// Rasterize each page of a PDF at high DPI and OCR it with Vision.
// Output: { "<pdfIndex>": { "lines": [...], "confidence": Double, "bookPage": Int? } }
//
// Usage: swift ocr_pdf.swift <pdf_path> <output_json> [dpi]
// Example: swift ocr_pdf.swift "book.pdf" pdf_ocr.json 240

import Foundation
import Vision
import AppKit
import Quartz

// MARK: - Command-line arguments

// Two positional arguments are required; the DPI is optional.
guard CommandLine.arguments.count >= 3 else {
    print("Usage: swift ocr_pdf.swift <pdf_path> <output_json> [dpi]")
    exit(1)
}

let pdfURL = URL(fileURLWithPath: CommandLine.arguments[1])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])

/// Rendering resolution in dots per inch.
///
/// Falls back to 240 when the argument is absent or unusable. `Double(_:)`
/// happily parses "nan", "inf", "0" and negative numbers, none of which can
/// produce a valid bitmap size, so only finite, strictly positive values are
/// accepted.
let dpi: CGFloat = {
    let fallback: CGFloat = 240.0
    guard CommandLine.arguments.count >= 4 else { return fallback }

    let raw = CommandLine.arguments[3]
    guard let parsed = Double(raw), parsed.isFinite, parsed > 0 else {
        print("Ignoring invalid dpi \"\(raw)\"; using \(fallback).")
        return fallback
    }
    return CGFloat(parsed)
}()

// MARK: - Open the document

guard let pdfDoc = PDFDocument(url: pdfURL) else {
    print("Could not open PDF at \(pdfURL.path)")
    exit(1)
}

let pageCount = pdfDoc.pageCount
print("PDF has \(pageCount) pages. Rendering at \(dpi) DPI.")

/// OCR output for a single PDF page; one value is stored per page in the
/// output JSON object.
struct PageResult: Encodable {
    /// Recognized text lines, trimmed, in the order Vision returned them.
    var lines: [String]
    /// Mean confidence of the recognized lines (0 when the page had none).
    var confidence: Double
    /// Printed page number detected on the page, if any; omitted from the
    /// JSON when `nil`.
    var bookPage: Int?

    /// JSON key names, spelled out so the output schema is explicit.
    private enum CodingKeys: String, CodingKey {
        case lines
        case confidence
        case bookPage
    }
}

// Per-page OCR results, keyed by the zero-based PDF page index as a string.
var results: [String: PageResult] = [:]
let startTime = Date()

// Render at scale = dpi / 72 (72 is default PDF DPI)
let scale: CGFloat = dpi / 72.0

// Bitmap settings are the same for every page, so build them once:
// 8-bit RGB with the fourth byte of each pixel ignored (no alpha).
let colorSpace = CGColorSpaceCreateDeviceRGB()
let bitmapInfo = CGImageAlphaInfo.noneSkipLast.rawValue

for i in 0..<pageCount {
    // Each page allocates a large bitmap plus framework temporaries. A script
    // has no run loop draining autoreleased objects between iterations, so
    // drain explicitly per page to keep peak memory bounded on long documents.
    autoreleasepool {
        guard let page = pdfDoc.page(at: i) else {
            print("\(i): could not load page")
            return
        }
        let pageBounds = page.bounds(for: .mediaBox)
        let scaledSize = CGSize(width: pageBounds.width * scale, height: pageBounds.height * scale)

        // `Int(_:)` traps on NaN, infinity, or out-of-range values; convert
        // defensively (still truncating toward zero) and skip unusable pages.
        guard let pixelWidth = Int(exactly: scaledSize.width.rounded(.towardZero)),
              let pixelHeight = Int(exactly: scaledSize.height.rounded(.towardZero)),
              pixelWidth > 0, pixelHeight > 0 else {
            print("\(i): invalid render size \(scaledSize)")
            return
        }

        // Render the page into a CGImage
        guard let context = CGContext(
            data: nil,
            width: pixelWidth,
            height: pixelHeight,
            bitsPerComponent: 8,
            bytesPerRow: 0,
            space: colorSpace,
            bitmapInfo: bitmapInfo
        ) else {
            print("\(i): could not create CGContext")
            return
        }
        // Paint a white background first so the page is OCR'd on an opaque canvas.
        context.setFillColor(CGColor(gray: 1.0, alpha: 1.0))
        context.fill(CGRect(origin: .zero, size: scaledSize))
        context.scaleBy(x: scale, y: scale)
        page.draw(with: .mediaBox, to: context)

        guard let cgImage = context.makeImage() else {
            print("\(i): could not create CGImage")
            return
        }

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate
        request.recognitionLanguages = ["es-ES", "es", "en-US"]
        request.usesLanguageCorrection = true
        if #available(macOS 13.0, *) {
            request.automaticallyDetectsLanguage = true
        }

        do {
            try handler.perform([request])

            // Keep the best candidate of each observation, dropping lines that
            // are empty after trimming.
            let recognized = (request.results ?? []).compactMap {
                observation -> (text: String, confidence: Float)? in
                guard let best = observation.topCandidates(1).first else { return nil }
                let text = best.string.trimmingCharacters(in: .whitespaces)
                guard !text.isEmpty else { return nil }
                return (text: text, confidence: best.confidence)
            }
            let lines = recognized.map { $0.text }
            let totalConfidence = recognized.reduce(Float(0)) { $0 + $1.confidence }
            let avg = recognized.isEmpty ? 0.0 : Double(totalConfidence) / Double(recognized.count)

            // Try to detect book page number: a short numeric line in the first
            // 3 or last 3 entries (typical page-number placement). Lines are
            // already trimmed, so they can be parsed directly.
            let candidates = Array(lines.prefix(3)) + Array(lines.suffix(3))
            let bookPage = candidates.lazy
                .compactMap { Int($0) }
                .first { (1...1000).contains($0) }

            results[String(i)] = PageResult(lines: lines, confidence: avg, bookPage: bookPage)
        } catch {
            print("\(i): \(error)")
        }
    }

    // Report progress every 25 pages and after the last page. This sits
    // outside the per-page work so a skipped page cannot suppress the report.
    if (i + 1) % 25 == 0 || (i + 1) == pageCount {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(i + 1) / max(elapsed, 0.001)
        let remaining = Double(pageCount - (i + 1)) / max(rate, 0.001)
        print(String(format: "%d/%d  %.1f pg/s  eta %.0fs", i + 1, pageCount, rate, remaining))
    }
}

// Serialize all page results as a single JSON object and write it to the
// requested output path. Keys are sorted so repeated runs produce output in a
// stable order.
let encoder = JSONEncoder()
encoder.outputFormatting = .sortedKeys

// Encoding and writing share one failure path, so capture both in a Result.
let writeOutcome = Result { try encoder.encode(results).write(to: outputURL) }
switch writeOutcome {
case .success:
    print("Wrote \(results.count) pages to \(outputURL.path)")
case .failure(let error):
    print("Error writing output: \(error)")
    exit(1)
}