Files
Spanish/Conjuga/Scripts/textbook/ocr_images.swift
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

111 lines
3.7 KiB
Swift

#!/usr/bin/env swift
// OCR every JPG in the given input directory using the macOS Vision framework.
// Output: JSON map of { "<filename>": { "lines": [...], "confidence": Double } }
//
// Usage: swift ocr_images.swift <input_dir> <output_json>
// Example: swift ocr_images.swift ../../../epub_extract/OEBPS ocr.json
import Foundation
import Vision
import AppKit
// ── Argument parsing ─────────────────────────────────────────────────
// Fix: the usage/error text now goes to stderr so it never pollutes
// stdout (the progress lines below intentionally remain on stdout and
// may be piped/teed by run_pipeline.sh).
guard CommandLine.arguments.count >= 3 else {
    FileHandle.standardError.write(Data("Usage: swift ocr_images.swift <input_dir> <output_json>\n".utf8))
    exit(1)
}
// Directory scanned recursively for images, and destination of the JSON map.
let inputDir = URL(fileURLWithPath: CommandLine.arguments[1])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])
// ── Discover candidate images ────────────────────────────────────────
// Skip images that are icons/inline markers, not real content.
let skipSubstrings = ["Common", "cover", "title"]
// Recognized raster formats produced by the EPUB extraction step.
let imageExtensions: Set<String> = ["jpg", "jpeg", "png"]
let fileManager = FileManager.default
guard let enumerator = fileManager.enumerator(at: inputDir, includingPropertiesForKeys: nil) else {
    print("Could not enumerate \(inputDir.path)")
    exit(1)
}
var jpgs: [URL] = []
for case let url as URL in enumerator {
    let name = url.lastPathComponent
    // Fix: match extensions case-insensitively. The previous
    // `hasSuffix(".jpg")`-style checks silently skipped files such as
    // "IMG01.JPG" or "Page3.PNG", which are common on macOS volumes.
    guard imageExtensions.contains(url.pathExtension.lowercased()) else { continue }
    if skipSubstrings.contains(where: { name.contains($0) }) { continue }
    jpgs.append(url)
}
// Deterministic processing order => stable progress output and stable
// (sorted-key) JSON across runs.
jpgs.sort { $0.lastPathComponent < $1.lastPathComponent }
print("Found \(jpgs.count) images to OCR")
/// OCR output for a single image: the recognized text lines plus the mean
/// top-candidate confidence across those lines (0 when no text was found).
/// Encoded verbatim into the output JSON map keyed by filename.
struct OCRResult: Encodable {
    let lines: [String]
    let confidence: Double
}
// ── OCR loop ─────────────────────────────────────────────────────────
// Filename -> OCR result for every successfully processed image.
var results: [String: OCRResult] = [:]
let total = jpgs.count
var processed = 0
let startTime = Date()
for url in jpgs {
    processed += 1
    let name = url.lastPathComponent
    // Fix: wrap each iteration in autoreleasepool so the autoreleased
    // TIFF / bitmap buffers created below are released per image rather
    // than accumulating for the whole run — without this, batches of
    // hundreds of images grow peak memory unboundedly.
    autoreleasepool {
        guard let nsImage = NSImage(contentsOf: url),
              let tiffData = nsImage.tiffRepresentation,
              let bitmap = NSBitmapImageRep(data: tiffData),
              let cgImage = bitmap.cgImage else {
            print("\(processed)/\(total) \(name) — could not load")
            return  // was `continue`; skips only this image
        }
        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate
        request.recognitionLanguages = ["es-ES", "es", "en-US"]
        request.usesLanguageCorrection = true
        // For the 2020 book, automaticallyDetectsLanguage helps with mixed content.
        if #available(macOS 13.0, *) {
            request.automaticallyDetectsLanguage = true
        }
        do {
            try handler.perform([request])
            let observations = request.results ?? []
            var lines: [String] = []
            var totalConfidence: Float = 0
            var count = 0
            // Keep only the best candidate per observed text region and drop
            // whitespace-only strings so they don't skew the average.
            for obs in observations {
                if let top = obs.topCandidates(1).first {
                    let s = top.string.trimmingCharacters(in: .whitespaces)
                    if !s.isEmpty {
                        lines.append(s)
                        totalConfidence += top.confidence
                        count += 1
                    }
                }
            }
            // Mean confidence over kept lines; 0.0 when nothing was recognized.
            let avg = count > 0 ? Double(totalConfidence) / Double(count) : 0.0
            results[name] = OCRResult(lines: lines, confidence: avg)
        } catch {
            print("\(processed)/\(total) \(name) — error: \(error)")
        }
    }
    // Lightweight progress + ETA line every 50 images (and at the end).
    if processed % 50 == 0 || processed == total {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(processed) / max(elapsed, 0.001)
        let remaining = Double(total - processed) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f img/s eta %.0fs", processed, total, rate, remaining))
    }
}
// ── Serialize and write results ──────────────────────────────────────
let encoder = JSONEncoder()
// sortedKeys keeps the output diffable/stable across runs.
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
do {
    let data = try encoder.encode(results)
    // Fix: .atomic writes to a temp file then renames, so an interrupted
    // run can never leave a truncated JSON file for the downstream
    // pipeline stages to choke on.
    try data.write(to: outputURL, options: .atomic)
    print("Wrote \(results.count) OCR entries to \(outputURL.path)")
} catch {
    // Fix: failure diagnostics go to stderr; exit code already signals failure.
    FileHandle.standardError.write(Data("Error writing output: \(error)\n".utf8))
    exit(1)
}