Files
Spanish/Conjuga/Scripts/textbook/repair_quarantined.swift
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

178 lines
6.6 KiB
Swift

#!/usr/bin/env swift
// Re-OCR the images referenced in quarantined_cards.json using Vision with
// bounding-box info, then pair lines by column position (left = Spanish,
// right = English) instead of by document read order.
//
// Output: repaired_cards.json
//   {"byImage": {"f0142-02.jpg": {"lineCount": ..., "pairs": [{"confidence": ..., "en": ..., "es": ...}, ...], "strategy": ...}},
//    "totalPairs": ...}
import Foundation
import Vision
import AppKit
// Three positional arguments are required after the script name.
let cliArguments = CommandLine.arguments
if cliArguments.count < 4 {
    print("Usage: swift repair_quarantined.swift <quarantined.json> <epub_oebps_dir> <output.json>")
    exit(1)
}

let quarantinedURL = URL(fileURLWithPath: cliArguments[1])
let imageDir = URL(fileURLWithPath: cliArguments[2])
let outputURL = URL(fileURLWithPath: cliArguments[3])

// The quarantine file must decode to a JSON object whose "cards" value is an
// array of objects; any read or parse failure ends the run.
guard
    let data = try? Data(contentsOf: quarantinedURL),
    let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
    let cards = json["cards"] as? [[String: Any]]
else {
    print("Could not load \(quarantinedURL.path)")
    exit(1)
}

// Collect each referenced image name once, however many cards point at it.
// Cards without a string "sourceImage" are ignored.
let uniqueImages = Set(cards.compactMap { $0["sourceImage"] as? String })
print("Unique images to re-OCR: \(uniqueImages.count)")
/// One line of OCR text together with where it sits in the image.
struct RecognizedLine {
    /// Recognized text.
    let text: String
    /// Horizontal center of the line's bounding box, normalized to 0..1.
    let cx: CGFloat
    /// Vertical center of the line's bounding box, normalized to 0..1 and
    /// measured from the top of the image.
    let cy: CGFloat
    /// Confidence reported for the recognized text.
    let confidence: Float
}
/// A repaired vocabulary entry: Spanish text, its English counterpart, and
/// a confidence score for the pairing.
struct Pair: Encodable {
    /// Spanish side of the entry.
    var es: String
    /// English side of the entry.
    var en: String
    /// Confidence score stored with the pair.
    var confidence: Double
}
/// Everything recorded for a single re-OCR'd image.
struct ImageResult: Encodable {
    /// Spanish/English pairs recovered from the image.
    var pairs: [Pair]
    /// Number of OCR lines the image produced.
    var lineCount: Int
    /// Label describing how the pairs were obtained.
    var strategy: String
}
/// Makes a rough language guess for a line of text.
///
/// - Parameter s: The text to inspect; matching is case-insensitive.
/// - Returns: `"es"` when the text contains a Spanish-specific character or
///   begins with a Spanish article, `"en"` when it begins with one of a small
///   set of English lead-in words, and `"?"` otherwise.
func classify(_ s: String) -> String {
    let normalized = s.lowercased()

    // Any accented vowel, ñ, ü, or inverted punctuation settles it immediately.
    let spanishMarkers = "áéíóúñü¿¡"
    for character in normalized where spanishMarkers.contains(character) {
        return "es"
    }

    // Otherwise decide on the first space-separated word alone.
    guard let leadingWord = normalized.split(separator: " ").first else {
        return "?"
    }
    switch String(leadingWord) {
    case "el", "la", "los", "las", "un", "una", "unos", "unas":
        return "es"
    case "the", "a", "an", "to", "my", "his", "her", "our", "their":
        return "en"
    default:
        return "?"
    }
}
/// Runs Vision text recognition on an image and returns its recognized lines.
///
/// - Parameter cgImage: The image to OCR.
/// - Returns: One entry per observation that has a non-empty top candidate
///   (after trimming whitespace), carrying the center of its bounding box in
///   normalized coordinates with Y measured from the top, and the candidate's
///   confidence. Returns an empty array when the Vision request throws; the
///   error is written to standard error so a failed request is not mistaken
///   for an image that simply contains no text.
func recognizeLines(cgImage: CGImage) -> [RecognizedLine] {
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    if #available(macOS 13.0, *) {
        request.automaticallyDetectsLanguage = true
    }

    do {
        try handler.perform([request])
    } catch {
        // Still best-effort (the caller carries on with no lines), but say why.
        FileHandle.standardError.write(Data("warning: text recognition failed: \(error)\n".utf8))
        return []
    }

    return (request.results ?? []).compactMap { observation -> RecognizedLine? in
        guard let best = observation.topCandidates(1).first else { return nil }
        let text = best.string.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }
        // Vision's boundingBox is normalized with origin at lower-left, so the
        // vertical center is flipped to a top-origin coordinate.
        let box = observation.boundingBox
        let centerX = box.origin.x + box.width / 2
        let centerYFromTop = 1.0 - (box.origin.y + box.height / 2)
        return RecognizedLine(text: text, cx: centerX, cy: centerYFromTop, confidence: best.confidence)
    }
}
/// Pairs OCR lines into Spanish/English entries using their position in the image.
///
/// Lines are walked top to bottom and grouped into rows: a line opens a new row
/// when its center Y differs from the previous line's by more than a fixed
/// tolerance. Every row holding at least two lines is ordered left to right and
/// split at its widest horizontal gap. Text left of the gap is taken as Spanish
/// and text right of it as English, unless `classify` reports the left side as
/// English and the right side as Spanish, in which case the two are swapped.
///
/// - Parameter lines: Recognized lines with normalized center coordinates.
/// - Returns: The pairs found and a strategy label: `"empty"` for no input,
///   `"no-rows"` when no row produced a pair, and `"row-pair"` otherwise.
func pairByPosition(_ lines: [RecognizedLine]) -> ([Pair], String) {
    if lines.isEmpty { return ([], "empty") }

    // Fixed vertical tolerance: 1.5% of the image height (coordinates are normalized).
    let rowTolerance: CGFloat = 0.015

    // Group lines into rows, comparing each line with the one just before it.
    var rows: [[RecognizedLine]] = []
    for line in lines.sorted(by: { $0.cy < $1.cy }) {
        let startsNewRow: Bool
        if let previous = rows.last?.last {
            startsNewRow = abs(line.cy - previous.cy) > rowTolerance
        } else {
            startsNewRow = true
        }
        if startsNewRow {
            rows.append([line])
        } else {
            rows[rows.count - 1].append(line)
        }
    }

    var pairs: [Pair] = []
    for row in rows where row.count >= 2 {
        let ordered = row.sorted { $0.cx < $1.cx }

        // Split just before the line that follows the widest gap between
        // neighbouring centers. The first widest gap wins; with no positive
        // gap the split stays after the first line.
        var splitIndex = 1
        var widestGap: CGFloat = 0
        for (offset, (lhs, rhs)) in zip(ordered, ordered.dropFirst()).enumerated() {
            let gap = rhs.cx - lhs.cx
            if gap > widestGap {
                widestGap = gap
                splitIndex = offset + 1
            }
        }

        let leftText = ordered[..<splitIndex]
            .map(\.text)
            .joined(separator: " ")
            .trimmingCharacters(in: .whitespaces)
        let rightText = ordered[splitIndex...]
            .map(\.text)
            .joined(separator: " ")
            .trimmingCharacters(in: .whitespaces)
        guard !leftText.isEmpty, !rightText.isEmpty else { continue }

        // Swap only when both sides positively indicate the reverse orientation.
        let isReversed = classify(leftText) == "en" && classify(rightText) == "es"
        let (es, en) = isReversed ? (rightText, leftText) : (leftText, rightText)

        // Mean confidence over every line in the row.
        let confidenceSum = ordered.reduce(Float(0)) { $0 + $1.confidence }
        let meanConfidence = confidenceSum / Float(ordered.count)
        pairs.append(Pair(es: es, en: en, confidence: Double(meanConfidence)))
    }

    return (pairs, pairs.isEmpty ? "no-rows" : "row-pair")
}
// OCR every referenced image in name order and keep the outcome per image.
// Images that cannot be loaded are reported and left out of the results.
var results: [String: ImageResult] = [:]
for name in uniqueImages.sorted() {
    let imageURL = imageDir.appendingPathComponent(name)

    // NSImage -> TIFF data -> bitmap representation -> CGImage.
    guard
        let image = NSImage(contentsOf: imageURL),
        let tiffData = image.tiffRepresentation,
        let bitmap = NSBitmapImageRep(data: tiffData),
        let cgImage = bitmap.cgImage
    else {
        print("\(name): could not load")
        continue
    }

    let recognized = recognizeLines(cgImage: cgImage)
    let outcome = pairByPosition(recognized)
    results[name] = ImageResult(pairs: outcome.0, lineCount: recognized.count, strategy: outcome.1)
    print("\(name): \(recognized.count) lines -> \(outcome.0.count) pairs via \(outcome.1)")
}
/// Top-level shape of the JSON document this script writes.
struct Output: Encodable {
    /// Result for each processed image, keyed by image name.
    var byImage: [String: ImageResult]
    /// Total number of pairs across all images.
    var totalPairs: Int
}
// Gather the per-image results with a grand total, then write them as
// pretty-printed JSON with sorted keys.
let output = Output(
    byImage: results,
    totalPairs: results.values.map(\.pairs.count).reduce(0, +)
)

let encoder = JSONEncoder()
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
let encodedOutput = try encoder.encode(output)
try encodedOutput.write(to: outputURL)
print("Wrote \(output.totalPairs) repaired pairs to \(outputURL.path)")