Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
178 lines
6.6 KiB
Swift
178 lines
6.6 KiB
Swift
#!/usr/bin/env swift
|
|
// Re-OCR the images referenced in quarantined_cards.json using Vision with
|
|
// bounding-box info, then pair lines by column position (left = Spanish,
|
|
// right = English) instead of by document read order.
|
|
//
|
|
// Output: repaired_cards.json — {"byImage": {"f0142-02.jpg": [{"es":..., "en":...}, ...]}}
|
|
|
|
import Foundation
|
|
import Vision
|
|
import AppKit
|
|
|
|
// Require three positional arguments: input JSON, EPUB image dir, output JSON.
if CommandLine.arguments.count < 4 {
    print("Usage: swift repair_quarantined.swift <quarantined.json> <epub_oebps_dir> <output.json>")
    exit(1)
}
|
|
|
|
// Positional arguments: quarantined-cards JSON, directory containing the EPUB
// images, and the path the repaired output should be written to.
let quarantinedURL = URL(fileURLWithPath: CommandLine.arguments[1])
let imageDir = URL(fileURLWithPath: CommandLine.arguments[2])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[3])

// The input file must be a JSON object with a top-level "cards" array of objects.
guard let data = try? Data(contentsOf: quarantinedURL),
      let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
      let cards = json["cards"] as? [[String: Any]] else {
    print("Could not load \(quarantinedURL.path)")
    exit(1)
}

// Collect the distinct image filenames referenced by the quarantined cards.
var uniqueImages = Set(cards.compactMap { $0["sourceImage"] as? String })
print("Unique images to re-OCR: \(uniqueImages.count)")
|
|
|
|
/// One line of Vision OCR output together with its position on the page.
struct RecognizedLine {
    let text: String // trimmed recognized text of the line
    let cx: CGFloat // center X (normalized 0..1)
    let cy: CGFloat // center Y (normalized 0..1 from top)
    let confidence: Float // Vision's confidence for the top candidate
}
|
|
|
|
/// A repaired vocabulary pair: Spanish term and its English gloss.
struct Pair: Encodable {
    var es: String // Spanish-side text (left column unless swapped by classify)
    var en: String // English-side text
    var confidence: Double // mean OCR confidence across the source lines
}
|
|
|
|
/// Per-image repair outcome serialized into the output JSON.
struct ImageResult: Encodable {
    var pairs: [Pair] // paired (es, en) rows extracted from the image
    var lineCount: Int // raw number of OCR lines Vision returned
    var strategy: String // pairing outcome label: "row-pair", "no-rows", or "empty"
}
|
|
|
|
/// Heuristic language tag for an OCR'd line.
///
/// Returns "es" when the text contains a Spanish-only character or opens with
/// a Spanish article, "en" when it opens with a common English function word,
/// and "?" when neither signal is present.
func classify(_ s: String) -> String {
    let lowered = s.lowercased()

    // Characters that only occur in Spanish text (plus inverted punctuation).
    let spanishOnly: Set<Character> = ["á", "é", "í", "ó", "ú", "ñ", "ü", "¿", "¡"]
    for ch in lowered where spanishOnly.contains(ch) {
        return "es"
    }

    // Fall back to the first word: articles/openers are a strong hint.
    guard let firstWord = lowered.split(separator: " ").first.map(String.init) else {
        return "?"
    }
    let spanishArticles: Set<String> = ["el", "la", "los", "las", "un", "una", "unos", "unas"]
    let englishOpeners: Set<String> = ["the", "a", "an", "to", "my", "his", "her", "our", "their"]
    if spanishArticles.contains(firstWord) { return "es" }
    if englishOpeners.contains(firstWord) { return "en" }
    return "?"
}
|
|
|
|
/// Runs Vision text recognition on `cgImage` and returns every non-empty line
/// with its normalized center point (top-left origin) and confidence.
/// Returns an empty array if the Vision request fails.
func recognizeLines(cgImage: CGImage) -> [RecognizedLine] {
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    if #available(macOS 13.0, *) {
        request.automaticallyDetectsLanguage = true
    }

    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    do {
        try handler.perform([request])
    } catch {
        return []
    }

    return (request.results ?? []).compactMap { observation in
        guard let candidate = observation.topCandidates(1).first else { return nil }
        let text = candidate.string.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }
        // Vision's boundingBox is normalized with origin at the lower-left
        // corner, so flip Y to get a top-origin coordinate.
        let box = observation.boundingBox
        return RecognizedLine(
            text: text,
            cx: box.origin.x + box.width / 2,
            cy: 1.0 - (box.origin.y + box.height / 2),
            confidence: candidate.confidence
        )
    }
}
|
|
|
|
/// Pairs OCR lines into (Spanish, English) couples using page position:
/// lines are clustered into rows by vertical proximity, and each row is then
/// split at its single widest horizontal gap — left side Spanish, right side
/// English (swapped when the language heuristic disagrees).
/// Returns the pairs plus a strategy label for diagnostics.
func pairByPosition(_ lines: [RecognizedLine]) -> ([Pair], String) {
    guard !lines.isEmpty else { return ([], "empty") }

    // Walk lines top-to-bottom; start a new row whenever the vertical jump
    // from the previous line exceeds a fixed tolerance (1.5% of page height).
    let verticalTolerance: CGFloat = 0.015
    var rows: [[RecognizedLine]] = []
    var rowInProgress: [RecognizedLine] = []
    for line in lines.sorted(by: { $0.cy < $1.cy }) {
        if let previous = rowInProgress.last, abs(line.cy - previous.cy) > verticalTolerance {
            rows.append(rowInProgress)
            rowInProgress = [line]
        } else {
            rowInProgress.append(line)
        }
    }
    if !rowInProgress.isEmpty { rows.append(rowInProgress) }

    var pairs: [Pair] = []
    for row in rows where row.count >= 2 {
        // Order the row left-to-right and cut it at the widest x-gap.
        let byX = row.sorted { $0.cx < $1.cx }
        var widestGap: CGFloat = 0
        var cutIndex = 1
        for i in 1..<byX.count where byX[i].cx - byX[i - 1].cx > widestGap {
            widestGap = byX[i].cx - byX[i - 1].cx
            cutIndex = i
        }

        let left = Array(byX[..<cutIndex])
        let right = Array(byX[cutIndex...])
        let leftText = left.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces)
        let rightText = right.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces)
        guard !leftText.isEmpty, !rightText.isEmpty else { continue }

        // Assume left = Spanish, right = English; swap only when the language
        // heuristic says both sides are backwards.
        var spanish = leftText
        var english = rightText
        if classify(spanish) == "en" && classify(english) == "es" {
            swap(&spanish, &english)
        }

        let totalConfidence = (left + right).reduce(Float(0)) { $0 + $1.confidence }
        let meanConfidence = totalConfidence / Float(left.count + right.count)
        pairs.append(Pair(es: spanish, en: english, confidence: Double(meanConfidence)))
    }

    return (pairs, pairs.isEmpty ? "no-rows" : "row-pair")
}
|
|
|
|
var results: [String: ImageResult] = [:]

// Re-OCR each referenced image (sorted for deterministic output/logging).
for name in uniqueImages.sorted() {
    let imageURL = imageDir.appendingPathComponent(name)

    // Decode via AppKit: NSImage -> TIFF data -> bitmap rep -> CGImage.
    guard let image = NSImage(contentsOf: imageURL),
          let tiffData = image.tiffRepresentation,
          let bitmap = NSBitmapImageRep(data: tiffData),
          let cgImage = bitmap.cgImage else {
        print("\(name): could not load")
        continue
    }

    let recognized = recognizeLines(cgImage: cgImage)
    let (pairs, strategy) = pairByPosition(recognized)
    results[name] = ImageResult(pairs: pairs, lineCount: recognized.count, strategy: strategy)
    print("\(name): \(recognized.count) lines -> \(pairs.count) pairs via \(strategy)")
}
|
|
|
|
/// Envelope written to disk: per-image repair results plus a convenience total.
struct Output: Encodable {
    var byImage: [String: ImageResult] // keyed by source image filename
    var totalPairs: Int // sum of pairs across all images
}

let output = Output(
    byImage: results,
    totalPairs: results.values.reduce(0) { $0 + $1.pairs.count }
)

let enc = JSONEncoder()
enc.outputFormatting = [.prettyPrinted, .sortedKeys]
// A bare top-level `try` would abort the script with an unhandled-error trap;
// catch encode/write failures and exit with a readable message instead.
do {
    try enc.encode(output).write(to: outputURL)
} catch {
    print("Could not write \(outputURL.path): \(error)")
    exit(1)
}
print("Wrote \(output.totalPairs) repaired pairs to \(outputURL.path)")
|