Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
177
Conjuga/Scripts/textbook/repair_quarantined.swift
Normal file
177
Conjuga/Scripts/textbook/repair_quarantined.swift
Normal file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env swift
//
// Re-OCR the images referenced in quarantined_cards.json using Vision with
// bounding-box info, then pair lines by column position (left = Spanish,
// right = English) instead of by document read order.
//
// Output: repaired_cards.json — {"byImage": {"f0142-02.jpg": [{"es":..., "en":...}, ...]}}

import Foundation
import Vision
import AppKit

// Three positional arguments are required after the script name.
let args = CommandLine.arguments
guard args.count >= 4 else {
    print("Usage: swift repair_quarantined.swift <quarantined.json> <epub_oebps_dir> <output.json>")
    exit(1)
}

let quarantinedURL = URL(fileURLWithPath: args[1])
let imageDir = URL(fileURLWithPath: args[2])
let outputURL = URL(fileURLWithPath: args[3])

// Load the quarantine report; only its "cards" array is needed here.
guard let data = try? Data(contentsOf: quarantinedURL),
      let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
      let cards = json["cards"] as? [[String: Any]] else {
    print("Could not load \(quarantinedURL.path)")
    exit(1)
}

// Each card names the page image it came from; de-duplicate so every
// image is only OCR'd once.
let uniqueImages = Set(cards.compactMap { $0["sourceImage"] as? String })
print("Unique images to re-OCR: \(uniqueImages.count)")
|
||||
|
||||
/// One text line recognized by Vision, with its normalized page position.
/// Positions are converted to a top-left origin (see `recognizeLines`).
struct RecognizedLine {
    let text: String
    let cx: CGFloat // center X (normalized 0..1)
    let cy: CGFloat // center Y (normalized 0..1 from top)
    let confidence: Float // Vision top-candidate confidence for this line
}
|
||||
|
||||
/// A matched Spanish/English vocabulary pair extracted from one table row.
struct Pair: Encodable {
    var es: String // Spanish side (left column, unless heuristics swapped it)
    var en: String // English side (right column, unless heuristics swapped it)
    var confidence: Double // average Vision confidence across the row's lines
}
|
||||
|
||||
/// Per-image OCR outcome: the repaired pairs plus pairing diagnostics.
struct ImageResult: Encodable {
    var pairs: [Pair]
    var lineCount: Int // total lines Vision recognized before pairing
    var strategy: String // pairing outcome tag: "row-pair", "no-rows", or "empty"
}
|
||||
|
||||
/// Heuristic language tag for an OCR'd line.
/// Returns "es" when the line contains any Spanish-specific character or
/// begins with a Spanish article, "en" when it begins with a common English
/// starter word, and "?" when neither heuristic fires.
func classify(_ s: String) -> String {
    let lowered = s.lowercased()

    // Any accented vowel, ñ/ü, or inverted punctuation is a strong ES signal.
    let spanishMarks = "áéíóúñü¿¡"
    if lowered.contains(where: { spanishMarks.contains($0) }) {
        return "es"
    }

    // Otherwise decide from the first whitespace-separated word, if any.
    guard let firstWord = lowered.split(separator: " ").first else { return "?" }
    switch String(firstWord) {
    case "el", "la", "los", "las", "un", "una", "unos", "unas":
        return "es"
    case "the", "a", "an", "to", "my", "his", "her", "our", "their":
        return "en"
    default:
        return "?"
    }
}
|
||||
|
||||
/// Runs Vision accurate-mode text recognition over one image and returns every
/// non-empty recognized line with its normalized center point and confidence.
/// Returns an empty array if the Vision request throws.
func recognizeLines(cgImage: CGImage) -> [RecognizedLine] {
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    if #available(macOS 13.0, *) {
        request.automaticallyDetectsLanguage = true
    }

    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    guard (try? handler.perform([request])) != nil else { return [] }

    return (request.results ?? []).compactMap { obs -> RecognizedLine? in
        guard let candidate = obs.topCandidates(1).first else { return nil }
        let text = candidate.string.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }
        // Vision's boundingBox is normalized with its origin at the lower
        // left; flip Y so cy grows downward from the top of the page.
        let box = obs.boundingBox
        return RecognizedLine(
            text: text,
            cx: box.midX,
            cy: 1.0 - box.midY,
            confidence: candidate.confidence
        )
    }
}
|
||||
|
||||
/// Pairs OCR lines into (Spanish, English) tuples by page geometry.
///
/// Lines are first clustered into rows by vertical proximity: a fixed
/// tolerance of 1.5% of page height, measured against the previously
/// clustered line (so closely spaced lines chain into one row). Each row
/// with at least two lines is then split at its single widest horizontal
/// gap — left of the gap is treated as Spanish, right as English — with a
/// language-heuristic swap when both sides classify the other way.
///
/// Returns the pairs plus a strategy tag: "empty" for no input, "no-rows"
/// when nothing could be paired, "row-pair" otherwise.
func pairByPosition(_ lines: [RecognizedLine]) -> ([Pair], String) {
    if lines.isEmpty { return ([], "empty") }

    // --- Cluster into rows by Y proximity -------------------------------
    let rowTolerance: CGFloat = 0.015 // 1.5% of page height
    var rows: [[RecognizedLine]] = []
    var pending: [RecognizedLine] = []
    for line in lines.sorted(by: { $0.cy < $1.cy }) {
        if let previous = pending.last, abs(line.cy - previous.cy) > rowTolerance {
            rows.append(pending)
            pending = [line]
        } else {
            pending.append(line)
        }
    }
    if !pending.isEmpty { rows.append(pending) }

    // --- Pair each row left/right at its widest X gap -------------------
    var pairs: [Pair] = []
    for row in rows where row.count >= 2 {
        let byX = row.sorted { $0.cx < $1.cx }

        // Locate the split point: the largest horizontal gap in the row.
        var splitAt = 1
        var widest: CGFloat = 0
        for i in 1..<byX.count where byX[i].cx - byX[i - 1].cx > widest {
            widest = byX[i].cx - byX[i - 1].cx
            splitAt = i
        }

        let leftLines = Array(byX[..<splitAt])
        let rightLines = Array(byX[splitAt...])
        let leftText = leftLines.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces)
        let rightText = rightLines.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces)
        guard !leftText.isEmpty, !rightText.isEmpty else { continue }

        // Default orientation is left = Spanish; swap only when both sides'
        // language heuristics say we have it backwards.
        let swapped = classify(leftText) == "en" && classify(rightText) == "es"
        let es = swapped ? rightText : leftText
        let en = swapped ? leftText : rightText

        let confidenceSum = (leftLines + rightLines).reduce(Float(0)) { $0 + $1.confidence }
        let avgConfidence = confidenceSum / Float(leftLines.count + rightLines.count)
        pairs.append(Pair(es: es, en: en, confidence: Double(avgConfidence)))
    }

    return (pairs, pairs.isEmpty ? "no-rows" : "row-pair")
}
|
||||
|
||||
// Re-OCR every referenced image (in sorted order for stable output/logs)
// and collect the repaired pairs per image.
var results: [String: ImageResult] = [:]

for name in uniqueImages.sorted() {
    let url = imageDir.appendingPathComponent(name)
    // NSImage -> TIFF -> bitmap rep -> CGImage; skip anything unreadable.
    guard let image = NSImage(contentsOf: url),
          let tiffData = image.tiffRepresentation,
          let bitmap = NSBitmapImageRep(data: tiffData),
          let cgImage = bitmap.cgImage else {
        print("\(name): could not load")
        continue
    }
    let lines = recognizeLines(cgImage: cgImage)
    let (pairs, strategy) = pairByPosition(lines)
    results[name] = ImageResult(pairs: pairs, lineCount: lines.count, strategy: strategy)
    print("\(name): \(lines.count) lines -> \(pairs.count) pairs via \(strategy)")
}
|
||||
|
||||
/// Top-level JSON payload written to the output path.
struct Output: Encodable {
    var byImage: [String: ImageResult]
    var totalPairs: Int
}

let output = Output(
    byImage: results,
    totalPairs: results.values.reduce(0) { $0 + $1.pairs.count }
)

// Pretty-printed with sorted keys so reruns produce diffable output.
let encoder = JSONEncoder()
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
try encoder.encode(output).write(to: outputURL)
print("Wrote \(output.totalPairs) repaired pairs to \(outputURL.path)")
|
||||
Reference in New Issue
Block a user