Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
177
Conjuga/Scripts/textbook/repair_quarantined.swift
Normal file
177
Conjuga/Scripts/textbook/repair_quarantined.swift
Normal file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env swift
//
// Re-OCR the images referenced in quarantined_cards.json using Vision with
// bounding-box info, then pair lines by column position (left = Spanish,
// right = English) instead of by document read order.
//
// Output: repaired_cards.json — {"byImage": {"f0142-02.jpg": [{"es":..., "en":...}, ...]}}

import Foundation
import Vision
import AppKit

// Three positional arguments are required after the script name.
let args = CommandLine.arguments
guard args.count >= 4 else {
    print("Usage: swift repair_quarantined.swift <quarantined.json> <epub_oebps_dir> <output.json>")
    exit(1)
}

let quarantinedURL = URL(fileURLWithPath: args[1])
let imageDir = URL(fileURLWithPath: args[2])
let outputURL = URL(fileURLWithPath: args[3])

// Load the quarantine report; only its "cards" array is needed here.
guard let data = try? Data(contentsOf: quarantinedURL),
      let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
      let cards = json["cards"] as? [[String: Any]] else {
    print("Could not load \(quarantinedURL.path)")
    exit(1)
}

// Each card names the page image it came from; de-duplicate so every
// image is only OCR'd once.
let uniqueImages = Set(cards.compactMap { $0["sourceImage"] as? String })
print("Unique images to re-OCR: \(uniqueImages.count)")
|
||||
|
||||
/// One text line recognized by Vision, with its normalized page position.
/// Positions are converted to a top-left origin (see `recognizeLines`).
struct RecognizedLine {
    let text: String
    let cx: CGFloat // center X (normalized 0..1)
    let cy: CGFloat // center Y (normalized 0..1 from top)
    let confidence: Float // Vision top-candidate confidence for this line
}
|
||||
|
||||
/// A matched Spanish/English vocabulary pair extracted from one table row.
struct Pair: Encodable {
    var es: String // Spanish side (left column, unless heuristics swapped it)
    var en: String // English side (right column, unless heuristics swapped it)
    var confidence: Double // average Vision confidence across the row's lines
}
|
||||
|
||||
/// Per-image OCR outcome: the repaired pairs plus pairing diagnostics.
struct ImageResult: Encodable {
    var pairs: [Pair]
    var lineCount: Int // total lines Vision recognized before pairing
    var strategy: String // pairing outcome tag: "row-pair", "no-rows", or "empty"
}
|
||||
|
||||
/// Heuristic language tag for an OCR'd line.
/// Returns "es" when the line contains any Spanish-specific character or
/// begins with a Spanish article, "en" when it begins with a common English
/// starter word, and "?" when neither heuristic fires.
func classify(_ s: String) -> String {
    let lowered = s.lowercased()

    // Any accented vowel, ñ/ü, or inverted punctuation is a strong ES signal.
    let spanishMarks = "áéíóúñü¿¡"
    if lowered.contains(where: { spanishMarks.contains($0) }) {
        return "es"
    }

    // Otherwise decide from the first whitespace-separated word, if any.
    guard let firstWord = lowered.split(separator: " ").first else { return "?" }
    switch String(firstWord) {
    case "el", "la", "los", "las", "un", "una", "unos", "unas":
        return "es"
    case "the", "a", "an", "to", "my", "his", "her", "our", "their":
        return "en"
    default:
        return "?"
    }
}
|
||||
|
||||
/// Runs Vision accurate-mode text recognition over one image and returns every
/// non-empty recognized line with its normalized center point and confidence.
/// Returns an empty array if the Vision request throws.
func recognizeLines(cgImage: CGImage) -> [RecognizedLine] {
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    if #available(macOS 13.0, *) {
        request.automaticallyDetectsLanguage = true
    }

    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    guard (try? handler.perform([request])) != nil else { return [] }

    return (request.results ?? []).compactMap { obs -> RecognizedLine? in
        guard let candidate = obs.topCandidates(1).first else { return nil }
        let text = candidate.string.trimmingCharacters(in: .whitespaces)
        guard !text.isEmpty else { return nil }
        // Vision's boundingBox is normalized with its origin at the lower
        // left; flip Y so cy grows downward from the top of the page.
        let box = obs.boundingBox
        return RecognizedLine(
            text: text,
            cx: box.midX,
            cy: 1.0 - box.midY,
            confidence: candidate.confidence
        )
    }
}
|
||||
|
||||
/// Pairs OCR lines into (Spanish, English) tuples by page geometry.
///
/// Lines are first clustered into rows by vertical proximity: a fixed
/// tolerance of 1.5% of page height, measured against the previously
/// clustered line (so closely spaced lines chain into one row). Each row
/// with at least two lines is then split at its single widest horizontal
/// gap — left of the gap is treated as Spanish, right as English — with a
/// language-heuristic swap when both sides classify the other way.
///
/// Returns the pairs plus a strategy tag: "empty" for no input, "no-rows"
/// when nothing could be paired, "row-pair" otherwise.
func pairByPosition(_ lines: [RecognizedLine]) -> ([Pair], String) {
    if lines.isEmpty { return ([], "empty") }

    // --- Cluster into rows by Y proximity -------------------------------
    let rowTolerance: CGFloat = 0.015 // 1.5% of page height
    var rows: [[RecognizedLine]] = []
    var pending: [RecognizedLine] = []
    for line in lines.sorted(by: { $0.cy < $1.cy }) {
        if let previous = pending.last, abs(line.cy - previous.cy) > rowTolerance {
            rows.append(pending)
            pending = [line]
        } else {
            pending.append(line)
        }
    }
    if !pending.isEmpty { rows.append(pending) }

    // --- Pair each row left/right at its widest X gap -------------------
    var pairs: [Pair] = []
    for row in rows where row.count >= 2 {
        let byX = row.sorted { $0.cx < $1.cx }

        // Locate the split point: the largest horizontal gap in the row.
        var splitAt = 1
        var widest: CGFloat = 0
        for i in 1..<byX.count where byX[i].cx - byX[i - 1].cx > widest {
            widest = byX[i].cx - byX[i - 1].cx
            splitAt = i
        }

        let leftLines = Array(byX[..<splitAt])
        let rightLines = Array(byX[splitAt...])
        let leftText = leftLines.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces)
        let rightText = rightLines.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces)
        guard !leftText.isEmpty, !rightText.isEmpty else { continue }

        // Default orientation is left = Spanish; swap only when both sides'
        // language heuristics say we have it backwards.
        let swapped = classify(leftText) == "en" && classify(rightText) == "es"
        let es = swapped ? rightText : leftText
        let en = swapped ? leftText : rightText

        let confidenceSum = (leftLines + rightLines).reduce(Float(0)) { $0 + $1.confidence }
        let avgConfidence = confidenceSum / Float(leftLines.count + rightLines.count)
        pairs.append(Pair(es: es, en: en, confidence: Double(avgConfidence)))
    }

    return (pairs, pairs.isEmpty ? "no-rows" : "row-pair")
}
|
||||
|
||||
// Re-OCR every referenced image (in sorted order for stable output/logs)
// and collect the repaired pairs per image.
var results: [String: ImageResult] = [:]

for name in uniqueImages.sorted() {
    let url = imageDir.appendingPathComponent(name)
    // NSImage -> TIFF -> bitmap rep -> CGImage; skip anything unreadable.
    guard let image = NSImage(contentsOf: url),
          let tiffData = image.tiffRepresentation,
          let bitmap = NSBitmapImageRep(data: tiffData),
          let cgImage = bitmap.cgImage else {
        print("\(name): could not load")
        continue
    }
    let lines = recognizeLines(cgImage: cgImage)
    let (pairs, strategy) = pairByPosition(lines)
    results[name] = ImageResult(pairs: pairs, lineCount: lines.count, strategy: strategy)
    print("\(name): \(lines.count) lines -> \(pairs.count) pairs via \(strategy)")
}
|
||||
|
||||
/// Top-level JSON payload written to the output path.
struct Output: Encodable {
    var byImage: [String: ImageResult]
    var totalPairs: Int
}

let output = Output(
    byImage: results,
    totalPairs: results.values.reduce(0) { $0 + $1.pairs.count }
)

// Pretty-printed with sorted keys so reruns produce diffable output.
let encoder = JSONEncoder()
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
try encoder.encode(output).write(to: outputURL)
print("Wrote \(output.totalPairs) repaired pairs to \(outputURL.path)")
|
||||
Reference in New Issue
Block a user