Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
110
Conjuga/Scripts/textbook/ocr_images.swift
Normal file
110
Conjuga/Scripts/textbook/ocr_images.swift
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env swift
|
||||
// OCR every JPG in the given input directory using the macOS Vision framework.
|
||||
// Output: JSON map of { "<filename>": { "lines": [...], "confidence": Double } }
|
||||
//
|
||||
// Usage: swift ocr_images.swift <input_dir> <output_json>
|
||||
// Example: swift ocr_images.swift ../../../epub_extract/OEBPS ocr.json
|
||||
|
||||
import Foundation
|
||||
import Vision
|
||||
import AppKit
|
||||
|
||||
// Validate the command line: the script requires exactly two path arguments
// (an input directory of images and an output JSON destination).
let arguments = CommandLine.arguments
if arguments.count < 3 {
    print("Usage: swift ocr_images.swift <input_dir> <output_json>")
    exit(1)
}

// Resolve both paths up front so nothing later needs to touch CommandLine.
let inputDir = URL(fileURLWithPath: arguments[1])
let outputURL = URL(fileURLWithPath: arguments[2])
// Skip images that are icons/inline markers — not real content.
let skipSubstrings = ["Common", "cover", "title"]

let fileManager = FileManager.default
guard let enumerator = fileManager.enumerator(at: inputDir, includingPropertiesForKeys: nil) else {
    print("Could not enumerate \(inputDir.path)")
    exit(1)
}

// FIX: extension matching is now case-insensitive. The previous
// hasSuffix(".jpg") / ".jpeg" / ".png" checks silently skipped any file
// whose extension was upper- or mixed-case (e.g. scanner output named
// "page001.JPG").
let imageExtensions: Set<String> = ["jpg", "jpeg", "png"]

// Collect every raster image under the input tree, excluding known
// non-content assets named by `skipSubstrings`.
var jpgs: [URL] = []
for case let url as URL in enumerator {
    let name = url.lastPathComponent
    guard imageExtensions.contains(url.pathExtension.lowercased()) else { continue }
    if skipSubstrings.contains(where: { name.contains($0) }) { continue }
    jpgs.append(url)
}
// Sort by filename so the processing order (and progress log) is
// deterministic across runs regardless of enumeration order.
jpgs.sort { $0.lastPathComponent < $1.lastPathComponent }
print("Found \(jpgs.count) images to OCR")
// Per-image OCR output serialized into the result JSON map:
// { "<filename>": { "lines": [...], "confidence": Double } }.
struct OCRResult: Encodable {
    // Recognized text lines (top candidate per observation, whitespace-trimmed,
    // empty lines dropped) in the order Vision reported them.
    var lines: [String]
    // Mean top-candidate confidence over the recognized lines; 0.0 when the
    // image produced no non-empty lines.
    var confidence: Double
}
// Accumulated OCR results keyed by filename, plus progress-tracking state.
var results: [String: OCRResult] = [:]
let total = jpgs.count
var processed = 0
let startTime = Date()

for url in jpgs {
    processed += 1
    let name = url.lastPathComponent

    // Decode the image into a CGImage via an NSBitmapImageRep round trip;
    // unreadable files are logged and skipped.
    guard let image = NSImage(contentsOf: url),
          let tiff = image.tiffRepresentation,
          let rep = NSBitmapImageRep(data: tiff),
          let cgImage = rep.cgImage
    else {
        print("\(processed)/\(total) \(name) — could not load")
        continue
    }

    // One freshly configured recognition request per image.
    let request = VNRecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = ["es-ES", "es", "en-US"]
    request.usesLanguageCorrection = true
    // For the 2020 book, automatic language detection helps with mixed content.
    if #available(macOS 13.0, *) {
        request.automaticallyDetectsLanguage = true
    }

    do {
        try VNImageRequestHandler(cgImage: cgImage, options: [:]).perform([request])

        // Keep the top candidate of each observation, dropping empty lines,
        // and track the running confidence sum for the mean.
        var recognizedLines: [String] = []
        var confidenceSum: Float = 0
        var lineCount = 0
        for observation in request.results ?? [] {
            guard let candidate = observation.topCandidates(1).first else { continue }
            let text = candidate.string.trimmingCharacters(in: .whitespaces)
            if !text.isEmpty {
                recognizedLines.append(text)
                confidenceSum += candidate.confidence
                lineCount += 1
            }
        }
        let meanConfidence = lineCount > 0 ? Double(confidenceSum) / Double(lineCount) : 0.0
        results[name] = OCRResult(lines: recognizedLines, confidence: meanConfidence)
    } catch {
        print("\(processed)/\(total) \(name) — error: \(error)")
    }

    // Progress heartbeat every 50 images (and once at the very end), with a
    // rough throughput and ETA estimate.
    if processed % 50 == 0 || processed == total {
        let elapsed = Date().timeIntervalSince(startTime)
        let rate = Double(processed) / max(elapsed, 0.001)
        let remaining = Double(total - processed) / max(rate, 0.001)
        print(String(format: "%d/%d %.1f img/s eta %.0fs", processed, total, rate, remaining))
    }
}
// Serialize the OCR map deterministically (pretty-printed, sorted keys) so
// repeated runs produce stable, diffable output.
let encoder = JSONEncoder()
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
do {
    let data = try encoder.encode(results)
    // FIX: write atomically so an interrupted run (crash, Ctrl-C, disk full
    // mid-write) cannot leave a truncated or half-written JSON file behind —
    // downstream pipeline stages parse this file and would fail confusingly
    // on partial output.
    try data.write(to: outputURL, options: [.atomic])
    print("Wrote \(results.count) OCR entries to \(outputURL.path)")
} catch {
    print("Error writing output: \(error)")
    exit(1)
}
Reference in New Issue
Block a user