Add textbook reader, exercise grading, stem-change toggle, extraction pipeline

Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present-tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00
parent 5ba76a947b
commit 63dfc5e41a
34 changed files with 4516 additions and 61 deletions
--- a/Conjuga/SharedModels/Sources/SharedModels/AnswerGrader.swift
+++ b/Conjuga/SharedModels/Sources/SharedModels/AnswerGrader.swift
@@ -0,0 +1,68 @@
+import Foundation
+
+/// Deterministic answer grader built only on local string operations (no network calls); partial credit is reported as `.close`.
+/// Caseless enum used purely as a namespace for the static helpers below.
+public enum AnswerGrader {
+
+    /// Grades `userText` against `canonical` plus every entry of `alternates`; all comparisons use `normalize`d text, and empty normalized input is `.wrong`.
+    /// Passes run in order, each over all candidates: exact normalized match → `.correct`; equal after `stripAccents` → `.close`;
+    /// `levenshtein` distance <= 1 → `.close`; otherwise `.wrong`. So an exact match on a later candidate beats a near match on an earlier one.
+    public static func grade(userText: String, canonical: String, alternates: [String] = []) -> TextbookGrade {
+        let candidates = [canonical] + alternates
+        let normalizedUser = normalize(userText)
+        if normalizedUser.isEmpty { return .wrong }  // e.g. blank or whitespace-only input
+
+        for c in candidates {
+            if normalize(c) == normalizedUser { return .correct }
+        }
+        for c in candidates {
+            if stripAccents(normalize(c)) == stripAccents(normalizedUser) {
+                return .close
+            }
+        }
+        for c in candidates {
+            if levenshtein(normalizedUser, normalize(c)) <= 1 {  // NOTE(review): when a candidate normalizes to one character, any one-character input is within distance 1 and grades .close — confirm intended
+                return .close
+            }
+        }
+        return .wrong
+    }
+
+    /// Lowercases with the Spanish ("es") locale, collapses each whitespace run to one space, trims surrounding whitespace, then trims leading/trailing punctuation.
+    public static func normalize(_ s: String) -> String {
+        let lowered = s.lowercased(with: Locale(identifier: "es"))
+        let collapsed = lowered.replacingOccurrences(of: "\\s+", with: " ", options: .regularExpression)
+        let trimmed = collapsed.trimmingCharacters(in: .whitespacesAndNewlines)  // NOTE(review): whitespace is trimmed before punctuation, so "hola ." ends up as "hola " (space kept) — confirm intended
+        let punct = CharacterSet(charactersIn: ".,;:!?¿¡\"'()[]{}—–-")  // trimmed only at the two ends; interior punctuation is kept
+        return trimmed.trimmingCharacters(in: punct)
+    }
+
+    /// Folds diacritics away with `.diacriticInsensitive` (á→a, ñ→n, ü→u); no other folding option is applied, so case is left as given.
+    public static func stripAccents(_ s: String) -> String {
+        s.folding(options: .diacriticInsensitive, locale: Locale(identifier: "en"))
+    }
+
+    /// Levenshtein edit distance (unit-cost insert / delete / substitute) between `a` and `b`, compared `Character` by `Character`; keeps only two rows of the table.
+    public static func levenshtein(_ a: String, _ b: String) -> Int {
+        if a == b { return 0 }
+        if a.isEmpty { return b.count }  // these guards also keep the 1...count ranges below non-empty
+        if b.isEmpty { return a.count }
+        let aa = Array(a)
+        let bb = Array(b)
+        var prev = Array(0...bb.count)  // row for the empty prefix of a: distance to each prefix of b
+        var curr = Array(repeating: 0, count: bb.count + 1)  // row being filled for the current prefix of a
+        for i in 1...aa.count {
+            curr[0] = i  // distance from the first i characters of a to the empty string
+            for j in 1...bb.count {
+                let cost = aa[i - 1] == bb[j - 1] ? 0 : 1
+                curr[j] = min(
+                    prev[j] + 1,  // delete aa[i - 1]
+                    curr[j - 1] + 1,  // insert bb[j - 1]
+                    prev[j - 1] + cost  // keep (cost 0) or substitute (cost 1)
+                )
+            }
+            swap(&prev, &curr)  // the finished row becomes the previous row for the next i
+        }
+        return prev[bb.count]  // after the final swap the last completed row is in prev
+    }
+}
--- a/Conjuga/SharedModels/Sources/SharedModels/TextbookChapter.swift
+++ b/Conjuga/SharedModels/Sources/SharedModels/TextbookChapter.swift
@@ -0,0 +1,86 @@
+import Foundation
+import SwiftData
+
+/// SwiftData model for one chapter of the textbook. Ordered content blocks are stored as JSON in `bodyJSON`
+/// (encoded [TextbookBlock], decoded by `blocks()`) since SwiftData @Model doesn't support heterogeneous arrays.
+@Model
+public final class TextbookChapter {
+    @Attribute(.unique) public var id: String = ""  // declared unique via @Attribute(.unique)
+    public var number: Int = 0
+    public var title: String = ""
+    public var part: Int = 0                    // 0 = no part assignment
+    public var courseName: String = ""
+    public var bodyJSON: Data = Data()  // JSON-encoded [TextbookBlock]; the default empty Data decodes to no blocks
+    public var exerciseCount: Int = 0  // NOTE(review): stored value; nothing in this class derives it from bodyJSON
+    public var vocabTableCount: Int = 0  // NOTE(review): stored value; nothing in this class derives it from bodyJSON
+
+    public init(  // memberwise initializer: every stored property is supplied by the caller
+        id: String,
+        number: Int,
+        title: String,
+        part: Int,
+        courseName: String,
+        bodyJSON: Data,
+        exerciseCount: Int,
+        vocabTableCount: Int
+    ) {
+        self.id = id
+        self.number = number
+        self.title = title
+        self.part = part
+        self.courseName = courseName
+        self.bodyJSON = bodyJSON
+        self.exerciseCount = exerciseCount
+        self.vocabTableCount = vocabTableCount
+    }
+
+    public func blocks() -> [TextbookBlock] {  // decodes bodyJSON on every call; a decode failure is swallowed by try? and yields []
+        (try? JSONDecoder().decode([TextbookBlock].self, from: bodyJSON)) ?? []
+    }
+}
+
+/// One content block within a chapter. Polymorphic via `kind`: every payload field is optional and grouped below by the kind it is commented for.
+public struct TextbookBlock: Codable, Identifiable, Sendable {
+    public enum Kind: String, Codable, Sendable {  // String-backed, so the raw values are what Codable reads and writes
+        case heading
+        case paragraph
+        case keyVocabHeader = "key_vocab_header"
+        case vocabTable = "vocab_table"
+        case exercise
+    }
+
+    public var id: String { "\(kind.rawValue):\(index)" }  // computed, so not part of the synthesized Codable keys; NOTE(review): unique only if index is unique per kind — confirm
+    public var index: Int  // presumably the block's position within the chapter — TODO confirm
+    public var kind: Kind
+
+    // heading
+    public var level: Int?
+    // heading / paragraph
+    public var text: String?
+
+    // vocab_table
+    public var sourceImage: String?
+    public var ocrLines: [String]?
+    public var ocrConfidence: Double?
+    public var cards: [TextbookVocabPair]?
+
+    // exercise
+    public var exerciseId: String?
+    public var instruction: String?
+    public var extra: [String]?
+    public var prompts: [String]?
+    public var answerItems: [TextbookAnswerItem]?
+    public var freeform: Bool?
+}
+
+public struct TextbookVocabPair: Codable, Sendable {  // element type of TextbookBlock.cards
+    public var front: String  // front-side text; which language sits on which side is not visible here — TODO confirm
+    public var back: String  // back-side text
+}
+
+public struct TextbookAnswerItem: Codable, Sendable {  // element type of TextbookBlock.answerItems
+    public var label: String?      // A/B/C subpart label or nil
+    public var number: Int  // presumably the prompt number this answer belongs to — TODO confirm
+    public var answer: String  // NOTE(review): looks like the canonical answer handed to AnswerGrader.grade — confirm at the call site
+    public var alternates: [String]  // non-optional with no default, so synthesized decoding requires the key to be present
+}
--- a/Conjuga/SharedModels/Sources/SharedModels/TextbookExerciseAttempt.swift
+++ b/Conjuga/SharedModels/Sources/SharedModels/TextbookExerciseAttempt.swift
@@ -0,0 +1,83 @@
+import Foundation
+import SwiftData
+
+/// Per-prompt grading state recorded after the user submits an exercise. Int raw values (wrong = 0, close = 1, correct = 2) are what Codable encodes.
+public enum TextbookGrade: Int, Codable, Sendable {
+    case wrong = 0
+    case close = 1  // partial credit: AnswerGrader.grade returns it for accent-only or distance-1 differences
+    case correct = 2
+}
+
+/// User's attempt for one exercise: per-prompt state plus grade tallies. Stored in the cloud container so progress
+/// syncs across devices.
+@Model
+public final class TextbookExerciseAttempt {
+    /// Deterministic id: "<courseName>|<exerciseId>" (built by `attemptId`). CloudKit-synced models can't
+    /// use @Attribute(.unique); code that writes attempts must fetch-or-create.
+    public var id: String = ""
+    public var courseName: String = ""
+    public var chapterNumber: Int = 0
+    public var exerciseId: String = ""
+
+    /// JSON-encoded [TextbookPromptState]; read with `promptStates()`, written by `setPromptStates(_:)`.
+    /// Each entry: { "number": Int, "userText": String, "grade": Int } — "grade" is the TextbookGrade raw value.
+    public var stateJSON: Data = Data()
+
+    public var lastAttemptAt: Date = Date()  // NOTE(review): not touched by setPromptStates(_:); callers must update it themselves
+    public var correctCount: Int = 0  // the four counters below are recomputed by setPromptStates(_:)
+    public var closeCount: Int = 0
+    public var wrongCount: Int = 0
+    public var totalCount: Int = 0
+
+    public init(  // only the four identifying fields are required; state, timestamp and counters have defaults
+        id: String,
+        courseName: String,
+        chapterNumber: Int,
+        exerciseId: String,
+        stateJSON: Data = Data(),
+        lastAttemptAt: Date = Date(),
+        correctCount: Int = 0,
+        closeCount: Int = 0,
+        wrongCount: Int = 0,
+        totalCount: Int = 0
+    ) {
+        self.id = id
+        self.courseName = courseName
+        self.chapterNumber = chapterNumber
+        self.exerciseId = exerciseId
+        self.stateJSON = stateJSON
+        self.lastAttemptAt = lastAttemptAt
+        self.correctCount = correctCount
+        self.closeCount = closeCount
+        self.wrongCount = wrongCount
+        self.totalCount = totalCount
+    }
+
+    public func promptStates() -> [TextbookPromptState] {  // decodes stateJSON; a decode failure is swallowed by try? and yields []
+        (try? JSONDecoder().decode([TextbookPromptState].self, from: stateJSON)) ?? []
+    }
+
+    public func setPromptStates(_ states: [TextbookPromptState]) {  // replaces stateJSON and recomputes every counter from `states`
+        stateJSON = (try? JSONEncoder().encode(states)) ?? Data()  // an encode failure is swallowed and stores empty Data
+        correctCount = states.filter { $0.grade == .correct }.count
+        closeCount = states.filter { $0.grade == .close }.count
+        wrongCount = states.filter { $0.grade == .wrong }.count
+        totalCount = states.count
+    }
+
+    public static func attemptId(courseName: String, exerciseId: String) -> String {  // NOTE(review): "|" is not escaped, so ids are ambiguous if either part can contain "|" — confirm it cannot
+        "\(courseName)|\(exerciseId)"
+    }
+}
+
+public struct TextbookPromptState: Codable, Sendable {  // one entry of the array encoded in TextbookExerciseAttempt.stateJSON
+    public var number: Int  // presumably the prompt number within the exercise — TODO confirm
+    public var userText: String  // NOTE(review): looks like the text exactly as the user entered it — confirm whether callers normalize first
+    public var grade: TextbookGrade  // encoded as the enum's Int raw value
+
+    public init(number: Int, userText: String, grade: TextbookGrade) {  // explicit public init: the synthesized memberwise init would be internal
+        self.number = number
+        self.userText = userText
+        self.grade = grade
+    }
+}
--- a/Conjuga/SharedModels/Tests/SharedModelsTests/AnswerGraderTests.swift
+++ b/Conjuga/SharedModels/Tests/SharedModelsTests/AnswerGraderTests.swift
@@ -0,0 +1,80 @@
+import Testing
+@testable import SharedModels
+
+@Suite("AnswerGrader")  // Swift Testing suite for AnswerGrader.grade and its three helpers
+struct AnswerGraderTests {
+
+    @Test("exact match is correct")
+    func exact() {
+        #expect(AnswerGrader.grade(userText: "tengo", canonical: "tengo") == .correct)
+        #expect(AnswerGrader.grade(userText: "Tengo", canonical: "tengo") == .correct)  // case is folded by normalize
+        #expect(AnswerGrader.grade(userText: "  tengo  ", canonical: "tengo") == .correct)  // surrounding whitespace is trimmed
+    }
+
+    @Test("missing accent is close")
+    func missingAccent() {
+        #expect(AnswerGrader.grade(userText: "esta", canonical: "está") == .close)
+        #expect(AnswerGrader.grade(userText: "nino", canonical: "niño") == .close)  // ñ typed as n is treated like a missing accent
+        #expect(AnswerGrader.grade(userText: "asi", canonical: "así") == .close)
+    }
+
+    @Test("single-char typo is close")
+    func singleCharTypo() {
+        // deletion: the "e" of "tengo" is missing
+        #expect(AnswerGrader.grade(userText: "tngo", canonical: "tengo") == .close)
+        // insertion: one extra trailing "o"
+        #expect(AnswerGrader.grade(userText: "tengoo", canonical: "tengo") == .close)
+        // substitution: final "o" typed as "u"
+        #expect(AnswerGrader.grade(userText: "tengu", canonical: "tengo") == .close)
+    }
+
+    @Test("two-char typo is wrong")
+    func twoCharTypo() {
+        #expect(AnswerGrader.grade(userText: "tngu", canonical: "tengo") == .wrong)  // distance 2: missing "e" plus "o" typed as "u"
+    }
+
+    @Test("empty is wrong")
+    func empty() {
+        #expect(AnswerGrader.grade(userText: "", canonical: "tengo") == .wrong)
+        #expect(AnswerGrader.grade(userText: "   ", canonical: "tengo") == .wrong)  // whitespace-only input normalizes to empty
+    }
+
+    @Test("alternates accepted")
+    func alternates() {
+        #expect(AnswerGrader.grade(userText: "flaca", canonical: "delgada", alternates: ["flaca"]) == .correct)
+        #expect(AnswerGrader.grade(userText: "flacca", canonical: "delgada", alternates: ["flaca"]) == .close)  // one insertion away from the alternate
+    }
+
+    @Test("punctuation stripped")
+    func punctuation() {
+        #expect(AnswerGrader.grade(userText: "el libro.", canonical: "el libro") == .correct)
+        #expect(AnswerGrader.grade(userText: "¿dónde?", canonical: "dónde") == .correct)  // leading "¿" and trailing "?" are both trimmed
+    }
+
+    @Test("very different text is wrong")
+    func wrong() {
+        #expect(AnswerGrader.grade(userText: "hola", canonical: "tengo") == .wrong)
+        #expect(AnswerGrader.grade(userText: "casa", canonical: "perro") == .wrong)
+    }
+
+    @Test("normalize produces expected output")
+    func normalize() {
+        #expect(AnswerGrader.normalize("  Hola  ") == "hola")
+        #expect(AnswerGrader.normalize("ABC!") == "abc")  // lowercased, trailing "!" trimmed
+    }
+
+    @Test("stripAccents handles common Spanish diacritics")
+    func stripAccents() {
+        #expect(AnswerGrader.stripAccents("niño") == "nino")
+        #expect(AnswerGrader.stripAccents("está") == "esta")
+        #expect(AnswerGrader.stripAccents("güero") == "guero")  // diaeresis on ü is removed as well
+    }
+
+    @Test("levenshtein computes edit distance")
+    func levenshtein() {
+        #expect(AnswerGrader.levenshtein("kitten", "sitting") == 3)
+        #expect(AnswerGrader.levenshtein("flaw", "lawn") == 2)
+        #expect(AnswerGrader.levenshtein("abc", "abc") == 0)  // equal strings take the early return
+        #expect(AnswerGrader.levenshtein("", "abc") == 3)  // empty first argument returns the other string's length
+    }
+}