Add textbook reader, exercise grading, stem-change toggle, extraction pipeline

Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-04-19 15:12:55 -05:00
parent 5ba76a947b
commit 63dfc5e41a
34 changed files with 4516 additions and 61 deletions

View File

@@ -0,0 +1,68 @@
import Foundation
/// On-device deterministic answer grader with partial-credit support.
/// No network calls, no API keys. Handles accent stripping and single-char typos.
public enum AnswerGrader {
    /// Characters stripped from both ends of an answer by `normalize(_:)`:
    /// whitespace/newlines plus common sentence punctuation and quotes.
    private static let edgeTrimSet: CharacterSet = CharacterSet.whitespacesAndNewlines
        .union(CharacterSet(charactersIn: ".,;:!?¿¡\"'()[]{}—–-"))

    /// Grades `userText` against the canonical answer and any alternates.
    ///
    /// - Parameters:
    ///   - userText: The raw text the user entered.
    ///   - canonical: The primary expected answer.
    ///   - alternates: Additional accepted answers; defaults to none.
    /// - Returns: `.correct` when the normalized text equals a normalized candidate,
    ///   `.close` when it matches a candidate after accent stripping or is within
    ///   Levenshtein distance 1 of one, and `.wrong` otherwise (including when the
    ///   user text normalizes to an empty string).
    public static func grade(userText: String, canonical: String, alternates: [String] = []) -> TextbookGrade {
        let normalizedUser = normalize(userText)
        if normalizedUser.isEmpty { return .wrong }

        // Normalize each candidate once. Candidates that normalize to "" are dropped:
        // an empty candidate is at edit distance 1 from every one-character answer
        // and would otherwise grade arbitrary single letters as `.close`.
        let targets = ([canonical] + alternates)
            .map { normalize($0) }
            .filter { !$0.isEmpty }

        // Exact matches against any candidate take priority over partial credit.
        if targets.contains(normalizedUser) { return .correct }

        let foldedUser = stripAccents(normalizedUser)
        if targets.contains(where: { stripAccents($0) == foldedUser }) {
            return .close
        }

        if targets.contains(where: { levenshtein(normalizedUser, $0) <= 1 }) {
            return .close
        }
        return .wrong
    }

    /// Lowercases (Spanish locale), collapses whitespace runs to a single space,
    /// and strips leading/trailing whitespace and punctuation.
    ///
    /// Whitespace and punctuation are trimmed together so that input such as
    /// `"hola ."` or `"¿ dónde ?"` does not keep a stray edge space once the
    /// punctuation is removed.
    public static func normalize(_ s: String) -> String {
        let lowered = s.lowercased(with: Locale(identifier: "es"))
        let collapsed = lowered.replacingOccurrences(of: "\\s+", with: " ", options: .regularExpression)
        return collapsed.trimmingCharacters(in: edgeTrimSet)
    }

    /// Removes diacritics (á→a, ñ→n, ü→u).
    public static func stripAccents(_ s: String) -> String {
        s.folding(options: .diacriticInsensitive, locale: Locale(identifier: "en"))
    }

    /// Standard Levenshtein edit distance, computed over `Character`s.
    ///
    /// Uses two rolling rows, so memory is proportional to the length of `b`.
    public static func levenshtein(_ a: String, _ b: String) -> Int {
        if a == b { return 0 }
        if a.isEmpty { return b.count }
        if b.isEmpty { return a.count }

        let aa = Array(a)
        let bb = Array(b)
        // `prev` is the row for the first i-1 characters of `a`; `curr` is being filled.
        var prev = Array(0...bb.count)
        var curr = Array(repeating: 0, count: bb.count + 1)

        for i in 1...aa.count {
            curr[0] = i
            for j in 1...bb.count {
                let cost = aa[i - 1] == bb[j - 1] ? 0 : 1
                curr[j] = min(
                    prev[j] + 1,        // deletion
                    curr[j - 1] + 1,    // insertion
                    prev[j - 1] + cost  // substitution (or match)
                )
            }
            swap(&prev, &curr)
        }
        // After the final swap the completed row lives in `prev`.
        return prev[bb.count]
    }
}

View File

@@ -0,0 +1,86 @@
import Foundation
import SwiftData
/// A single textbook chapter persisted with SwiftData.
///
/// The chapter's ordered content is kept in `bodyJSON` as an encoded `[TextbookBlock]`,
/// because SwiftData `@Model` types don't support heterogeneous arrays.
@Model
public final class TextbookChapter {
    @Attribute(.unique) public var id: String = ""
    public var number: Int = 0
    public var title: String = ""
    public var part: Int = 0 // 0 = no part assignment
    public var courseName: String = ""
    /// JSON-encoded `[TextbookBlock]`; read it through `blocks()`.
    public var bodyJSON: Data = Data()
    public var exerciseCount: Int = 0
    public var vocabTableCount: Int = 0

    /// Creates a chapter with every stored field supplied by the caller.
    public init(
        id: String,
        number: Int,
        title: String,
        part: Int,
        courseName: String,
        bodyJSON: Data,
        exerciseCount: Int,
        vocabTableCount: Int
    ) {
        self.id = id
        self.number = number
        self.title = title
        self.part = part
        self.courseName = courseName
        self.bodyJSON = bodyJSON
        self.exerciseCount = exerciseCount
        self.vocabTableCount = vocabTableCount
    }

    /// Decodes `bodyJSON` into content blocks.
    ///
    /// - Returns: The decoded blocks, or an empty array when `bodyJSON` cannot be
    ///   decoded as `[TextbookBlock]`.
    public func blocks() -> [TextbookBlock] {
        do {
            return try JSONDecoder().decode([TextbookBlock].self, from: bodyJSON)
        } catch {
            // Decoding failures are intentionally reported as "no content".
            return []
        }
    }
}
/// One content block within a chapter. Polymorphic via `kind`.
///
/// All kind-specific fields are optional; the grouping comments below indicate which
/// `kind` each field belongs to. `TextbookChapter.blocks()` decodes these with a default
/// `JSONDecoder`, so JSON keys must match the property names (and `Kind` raw values) exactly.
public struct TextbookBlock: Codable, Identifiable, Sendable {
/// Discriminator for the block's content type. Raw values are the strings used in JSON.
public enum Kind: String, Codable, Sendable {
case heading
case paragraph
case keyVocabHeader = "key_vocab_header"
case vocabTable = "vocab_table"
case exercise
}
/// Derived identity: the kind's raw value and `index` joined by a colon.
public var id: String { "\(kind.rawValue):\(index)" }
// NOTE(review): presumably the block's position within its chapter — confirm against the pipeline output.
public var index: Int
public var kind: Kind
// Fields used by `heading` blocks.
public var level: Int?
// Fields used by `heading` and `paragraph` blocks.
public var text: String?
// Fields used by `vocab_table` blocks.
public var sourceImage: String?
public var ocrLines: [String]?
public var ocrConfidence: Double?
public var cards: [TextbookVocabPair]?
// Fields used by `exercise` blocks.
public var exerciseId: String?
public var instruction: String?
public var extra: [String]?
public var prompts: [String]?
public var answerItems: [TextbookAnswerItem]?
public var freeform: Bool?
}
/// A two-sided vocabulary entry, used as the element type of `TextbookBlock.cards`.
// NOTE(review): which language goes on `front` vs `back` is not visible here — confirm against the extraction pipeline.
public struct TextbookVocabPair: Codable, Sendable {
public var front: String
public var back: String
}
/// One answer-key entry, used as the element type of `TextbookBlock.answerItems`.
public struct TextbookAnswerItem: Codable, Sendable {
public var label: String? // A/B/C subpart label or nil
/// Item number of the prompt this answer belongs to.
public var number: Int
/// The expected answer text.
public var answer: String
/// Other accepted answers. Non-optional, so synthesized decoding requires the key to be present.
public var alternates: [String]
}

View File

@@ -0,0 +1,83 @@
import Foundation
import SwiftData
/// Per-prompt grading state recorded after the user submits an exercise.
///
/// Backed by `Int` raw values, which is how the grade is encoded via `Codable`;
/// changing a raw value would change the stored representation.
public enum TextbookGrade: Int, Codable, Sendable {
/// The answer did not match.
case wrong = 0
/// Partial credit: a near match.
case close = 1
/// Full credit.
case correct = 2
}
/// The user's recorded attempt at a single exercise. Lives in the cloud container so
/// that progress syncs across devices.
@Model
public final class TextbookExerciseAttempt {
    /// Deterministic id: "<courseName>|<exerciseId>". CloudKit-synced models can't
    /// use @Attribute(.unique); code that writes attempts must fetch-or-create.
    public var id: String = ""
    public var courseName: String = ""
    public var chapterNumber: Int = 0
    public var exerciseId: String = ""
    /// JSON-encoded per-prompt state array.
    /// Each entry: { "number": Int, "userText": String, "grade": Int }
    public var stateJSON: Data = Data()
    public var lastAttemptAt: Date = Date()
    public var correctCount: Int = 0
    public var closeCount: Int = 0
    public var wrongCount: Int = 0
    public var totalCount: Int = 0

    /// Creates an attempt record. Everything after `exerciseId` defaults to an
    /// empty/zero state, with `lastAttemptAt` defaulting to the current date.
    public init(
        id: String,
        courseName: String,
        chapterNumber: Int,
        exerciseId: String,
        stateJSON: Data = Data(),
        lastAttemptAt: Date = Date(),
        correctCount: Int = 0,
        closeCount: Int = 0,
        wrongCount: Int = 0,
        totalCount: Int = 0
    ) {
        self.id = id
        self.courseName = courseName
        self.chapterNumber = chapterNumber
        self.exerciseId = exerciseId
        self.stateJSON = stateJSON
        self.lastAttemptAt = lastAttemptAt
        self.correctCount = correctCount
        self.closeCount = closeCount
        self.wrongCount = wrongCount
        self.totalCount = totalCount
    }

    /// Decodes the per-prompt states held in `stateJSON`.
    ///
    /// - Returns: The decoded states, or an empty array when `stateJSON` cannot be decoded.
    public func promptStates() -> [TextbookPromptState] {
        do {
            return try JSONDecoder().decode([TextbookPromptState].self, from: stateJSON)
        } catch {
            return []
        }
    }

    /// Stores `states` as JSON and refreshes the per-grade tallies and total.
    ///
    /// If encoding fails, `stateJSON` is set to empty data; the counts are still
    /// derived from `states`.
    public func setPromptStates(_ states: [TextbookPromptState]) {
        let encoded: Data
        do {
            encoded = try JSONEncoder().encode(states)
        } catch {
            encoded = Data()
        }

        // Tally all three grades in one pass over the states.
        var correct = 0
        var close = 0
        var wrong = 0
        for state in states {
            switch state.grade {
            case .correct: correct += 1
            case .close: close += 1
            case .wrong: wrong += 1
            }
        }

        stateJSON = encoded
        correctCount = correct
        closeCount = close
        wrongCount = wrong
        totalCount = states.count
    }

    /// Builds the deterministic attempt id: the course name and exercise id joined by `|`.
    public static func attemptId(courseName: String, exerciseId: String) -> String {
        [courseName, exerciseId].joined(separator: "|")
    }
}
/// The stored result for one prompt of an exercise: the prompt's number, what the
/// user typed, and the grade it received. Encoded as an array element inside
/// `TextbookExerciseAttempt.stateJSON`.
public struct TextbookPromptState: Codable, Sendable {
public var number: Int
public var userText: String
public var grade: TextbookGrade
/// Creates a prompt state from its three fields.
public init(number: Int, userText: String, grade: TextbookGrade) {
self.number = number
self.userText = userText
self.grade = grade
}
}

View File

@@ -0,0 +1,80 @@
import Testing
@testable import SharedModels
/// Table-driven checks for `AnswerGrader`: grading outcomes plus the public helpers.
@Suite("AnswerGrader")
struct AnswerGraderTests {
    @Test("exact match is correct")
    func exact() {
        // Case and surrounding whitespace must not affect an exact match.
        for input in ["tengo", "Tengo", " tengo "] {
            #expect(AnswerGrader.grade(userText: input, canonical: "tengo") == .correct)
        }
    }

    @Test("missing accent is close")
    func missingAccent() {
        let cases: [(typed: String, expected: String)] = [
            ("esta", "está"),
            ("nino", "niño"),
            ("asi", "así"),
        ]
        for item in cases {
            #expect(AnswerGrader.grade(userText: item.typed, canonical: item.expected) == .close)
        }
    }

    @Test("single-char typo is close")
    func singleCharTypo() {
        let typos = [
            "tngo",   // deletion
            "tengoo", // insertion
            "tengu",  // substitution
        ]
        for typo in typos {
            #expect(AnswerGrader.grade(userText: typo, canonical: "tengo") == .close)
        }
    }

    @Test("two-char typo is wrong")
    func twoCharTypo() {
        let result = AnswerGrader.grade(userText: "tngu", canonical: "tengo")
        #expect(result == .wrong)
    }

    @Test("empty is wrong")
    func empty() {
        for blank in ["", " "] {
            #expect(AnswerGrader.grade(userText: blank, canonical: "tengo") == .wrong)
        }
    }

    @Test("alternates accepted")
    func alternates() {
        let accepted = ["flaca"]
        #expect(AnswerGrader.grade(userText: "flaca", canonical: "delgada", alternates: accepted) == .correct)
        #expect(AnswerGrader.grade(userText: "flacca", canonical: "delgada", alternates: accepted) == .close)
    }

    @Test("punctuation stripped")
    func punctuation() {
        let cases: [(typed: String, expected: String)] = [
            ("el libro.", "el libro"),
            ("¿dónde?", "dónde"),
        ]
        for item in cases {
            #expect(AnswerGrader.grade(userText: item.typed, canonical: item.expected) == .correct)
        }
    }

    @Test("very different text is wrong")
    func wrong() {
        let cases: [(typed: String, expected: String)] = [
            ("hola", "tengo"),
            ("casa", "perro"),
        ]
        for item in cases {
            #expect(AnswerGrader.grade(userText: item.typed, canonical: item.expected) == .wrong)
        }
    }

    @Test("normalize produces expected output")
    func normalize() {
        let cases: [(raw: String, normalized: String)] = [
            (" Hola ", "hola"),
            ("ABC!", "abc"),
        ]
        for item in cases {
            #expect(AnswerGrader.normalize(item.raw) == item.normalized)
        }
    }

    @Test("stripAccents handles common Spanish diacritics")
    func stripAccents() {
        let cases: [(accented: String, plain: String)] = [
            ("niño", "nino"),
            ("está", "esta"),
            ("güero", "guero"),
        ]
        for item in cases {
            #expect(AnswerGrader.stripAccents(item.accented) == item.plain)
        }
    }

    @Test("levenshtein computes edit distance")
    func levenshtein() {
        let cases: [(lhs: String, rhs: String, distance: Int)] = [
            ("kitten", "sitting", 3),
            ("flaw", "lawn", 2),
            ("abc", "abc", 0),
            ("", "abc", 3),
        ]
        for item in cases {
            #expect(AnswerGrader.levenshtein(item.lhs, item.rhs) == item.distance)
        }
    }
}