Files
Spanish/Conjuga/Conjuga/Services/DataLoader.swift
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

518 lines
21 KiB
Swift

import SwiftData
import SharedModels
import Foundation
actor DataLoader {
/// Version of the bundled course data. Compared against the integer stored
/// under `courseDataKey` to decide whether course data must be re-seeded.
static let courseDataVersion = 7
/// `UserDefaults` key holding the course data version that was last seeded.
static let courseDataKey = "courseDataVersion"
/// Version of the bundled textbook data. Compared against the integer stored
/// under `textbookDataKey` to decide whether textbook data must be re-seeded.
static let textbookDataVersion = 8
/// `UserDefaults` key holding the textbook data version that was last seeded.
static let textbookDataKey = "textbookDataVersion"
/// Reports whether any seeding or refresh work is pending.
///
/// Returns `true` when the store holds no `Verb` rows (a failed count is
/// treated like an empty store), or when either the stored course or
/// textbook data version is older than the version compiled into the app.
///
/// - Parameter container: Container used to create the probing context.
/// - Returns: `true` if seeding or a data refresh is required.
static func needsSeeding(container: ModelContainer) async -> Bool {
    let probe = ModelContext(container)
    guard let verbs = try? probe.fetchCount(FetchDescriptor<Verb>()), verbs != 0 else {
        return true
    }
    let defaults = UserDefaults.standard
    let courseIsStale = defaults.integer(forKey: courseDataKey) < courseDataVersion
    let textbookIsStale = defaults.integer(forKey: textbookDataKey) < textbookDataVersion
    return courseIsStale || textbookIsStale
}
/// Seeds an empty store from the bundled JSON resources.
///
/// Does nothing when the store already contains at least one `Verb`.
/// Otherwise loads `conjuga_data.json`, inserts tense guides, verbs, verb
/// forms and irregular spans, then seeds course and textbook data and records
/// both data versions in `UserDefaults`, so the version-gated refresh helpers
/// do not immediately wipe and re-seed what was just inserted.
///
/// - Parameter container: Container the seeding `ModelContext` is created from.
static func seedIfNeeded(container: ModelContainer) async {
    let context = ModelContext(container)

    // Saves pending inserts; a failure is logged instead of being discarded.
    func saveLoggingFailure(_ stage: String) {
        do {
            try context.save()
        } catch {
            print("[DataLoader] ⚠️ save failed after \(stage): \(error)")
        }
    }

    let count: Int
    do {
        count = try context.fetchCount(FetchDescriptor<Verb>())
        print("[DataLoader] seedIfNeeded: existing verb count = \(count)")
    } catch {
        // Best effort: a failed count is treated as an empty store.
        print("[DataLoader] ⚠️ seedIfNeeded fetchCount threw: \(error)")
        count = 0
    }
    if count > 0 { return }
    print("Seeding database...")

    // Try direct bundle lookup first, then the Resources subdirectory.
    let url = Bundle.main.url(forResource: "conjuga_data", withExtension: "json")
        ?? Bundle.main.url(forResource: "conjuga_data", withExtension: "json", subdirectory: "Resources")
        ?? Bundle.main.bundleURL.appendingPathComponent("Resources/conjuga_data.json")
    guard let data = try? Data(contentsOf: url) else {
        print("ERROR: Could not load conjuga_data.json from bundle at \(url)")
        return
    }
    guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
        print("ERROR: Could not parse conjuga_data")
        return
    }

    // Seed tense guides
    if let guides = json["tenseGuides"] as? [[String: Any]] {
        for g in guides {
            guard let tenseId = g["tenseId"] as? String,
                  let title = g["title"] as? String,
                  let body = g["body"] as? String else { continue }
            context.insert(TenseGuide(tenseId: tenseId, title: title, body: body))
        }
    }

    // Seed verbs
    if let verbs = json["verbs"] as? [[String: Any]] {
        for v in verbs {
            guard let id = v["id"] as? Int,
                  let infinitive = v["infinitive"] as? String,
                  let english = v["english"] as? String,
                  let rank = v["rank"] as? Int,
                  let ending = v["ending"] as? String,
                  let reflexive = v["reflexive"] as? Int,
                  let level = v["level"] as? String else { continue }
            context.insert(Verb(id: id, infinitive: infinitive, english: english, rank: rank, ending: ending, reflexive: reflexive, level: level))
        }
        print("Inserted \(verbs.count) verbs")
    }
    saveLoggingFailure("verbs")

    // Seed verb forms: bulk insert, no relationship assignment (use verbId for queries).
    // Rows are saved chunk by chunk inside an autorelease pool.
    let chunkSize = 20000
    if let forms = json["verbForms"] as? [[String: Any]] {
        for chunkStart in stride(from: 0, to: forms.count, by: chunkSize) {
            autoreleasepool {
                let chunkEnd = min(chunkStart + chunkSize, forms.count)
                for f in forms[chunkStart..<chunkEnd] {
                    guard let verbId = f["verbId"] as? Int,
                          let tenseId = f["tenseId"] as? String,
                          let personIndex = f["personIndex"] as? Int,
                          let form = f["form"] as? String,
                          let regularity = f["regularity"] as? String else { continue }
                    context.insert(VerbForm(verbId: verbId, tenseId: tenseId, personIndex: personIndex, form: form, regularity: regularity))
                }
                saveLoggingFailure("verb forms chunk at \(chunkStart)")
            }
        }
        print("Inserted \(forms.count) verb forms")
    }

    // Seed irregular spans: bulk insert, chunked like the verb forms.
    if let spans = json["irregularSpans"] as? [[String: Any]] {
        for chunkStart in stride(from: 0, to: spans.count, by: chunkSize) {
            autoreleasepool {
                let chunkEnd = min(chunkStart + chunkSize, spans.count)
                for s in spans[chunkStart..<chunkEnd] {
                    guard let verbId = s["verbId"] as? Int,
                          let tenseId = s["tenseId"] as? String,
                          let personIndex = s["personIndex"] as? Int,
                          let spanType = s["type"] as? Int,
                          let pattern = s["pattern"] as? Int,
                          let spanStart = s["start"] as? Int,
                          let spanEnd = s["end"] as? Int else { continue }
                    context.insert(IrregularSpan(verbId: verbId, tenseId: tenseId, personIndex: personIndex, spanType: spanType, pattern: pattern, start: spanStart, end: spanEnd))
                }
                saveLoggingFailure("irregular spans chunk at \(chunkStart)")
            }
        }
        print("Inserted \(spans.count) irregular spans")
    }

    do {
        try context.save()
    } catch {
        print("[DataLoader] 🔥 Final verb save error: \(error)")
    }
    print("Verb seeding complete")

    // Seed course data through the same context used for the verb data, and
    // record its version so the course refresh does not redo this work.
    seedCourseData(context: context)
    UserDefaults.standard.set(courseDataVersion, forKey: courseDataKey)

    // Seed textbook data
    seedTextbookData(context: context)
    UserDefaults.standard.set(textbookDataVersion, forKey: textbookDataKey)
}
/// Re-seeds textbook data when the stored textbook data version is older
/// than `textbookDataVersion`.
///
/// Deletes every `TextbookChapter` and every `CourseDeck` whose `courseName`
/// equals the textbook course name, leaving all other decks in place, then
/// runs the textbook seeder and stores the new version. Each deletion step is
/// best effort: a failure is logged and the refresh continues.
///
/// - Parameter container: Container the refresh `ModelContext` is created from.
static func refreshTextbookDataIfNeeded(container: ModelContainer) async {
    let shared = UserDefaults.standard
    if shared.integer(forKey: textbookDataKey) >= textbookDataVersion { return }
    print("Textbook data version outdated — re-seeding...")
    let context = ModelContext(container)

    // Runs one step and logs its error instead of discarding it.
    func attempt(_ step: String, _ work: () throws -> Void) {
        do {
            try work()
        } catch {
            print("[DataLoader] ⚠️ textbook refresh: \(step) failed: \(error)")
        }
    }

    // Only wipe textbook chapters and our textbook-scoped CourseDecks
    // (not the LanGo decks, which live in the same tables).
    attempt("deleting chapters") {
        try context.delete(model: TextbookChapter.self)
    }
    // NOTE(review): seedTextbookData takes the course name from
    // textbook_data.json (falling back to "Textbook") — confirm it matches
    // this literal, otherwise previously seeded decks are not removed here.
    let textbookCourseName = "Complete Spanish Step-by-Step"
    let deckDescriptor = FetchDescriptor<CourseDeck>(
        predicate: #Predicate<CourseDeck> { $0.courseName == textbookCourseName }
    )
    attempt("deleting textbook decks") {
        for deck in try context.fetch(deckDescriptor) {
            context.delete(deck)
        }
    }
    attempt("saving deletions") {
        try context.save()
    }

    seedTextbookData(context: context)
    shared.set(textbookDataVersion, forKey: textbookDataKey)
    print("Textbook data re-seeded to version \(textbookDataVersion)")
}
/// Re-seeds course data when the stored course data version is older than
/// `courseDataVersion` (e.g. examples were added).
///
/// Safe to call on every launch: it checks a version key and only re-seeds
/// when needed. All `VocabCard` and `CourseDeck` rows are removed and rebuilt
/// from the bundled files. `TenseGuide` rows are replaced only when
/// replacement guides could be read from `conjuga_data.json`, so an unreadable
/// resource does not leave the store without guides.
///
/// - Parameter container: Container the refresh `ModelContext` is created from.
static func refreshCourseDataIfNeeded(container: ModelContainer) async {
    let shared = UserDefaults.standard
    if shared.integer(forKey: courseDataKey) >= courseDataVersion { return }
    print("Course data version outdated — re-seeding...")
    let context = ModelContext(container)

    // Runs one step and logs its error instead of discarding it.
    func attempt(_ step: String, _ work: () throws -> Void) {
        do {
            try work()
        } catch {
            print("[DataLoader] ⚠️ course refresh: \(step) failed: \(error)")
        }
    }

    // Load the replacement tense guides before deleting anything, using the
    // same lookup chain as the initial seed (bundle root, then Resources).
    let url = Bundle.main.url(forResource: "conjuga_data", withExtension: "json")
        ?? Bundle.main.url(forResource: "conjuga_data", withExtension: "json", subdirectory: "Resources")
        ?? Bundle.main.bundleURL.appendingPathComponent("Resources/conjuga_data.json")
    var replacementGuides: [[String: Any]]?
    if let data = try? Data(contentsOf: url),
       let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] {
        replacementGuides = json["tenseGuides"] as? [[String: Any]]
    }

    // Delete existing course data so it can be re-seeded from the bundle.
    attempt("deleting vocab cards") { try context.delete(model: VocabCard.self) }
    attempt("deleting course decks") { try context.delete(model: CourseDeck.self) }
    if replacementGuides != nil {
        attempt("deleting tense guides") { try context.delete(model: TenseGuide.self) }
    }
    attempt("saving deletions") { try context.save() }

    // Re-seed tense guides with the updated bodies from the bundled JSON.
    if let guides = replacementGuides {
        for g in guides {
            guard let tenseId = g["tenseId"] as? String,
                  let title = g["title"] as? String,
                  let body = g["body"] as? String else { continue }
            context.insert(TenseGuide(tenseId: tenseId, title: title, body: body))
        }
        attempt("saving tense guides") { try context.save() }
        print("Re-seeded \(guides.count) tense guides")
    } else {
        print("[DataLoader] ⚠️ Could not read tense guides from \(url); keeping existing tense guides")
    }

    // Re-seed course data
    seedCourseData(context: context)
    // Textbook's vocab decks/cards share the same CourseDeck/VocabCard
    // entities, so they were just wiped above. Reseed them.
    seedTextbookVocabDecks(context: context, courseName: "Complete Spanish Step-by-Step")
    shared.set(courseDataVersion, forKey: courseDataKey)
    print("Course data re-seeded to version \(courseDataVersion)")
}
/// Copies review progress stored on local `VocabCard` rows onto
/// `CourseReviewCard` rows in the cloud container.
///
/// Runs once per migration version, tracked in `UserDefaults` under
/// `courseProgressMigrationVersion`. A card is skipped when the matching
/// review card already has a review date that is not older than the legacy
/// one. The migration is marked complete only after the local fetch and the
/// cloud save both succeeded; on failure the key is left untouched so a later
/// call retries instead of silently dropping progress.
///
/// - Parameters:
///   - localContainer: Container holding the legacy `VocabCard` progress.
///   - cloudContainer: Container receiving the `CourseReviewCard` rows.
static func migrateCourseProgressIfNeeded(
    localContainer: ModelContainer,
    cloudContainer: ModelContainer
) async {
    let migrationVersion = 2
    let key = "courseProgressMigrationVersion"
    let shared = UserDefaults.standard
    if shared.integer(forKey: key) >= migrationVersion { return }

    let localContext = ModelContext(localContainer)
    let cloudContext = ModelContext(cloudContainer)

    let allCards: [VocabCard]
    do {
        allCards = try localContext.fetch(FetchDescriptor<VocabCard>())
    } catch {
        // Without the legacy cards nothing can be migrated; retry later.
        print("[DataLoader] ⚠️ Course progress migration could not read local cards: \(error)")
        return
    }

    var migratedCount = 0
    for card in allCards where hasLegacyCourseProgress(card) {
        let reviewKey = CourseCardStore.reviewKey(for: card)
        let reviewCard = findOrCreateCourseReviewCard(
            id: reviewKey,
            deckId: card.deckId,
            front: card.front,
            back: card.back,
            context: cloudContext
        )
        // Keep the cloud copy when it is at least as recent as the legacy one.
        if let reviewDate = reviewCard.lastReviewDate,
           let legacyDate = card.lastReviewDate,
           reviewDate >= legacyDate {
            continue
        }
        reviewCard.easeFactor = card.easeFactor
        reviewCard.interval = card.interval
        reviewCard.repetitions = card.repetitions
        reviewCard.dueDate = card.dueDate
        reviewCard.lastReviewDate = card.lastReviewDate
        migratedCount += 1
    }

    if migratedCount > 0 {
        do {
            try cloudContext.save()
            print("Migrated \(migratedCount) course progress cards to cloud store")
        } catch {
            // Do not record the version: the copied progress was not persisted.
            print("[DataLoader] ⚠️ Course progress migration save failed: \(error)")
            return
        }
    }
    shared.set(migrationVersion, forKey: key)
}
/// Inserts course decks and their vocabulary cards from the bundled
/// `course_data.json`.
///
/// Accepts two layouts: `{"courses": [...]}` and the older single-course
/// `{"course": "...", "weeks": [...]}`. Entries missing a required field are
/// skipped. The context is saved once per processed week.
///
/// - Parameter context: Context that receives the inserted models.
private static func seedCourseData(context: ModelContext) {
    let bundle = Bundle.main
    let fileURL = bundle.url(forResource: "course_data", withExtension: "json")
        ?? bundle.bundleURL.appendingPathComponent("course_data.json")
    guard let raw = try? Data(contentsOf: fileURL) else {
        print("No course_data.json found — skipping course seeding")
        return
    }
    guard let root = try? JSONSerialization.jsonObject(with: raw) as? [String: Any] else {
        print("ERROR: Could not parse course_data.json")
        return
    }

    // New layout wraps courses in an array; old layout is a single course object.
    let courseList: [[String: Any]]
    if let wrapped = root["courses"] as? [[String: Any]] {
        courseList = wrapped
    } else if root["weeks"] != nil {
        courseList = [root]
    } else {
        courseList = []
    }

    var decksInserted = 0
    var cardsInserted = 0
    for course in courseList {
        guard let weekList = course["weeks"] as? [[String: Any]],
              let courseName = course["course"] as? String else { continue }
        let slug = courseName.lowercased()
            .replacingOccurrences(of: " ", with: "-")
            .replacingOccurrences(of: "|", with: "")

        for week in weekList {
            guard let weekNumber = week["week"] as? Int,
                  let deckList = week["decks"] as? [[String: Any]] else { continue }

            for position in deckList.indices {
                let entry = deckList[position]
                guard let title = entry["title"] as? String,
                      let rawCards = entry["cards"] as? [Any] else { continue }
                let reversed = entry["isReversed"] as? Bool ?? false
                let direction = reversed ? "rev" : "fwd"
                let deckId = "\(slug)_w\(weekNumber)_\(position)_\(direction)"
                let deck = CourseDeck(
                    id: deckId,
                    weekNumber: weekNumber,
                    title: title,
                    cardCount: rawCards.count,
                    courseName: courseName,
                    isReversed: reversed
                )
                context.insert(deck)
                decksInserted += 1

                for rawCard in rawCards {
                    guard let fields = rawCard as? [String: Any],
                          let front = fields["front"] as? String,
                          let back = fields["back"] as? String else { continue }

                    // Example sentences are kept as three parallel arrays;
                    // an example without an "es" entry is dropped entirely.
                    var spanish: [String] = []
                    var english: [String] = []
                    var blanks: [String] = []
                    let examples = (fields["examples"] as? [[String: String]]) ?? []
                    for example in examples {
                        guard let es = example["es"] else { continue }
                        spanish.append(es)
                        english.append(example["en"] ?? "")
                        blanks.append(example["blank"] ?? "")
                    }

                    let card = VocabCard(
                        front: front,
                        back: back,
                        deckId: deckId,
                        examplesES: spanish,
                        examplesEN: english,
                        examplesBlanks: blanks
                    )
                    card.deck = deck
                    context.insert(card)
                    cardsInserted += 1
                }
            }
            try? context.save()
        }
    }
    print("Course seeding complete: \(decksInserted) decks, \(cardsInserted) cards")
}
/// Tells whether a local card carries review progress worth migrating.
///
/// A card counts as having progress when it has repetitions, a positive
/// interval, an ease factor that differs from 2.5 by more than 0.0001
/// (presumably 2.5 is the initial ease factor — confirm against `VocabCard`),
/// or a recorded last review date.
private static func hasLegacyCourseProgress(_ card: VocabCard) -> Bool {
    if card.repetitions > 0 { return true }
    if card.interval > 0 { return true }
    if abs(card.easeFactor - 2.5) > 0.0001 { return true }
    return card.lastReviewDate != nil
}
/// Returns the `CourseReviewCard` with the given id, inserting a new one
/// into `context` when the fetch finds none (or fails).
///
/// - Parameters:
///   - id: Identifier to look up and, if needed, assign to the new card.
///   - deckId: Deck identifier used only when a new card is created.
///   - front: Front text used only when a new card is created.
///   - back: Back text used only when a new card is created.
///   - context: Context that is searched and receives any new card.
/// - Returns: The existing or newly inserted review card.
private static func findOrCreateCourseReviewCard(
    id: String,
    deckId: String,
    front: String,
    back: String,
    context: ModelContext
) -> CourseReviewCard {
    let lookup = FetchDescriptor<CourseReviewCard>(
        predicate: #Predicate<CourseReviewCard> { $0.id == id }
    )
    let matches = (try? context.fetch(lookup)) ?? []
    guard let found = matches.first else {
        let created = CourseReviewCard(id: id, deckId: deckId, front: front, back: back)
        context.insert(created)
        return created
    }
    return found
}
// MARK: - Textbook seeding
/// Inserts one `TextbookChapter` per chapter found in the bundled
/// `textbook_data.json`, then seeds the textbook vocabulary decks.
///
/// Each chapter's blocks are rewritten to a fixed set of keys and stored as
/// JSON data on the chapter, along with counts of exercise and vocab-table
/// blocks. Chapters missing a required field, or whose blocks cannot be
/// encoded, are skipped.
///
/// - Parameter context: Context that receives the inserted models.
private static func seedTextbookData(context: ModelContext) {
    let bundle = Bundle.main
    let fileURL = bundle.url(forResource: "textbook_data", withExtension: "json")
        ?? bundle.bundleURL.appendingPathComponent("textbook_data.json")
    guard let raw = try? Data(contentsOf: fileURL) else {
        print("[DataLoader] textbook_data.json not bundled — skipping textbook seed")
        return
    }
    guard let root = try? JSONSerialization.jsonObject(with: raw) as? [String: Any] else {
        print("[DataLoader] ERROR: Could not parse textbook_data.json")
        return
    }
    let courseName = root["courseName"] as? String ?? "Textbook"
    guard let chapterList = root["chapters"] as? [[String: Any]] else {
        print("[DataLoader] ERROR: textbook_data.json missing chapters")
        return
    }

    // Per block kind: (key in the source block, key in the stored block).
    // Kinds not listed here, such as "key_vocab_header", keep only
    // `index` and `kind`.
    let copiedFields: [String: [(String, String)]] = [
        "heading": [("level", "level"), ("text", "text")],
        "paragraph": [("text", "text")],
        "vocab_table": [
            ("sourceImage", "sourceImage"),
            ("ocrLines", "ocrLines"),
            ("ocrConfidence", "ocrConfidence"),
        ],
        "exercise": [
            ("id", "exerciseId"),
            ("instruction", "instruction"),
            ("extra", "extra"),
            ("prompts", "prompts"),
            ("answerItems", "answerItems"),
            ("freeform", "freeform"),
        ],
    ]

    var chaptersInserted = 0
    for chapterDict in chapterList {
        guard let id = chapterDict["id"] as? String,
              let number = chapterDict["number"] as? Int,
              let title = chapterDict["title"] as? String,
              let rawBlocks = chapterDict["blocks"] as? [[String: Any]] else { continue }
        let part = chapterDict["part"] as? Int ?? 0

        // Normalize each block to canonical keys expected by TextbookBlock decoder.
        var exercises = 0
        var vocabTables = 0
        var storedBlocks: [[String: Any]] = []
        for (position, block) in rawBlocks.enumerated() {
            let kind = block["kind"] as? String ?? ""
            if kind == "exercise" {
                exercises += 1
            } else if kind == "vocab_table" {
                vocabTables += 1
            }
            var stored: [String: Any] = ["index": position, "kind": kind]
            for (sourceKey, storedKey) in copiedFields[kind] ?? [] {
                if let value = block[sourceKey] {
                    stored[storedKey] = value
                }
            }
            storedBlocks.append(stored)
        }

        let bodyJSON: Data
        do {
            bodyJSON = try JSONSerialization.data(withJSONObject: storedBlocks, options: [])
        } catch {
            print("[DataLoader] failed to encode chapter \(number) blocks: \(error)")
            continue
        }

        context.insert(
            TextbookChapter(
                id: id,
                number: number,
                title: title,
                part: part,
                courseName: courseName,
                bodyJSON: bodyJSON,
                exerciseCount: exercises,
                vocabTableCount: vocabTables
            )
        )
        chaptersInserted += 1
    }
    try? context.save()

    // Seed textbook-derived vocabulary flashcards as CourseDecks so the
    // existing Course UI can surface them alongside LanGo decks.
    seedTextbookVocabDecks(context: context, courseName: courseName)
    print("Textbook seeding complete: \(chaptersInserted) chapters")
}
/// Inserts one `CourseDeck` per chapter of the bundled `textbook_vocab.json`,
/// together with a `VocabCard` for every card entry that has both a front
/// and a back.
///
/// Returns silently when the file is missing or does not have the expected
/// top-level `chapters` array. Chapters without a number or with no cards
/// are skipped.
///
/// - Parameters:
///   - context: Context that receives the inserted models.
///   - courseName: Course name stored on each deck and used to build deck ids.
private static func seedTextbookVocabDecks(context: ModelContext, courseName: String) {
    let bundle = Bundle.main
    let fileURL = bundle.url(forResource: "textbook_vocab", withExtension: "json")
        ?? bundle.bundleURL.appendingPathComponent("textbook_vocab.json")
    guard let raw = try? Data(contentsOf: fileURL) else { return }
    guard let root = try? JSONSerialization.jsonObject(with: raw) as? [String: Any] else { return }
    guard let chapterList = root["chapters"] as? [[String: Any]] else { return }

    let slug = courseName.lowercased().replacingOccurrences(of: " ", with: "-")
    var decksInserted = 0
    var cardsInserted = 0

    for chapter in chapterList {
        guard let number = chapter["chapter"] as? Int,
              let entries = chapter["cards"] as? [[String: Any]],
              !entries.isEmpty else { continue }

        let deckId = "textbook_\(slug)_ch\(number)"
        let deck = CourseDeck(
            id: deckId,
            weekNumber: number,
            title: "Chapter \(number) vocabulary",
            cardCount: entries.count,
            courseName: courseName,
            isReversed: false
        )
        context.insert(deck)
        decksInserted += 1

        for entry in entries {
            guard let front = entry["front"] as? String,
                  let back = entry["back"] as? String else { continue }
            let card = VocabCard(front: front, back: back, deckId: deckId)
            card.deck = deck
            context.insert(card)
            cardsInserted += 1
        }
    }

    try? context.save()
    print("Textbook vocab seeding complete: \(decksInserted) decks, \(cardsInserted) cards")
}
}