Files
Spanish/Conjuga/Conjuga/Services/DataLoader.swift
T
Trey T f368c24ad6 Fixes #32 — LLM vision pass for vocab pairs, fixes scrambled English/Spanish
The bbox-OCR pipeline mis-paired ~114 vocab tables across the book — the
chapter 7 "Other Idioms" image (issue #32) being the most visible.
Three failure modes were collapsing the data:
  1) classifier blind to subject pronouns ("yo", "I", etc.)
  2) right-then-left OCR reads on 2-col tables
  3) Y-cluster drift on multi-line cells in 4-col layouts

Replaced the entire vocab-extraction tier with a Claude vision pass over
all 931 vocab images. Output is keyed by image with three classifications:
  - pair_table       (extract all Spanish↔English pairs)
  - reference_only   (Spanish-only conjugation tables — no pairs, UI shows
                      the flat OCR lines as a reference list instead)
  - hybrid           (some header pairs + reference content beneath; only
                      the genuine pairs become cards)

merge_pdf_into_book.py now picks pair source by priority:
  llm-vision → bounding-box OCR → block-alternation heuristic.

Numbers (across the whole book):
  - mis-oriented tables: 114 → 5
  - quarantined cards:   250 → 2
  - extracted pairs:     2832 → 4569

textbookDataVersion bumped to 13. Per-batch agent outputs gitignored
under Conjuga/Scripts/textbook/paired_vocab_llm/ — only the merged
paired_vocab_llm.json (also gitignored) is needed to rebuild.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 18:48:04 -05:00

570 lines
24 KiB
Swift

import SwiftData
import SharedModels
import Foundation
actor DataLoader {
/// Current course data version. Compared against the integer stored in
/// `UserDefaults` under `courseDataKey` to decide whether a re-seed is needed.
static let courseDataVersion = 7
/// `UserDefaults` key holding the course data version that was last seeded.
static let courseDataKey = "courseDataVersion"
/// Current textbook data version. Compared against the integer stored in
/// `UserDefaults` under `textbookDataKey` to decide whether a re-seed is needed.
static let textbookDataVersion = 13
/// `UserDefaults` key holding the textbook data version that was last seeded.
static let textbookDataKey = "textbookDataVersion"
/// Reports whether any seeding or refresh work is pending.
///
/// Returns `true` when the store holds no verbs (a failed count is treated
/// as zero), or when either stored data version in `UserDefaults` is older
/// than the corresponding version constant.
///
/// - Parameter container: Container whose store is inspected.
/// - Returns: `true` if seeding or a course/textbook refresh is required.
static func needsSeeding(container: ModelContainer) async -> Bool {
    let store = ModelContext(container)
    let existingVerbs = (try? store.fetchCount(FetchDescriptor<Verb>())) ?? 0
    guard existingVerbs != 0 else { return true }

    let defaults = UserDefaults.standard
    return defaults.integer(forKey: courseDataKey) < courseDataVersion
        || defaults.integer(forKey: textbookDataKey) < textbookDataVersion
}
/// Seeds an empty store from the bundled `conjuga_data.json`.
///
/// Does nothing when at least one `Verb` already exists. Otherwise inserts
/// tense guides, verbs, verb forms and irregular spans, then seeds course
/// data and textbook data. Returns early (after logging) when the bundled
/// JSON cannot be read or parsed.
///
/// - Parameter container: Container whose store receives the seeded rows.
static func seedIfNeeded(container: ModelContainer) async {
    let context = ModelContext(container)

    // Saves pending changes, logging a failure instead of discarding it.
    func saveLoggingFailure(_ stage: String) {
        do {
            try context.save()
        } catch {
            print("[DataLoader] ⚠️ save failed (\(stage)): \(error)")
        }
    }

    let count: Int
    do {
        count = try context.fetchCount(FetchDescriptor<Verb>())
        print("[DataLoader] seedIfNeeded: existing verb count = \(count)")
    } catch {
        // A failed count is treated as an empty store so seeding still runs.
        print("[DataLoader] ⚠️ seedIfNeeded fetchCount threw: \(error)")
        count = 0
    }
    if count > 0 { return }

    print("Seeding database...")

    // Try direct bundle lookup first, then the "Resources" subdirectory,
    // then a raw path inside the bundle.
    let url = Bundle.main.url(forResource: "conjuga_data", withExtension: "json")
        ?? Bundle.main.url(forResource: "conjuga_data", withExtension: "json", subdirectory: "Resources")
        ?? Bundle.main.bundleURL.appendingPathComponent("Resources/conjuga_data.json")
    guard let data = try? Data(contentsOf: url) else {
        print("ERROR: Could not load conjuga_data.json from bundle at \(url)")
        return
    }
    guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
        print("ERROR: Could not parse conjuga_data")
        return
    }

    // Seed tense guides. Entries missing a required field are skipped.
    if let guides = json["tenseGuides"] as? [[String: Any]] {
        for g in guides {
            guard let tenseId = g["tenseId"] as? String,
                  let title = g["title"] as? String,
                  let body = g["body"] as? String else { continue }
            context.insert(TenseGuide(tenseId: tenseId, title: title, body: body))
        }
    }

    // Seed verbs. Entries missing a required field are skipped.
    if let verbs = json["verbs"] as? [[String: Any]] {
        for v in verbs {
            guard let id = v["id"] as? Int,
                  let infinitive = v["infinitive"] as? String,
                  let english = v["english"] as? String,
                  let rank = v["rank"] as? Int,
                  let ending = v["ending"] as? String,
                  let reflexive = v["reflexive"] as? Int,
                  let level = v["level"] as? String else { continue }
            context.insert(Verb(
                id: id,
                infinitive: infinitive,
                english: english,
                rank: rank,
                ending: ending,
                reflexive: reflexive,
                level: level
            ))
        }
        print("Inserted \(verbs.count) verbs")
    }
    saveLoggingFailure("tense guides + verbs")

    // Seed verb forms — bulk insert, no relationship assignment (use verbId
    // for queries). Rows are saved one chunk at a time inside an autorelease pool.
    let chunkSize = 20000
    if let forms = json["verbForms"] as? [[String: Any]] {
        for chunkStart in stride(from: 0, to: forms.count, by: chunkSize) {
            autoreleasepool {
                let chunkEnd = min(chunkStart + chunkSize, forms.count)
                for f in forms[chunkStart..<chunkEnd] {
                    guard let verbId = f["verbId"] as? Int,
                          let tenseId = f["tenseId"] as? String,
                          let personIndex = f["personIndex"] as? Int,
                          let form = f["form"] as? String,
                          let regularity = f["regularity"] as? String else { continue }
                    context.insert(VerbForm(
                        verbId: verbId,
                        tenseId: tenseId,
                        personIndex: personIndex,
                        form: form,
                        regularity: regularity
                    ))
                }
                saveLoggingFailure("verb forms chunk starting at \(chunkStart)")
            }
        }
        print("Inserted \(forms.count) verb forms")
    }

    // Seed irregular spans — bulk insert, chunked the same way as verb forms.
    if let spans = json["irregularSpans"] as? [[String: Any]] {
        for chunkStart in stride(from: 0, to: spans.count, by: chunkSize) {
            autoreleasepool {
                let chunkEnd = min(chunkStart + chunkSize, spans.count)
                for s in spans[chunkStart..<chunkEnd] {
                    // The span bounds get their own names so they cannot be
                    // confused with the chunk bounds above.
                    guard let verbId = s["verbId"] as? Int,
                          let tenseId = s["tenseId"] as? String,
                          let personIndex = s["personIndex"] as? Int,
                          let spanType = s["type"] as? Int,
                          let pattern = s["pattern"] as? Int,
                          let spanStart = s["start"] as? Int,
                          let spanEnd = s["end"] as? Int else { continue }
                    context.insert(IrregularSpan(
                        verbId: verbId,
                        tenseId: tenseId,
                        personIndex: personIndex,
                        spanType: spanType,
                        pattern: pattern,
                        start: spanStart,
                        end: spanEnd
                    ))
                }
                saveLoggingFailure("irregular spans chunk starting at \(chunkStart)")
            }
        }
        print("Inserted \(spans.count) irregular spans")
    }

    do {
        try context.save()
    } catch {
        print("[DataLoader] 🔥 Final verb save error: \(error)")
    }
    print("Verb seeding complete")

    // Seed course data through the same context used for the verb seed.
    seedCourseData(context: context)

    // Seed textbook data — only bump the version key if the seed actually
    // inserted rows, so a missing/unparseable bundle doesn't permanently
    // lock us out of future re-seeds.
    if seedTextbookData(context: context) {
        UserDefaults.standard.set(textbookDataVersion, forKey: textbookDataKey)
    }
}
/// Re-seeds textbook data when the stored version is outdated OR when no
/// chapter rows exist in the store.
///
/// The row-count check exists because anything opening this store with a
/// subset schema (e.g. an out-of-date widget extension) can destructively
/// drop the rows without touching UserDefaults — so a pure version-flag
/// trigger would leave us permanently empty.
///
/// The version key is written only after a successful re-seed, so a failed
/// attempt is retried on the next launch.
///
/// - Parameter container: Container whose store is wiped and re-seeded.
static func refreshTextbookDataIfNeeded(container: ModelContainer) async {
    let defaults = UserDefaults.standard
    let store = ModelContext(container)
    let existingCount = (try? store.fetchCount(FetchDescriptor<TextbookChapter>())) ?? 0
    let isVersionCurrent = defaults.integer(forKey: textbookDataKey) >= textbookDataVersion

    // Nothing to do when the version is current and rows are present.
    guard !isVersionCurrent || existingCount <= 0 else { return }

    let reason = isVersionCurrent
        ? "Textbook data version current but store has \(existingCount) chapters — re-seeding..."
        : "Textbook data version outdated — re-seeding..."
    print(reason)

    // Fetch + delete individually instead of batch delete. SwiftData's
    // context.delete(model:) hits the store directly and doesn't always
    // clear the unique-constraint index before the reseed's save runs,
    // so re-inserting rows with the same .unique id can throw.
    let staleChapters = (try? store.fetch(FetchDescriptor<TextbookChapter>())) ?? []
    staleChapters.forEach { store.delete($0) }

    let textbookCourseName = "Complete Spanish Step-by-Step"
    let textbookDecks = FetchDescriptor<CourseDeck>(
        predicate: #Predicate<CourseDeck> { $0.courseName == textbookCourseName }
    )
    let staleDecks = (try? store.fetch(textbookDecks)) ?? []
    staleDecks.forEach { store.delete($0) }

    do {
        try store.save()
    } catch {
        print("[DataLoader] ERROR: textbook wipe save failed: \(error)")
        return
    }

    guard seedTextbookData(context: store) else {
        print("Textbook re-seed failed — leaving version key untouched so next launch retries")
        return
    }
    defaults.set(textbookDataVersion, forKey: textbookDataKey)
    print("Textbook data re-seeded to version \(textbookDataVersion)")
}
/// Re-seed course data if the version has changed (e.g. examples were added).
/// Call this on every launch — it checks a version key and only re-seeds when needed.
///
/// Wipes all `VocabCard`, `CourseDeck` and `TenseGuide` rows, reloads tense
/// guides from the bundled `conjuga_data.json`, re-seeds course data and the
/// textbook vocab decks, then stores `courseDataVersion` under `courseDataKey`.
/// If the wipe fails, the function logs and returns without re-seeding or
/// touching the version key, so the next launch retries.
///
/// - Parameter container: Container whose store is wiped and re-seeded.
static func refreshCourseDataIfNeeded(container: ModelContainer) async {
    let shared = UserDefaults.standard
    if shared.integer(forKey: courseDataKey) >= courseDataVersion { return }

    print("Course data version outdated — re-seeding...")
    let context = ModelContext(container)

    // Delete existing course data + tense guides so they can be re-seeded
    // with updated bodies from the bundled conjuga_data.json. Re-inserting on
    // top of a failed wipe would stack new rows onto the old ones, so abort.
    do {
        try context.delete(model: VocabCard.self)
        try context.delete(model: CourseDeck.self)
        try context.delete(model: TenseGuide.self)
        try context.save()
    } catch {
        print("[DataLoader] ERROR: course data wipe failed: \(error)")
        return
    }

    // Re-seed tense guides from the bundled JSON. Use the same lookup chain
    // as seedIfNeeded so a file that is only reachable through a fallback
    // path does not leave the guides wiped.
    let url = Bundle.main.url(forResource: "conjuga_data", withExtension: "json")
        ?? Bundle.main.url(forResource: "conjuga_data", withExtension: "json", subdirectory: "Resources")
        ?? Bundle.main.bundleURL.appendingPathComponent("Resources/conjuga_data.json")
    if let data = try? Data(contentsOf: url),
       let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
       let guides = json["tenseGuides"] as? [[String: Any]] {
        for g in guides {
            guard let tenseId = g["tenseId"] as? String,
                  let title = g["title"] as? String,
                  let body = g["body"] as? String else { continue }
            context.insert(TenseGuide(tenseId: tenseId, title: title, body: body))
        }
        do {
            try context.save()
            print("Re-seeded \(guides.count) tense guides")
        } catch {
            print("[DataLoader] ERROR: tense guide re-seed save failed: \(error)")
        }
    } else {
        print("[DataLoader] ERROR: could not reload tense guides from conjuga_data.json at \(url)")
    }

    // Re-seed course data
    seedCourseData(context: context)

    // Textbook's vocab decks/cards share the same CourseDeck/VocabCard
    // entities, so they were just wiped above. Reseed them.
    seedTextbookVocabDecks(context: context, courseName: "Complete Spanish Step-by-Step")

    shared.set(courseDataVersion, forKey: courseDataKey)
    print("Course data re-seeded to version \(courseDataVersion)")
}
/// Copies legacy review progress stored on local `VocabCard` rows onto
/// `CourseReviewCard` rows in the cloud container.
///
/// Runs at most once per `migrationVersion`: the version is recorded in
/// `UserDefaults` only after the local fetch and the cloud save (when there
/// is anything to save) have both succeeded. A failure is logged and leaves
/// the key untouched so the migration is retried on the next call.
///
/// - Parameters:
///   - localContainer: Container holding the `VocabCard` rows to read from.
///   - cloudContainer: Container receiving the `CourseReviewCard` rows.
static func migrateCourseProgressIfNeeded(
    localContainer: ModelContainer,
    cloudContainer: ModelContainer
) async {
    let migrationVersion = 2
    let key = "courseProgressMigrationVersion"
    let shared = UserDefaults.standard
    if shared.integer(forKey: key) >= migrationVersion { return }

    let localContext = ModelContext(localContainer)
    let cloudContext = ModelContext(cloudContainer)

    // A failed fetch must not be mistaken for "nothing to migrate".
    let allCards: [VocabCard]
    do {
        allCards = try localContext.fetch(FetchDescriptor<VocabCard>())
    } catch {
        print("[DataLoader] ERROR: course progress migration fetch failed: \(error)")
        return
    }

    var migratedCount = 0
    for card in allCards where hasLegacyCourseProgress(card) {
        let reviewKey = CourseCardStore.reviewKey(for: card)
        let reviewCard = findOrCreateCourseReviewCard(
            id: reviewKey,
            deckId: card.deckId,
            front: card.front,
            back: card.back,
            context: cloudContext
        )

        // Keep the cloud copy when it was reviewed at or after the legacy one.
        if let reviewDate = reviewCard.lastReviewDate,
           let legacyDate = card.lastReviewDate,
           reviewDate >= legacyDate {
            continue
        }

        reviewCard.easeFactor = card.easeFactor
        reviewCard.interval = card.interval
        reviewCard.repetitions = card.repetitions
        reviewCard.dueDate = card.dueDate
        reviewCard.lastReviewDate = card.lastReviewDate
        migratedCount += 1
    }

    if migratedCount > 0 {
        do {
            try cloudContext.save()
        } catch {
            // Do not mark the migration done: the copied progress was not persisted.
            print("[DataLoader] ERROR: course progress migration save failed: \(error)")
            return
        }
        print("Migrated \(migratedCount) course progress cards to cloud store")
    }

    shared.set(migrationVersion, forKey: key)
}
/// Inserts course decks and their vocab cards from the bundled `course_data.json`.
///
/// Accepts both layouts: `{"courses": [...]}` (new) and a single course
/// object `{"course": "...", "weeks": [...]}` (old). Courses, weeks, decks
/// and cards missing a required field are skipped. The context is saved
/// after each week; a failed save is logged and seeding continues.
///
/// - Parameter context: Context that receives the inserted rows.
private static func seedCourseData(context: ModelContext) {
    let url = Bundle.main.url(forResource: "course_data", withExtension: "json")
        ?? Bundle.main.bundleURL.appendingPathComponent("course_data.json")
    guard let data = try? Data(contentsOf: url) else {
        print("No course_data.json found — skipping course seeding")
        return
    }
    guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
        print("ERROR: Could not parse course_data.json")
        return
    }

    // Support both formats: {"courses": [...]} (new) and {"course": "...", "weeks": [...]} (old)
    var courseList: [[String: Any]] = []
    if let courses = json["courses"] as? [[String: Any]] {
        courseList = courses
    } else if json["weeks"] != nil {
        courseList = [json]
    }

    var deckCount = 0
    var cardCount = 0

    for courseData in courseList {
        guard let weeks = courseData["weeks"] as? [[String: Any]],
              let courseName = courseData["course"] as? String else { continue }

        // Slug used as the prefix of every deck id for this course.
        let courseSlug = courseName.lowercased()
            .replacingOccurrences(of: " ", with: "-")
            .replacingOccurrences(of: "|", with: "")

        for weekData in weeks {
            guard let weekNum = weekData["week"] as? Int,
                  let decks = weekData["decks"] as? [[String: Any]] else { continue }

            for (deckIndex, deckData) in decks.enumerated() {
                guard let title = deckData["title"] as? String,
                      let cards = deckData["cards"] as? [Any] else { continue }

                let isReversed = (deckData["isReversed"] as? Bool) ?? false
                let deckId = "\(courseSlug)_w\(weekNum)_\(deckIndex)_\(isReversed ? "rev" : "fwd")"
                let deck = CourseDeck(
                    id: deckId,
                    weekNumber: weekNum,
                    title: title,
                    cardCount: cards.count,
                    courseName: courseName,
                    isReversed: isReversed
                )
                context.insert(deck)
                deckCount += 1

                for rawCard in cards {
                    guard let cardDict = rawCard as? [String: Any],
                          let front = cardDict["front"] as? String,
                          let back = cardDict["back"] as? String else { continue }

                    // Parse example sentences. Values are read one by one so a
                    // single non-string field (e.g. JSON null) only defaults
                    // that field instead of discarding every example on the card.
                    var exES: [String] = []
                    var exEN: [String] = []
                    var exBlanks: [String] = []
                    if let examples = cardDict["examples"] as? [[String: Any]] {
                        for ex in examples {
                            guard let es = ex["es"] as? String else { continue }
                            exES.append(es)
                            exEN.append((ex["en"] as? String) ?? "")
                            exBlanks.append((ex["blank"] as? String) ?? "")
                        }
                    }

                    let card = VocabCard(
                        front: front,
                        back: back,
                        deckId: deckId,
                        examplesES: exES,
                        examplesEN: exEN,
                        examplesBlanks: exBlanks
                    )
                    card.deck = deck
                    context.insert(card)
                    cardCount += 1
                }
            }

            do {
                try context.save()
            } catch {
                print("[DataLoader] ERROR: course data save failed for \(courseName) week \(weekNum): \(error)")
            }
        }
    }
    print("Course seeding complete: \(deckCount) decks, \(cardCount) cards")
}
/// Tells whether a vocab card carries review progress worth migrating.
///
/// A card qualifies when it has any repetitions, a positive interval, an
/// ease factor differing from 2.5 by more than 0.0001, or a recorded last
/// review date.
/// NOTE(review): 2.5 is presumably the initial ease factor — confirm against `VocabCard`.
///
/// - Parameter card: Card to inspect.
/// - Returns: `true` if any of the progress fields is set.
private static func hasLegacyCourseProgress(_ card: VocabCard) -> Bool {
    if card.repetitions > 0 { return true }
    if card.interval > 0 { return true }
    if abs(card.easeFactor - 2.5) > 0.0001 { return true }
    return card.lastReviewDate != nil
}
/// Returns the `CourseReviewCard` with the given id, inserting a new one
/// into `context` when the lookup finds nothing (a failed fetch counts as
/// "not found").
///
/// - Parameters:
///   - id: Identifier to look up, also used for a newly created card.
///   - deckId: Deck identifier for a newly created card.
///   - front: Front text for a newly created card.
///   - back: Back text for a newly created card.
///   - context: Context to search and, if needed, insert into.
/// - Returns: The existing card, or the freshly inserted one.
private static func findOrCreateCourseReviewCard(
    id: String,
    deckId: String,
    front: String,
    back: String,
    context: ModelContext
) -> CourseReviewCard {
    let matchesID = #Predicate<CourseReviewCard> { $0.id == id }
    let lookup = FetchDescriptor<CourseReviewCard>(predicate: matchesID)

    guard let found = (try? context.fetch(lookup))?.first else {
        let created = CourseReviewCard(id: id, deckId: deckId, front: front, back: back)
        context.insert(created)
        return created
    }
    return found
}
// MARK: - Textbook seeding
/// Inserts textbook chapters from the bundled `textbook_data.json`, then
/// seeds the textbook vocab decks.
///
/// Each chapter's blocks are rewritten to the canonical keys expected by the
/// TextbookBlock decoder and stored as JSON in the chapter's `bodyJSON`.
/// Chapters missing a required field, or whose blocks cannot be encoded,
/// are skipped.
///
/// - Parameter context: Context that receives the inserted rows.
/// - Returns: `true` when the save succeeded and at least one chapter is
///   present in the store afterwards; `false` when the file is missing or
///   unparseable, has no `chapters`, the save throws, or nothing persisted.
@discardableResult
private static func seedTextbookData(context: ModelContext) -> Bool {
    let url = Bundle.main.url(forResource: "textbook_data", withExtension: "json")
        ?? Bundle.main.bundleURL.appendingPathComponent("textbook_data.json")
    guard let data = try? Data(contentsOf: url) else {
        print("[DataLoader] textbook_data.json not bundled — skipping textbook seed")
        return false
    }
    guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
        print("[DataLoader] ERROR: Could not parse textbook_data.json")
        return false
    }

    let courseName = (json["courseName"] as? String) ?? "Textbook"
    guard let chapters = json["chapters"] as? [[String: Any]] else {
        print("[DataLoader] ERROR: textbook_data.json missing chapters")
        return false
    }

    var inserted = 0
    for chapterJSON in chapters {
        guard let id = chapterJSON["id"] as? String,
              let number = chapterJSON["number"] as? Int,
              let title = chapterJSON["title"] as? String,
              let rawBlocks = chapterJSON["blocks"] as? [[String: Any]] else { continue }
        let part = (chapterJSON["part"] as? Int) ?? 0

        // Canonical form of every block, plus per-kind tallies for the chapter row.
        var canonicalBlocks: [[String: Any]] = []
        var exerciseCount = 0
        var vocabTableCount = 0

        for (position, rawBlock) in rawBlocks.enumerated() {
            let kind = (rawBlock["kind"] as? String) ?? ""
            var canonical: [String: Any] = ["index": position, "kind": kind]

            // Copies a value across when present, optionally under a renamed key.
            func carry(_ sourceKey: String, as targetKey: String? = nil) {
                if let value = rawBlock[sourceKey] {
                    canonical[targetKey ?? sourceKey] = value
                }
            }

            switch kind {
            case "heading":
                carry("level")
                carry("text")
            case "paragraph":
                carry("text")
            case "vocab_table":
                vocabTableCount += 1
                carry("sourceImage")
                carry("ocrLines")
                carry("ocrConfidence")
                // Paired vocab cards attached to the table. Only entries with
                // both a string front and a string back are kept, and the key
                // is omitted when none qualify.
                let rawCards = (rawBlock["cards"] as? [[String: Any]]) ?? []
                let pairs: [[String: Any]] = rawCards.compactMap { entry -> [String: Any]? in
                    guard let front = entry["front"] as? String,
                          let back = entry["back"] as? String else { return nil }
                    return ["front": front, "back": back]
                }
                if !pairs.isEmpty {
                    canonical["cards"] = pairs
                }
            case "exercise":
                exerciseCount += 1
                carry("id", as: "exerciseId")
                carry("instruction")
                carry("extra")
                carry("prompts")
                carry("answerItems")
                carry("freeform")
            default:
                // "key_vocab_header" and unrecognized kinds keep only index + kind.
                break
            }
            canonicalBlocks.append(canonical)
        }

        let bodyJSON: Data
        do {
            bodyJSON = try JSONSerialization.data(withJSONObject: canonicalBlocks, options: [])
        } catch {
            print("[DataLoader] failed to encode chapter \(number) blocks: \(error)")
            continue
        }

        context.insert(TextbookChapter(
            id: id,
            number: number,
            title: title,
            part: part,
            courseName: courseName,
            bodyJSON: bodyJSON,
            exerciseCount: exerciseCount,
            vocabTableCount: vocabTableCount
        ))
        inserted += 1
    }

    do {
        try context.save()
    } catch {
        print("[DataLoader] ERROR: textbook chapter save failed: \(error)")
        return false
    }

    // Verify rows actually hit the store — guards against the case where
    // save returned cleanly but no rows were persisted.
    let persisted = (try? context.fetchCount(FetchDescriptor<TextbookChapter>())) ?? 0
    if persisted <= 0 {
        print("[DataLoader] ERROR: textbook seeded \(inserted) chapters but persisted count is 0")
        return false
    }

    // Seed textbook-derived vocabulary flashcards as CourseDecks so the
    // existing Course UI can surface them alongside LanGo decks.
    seedTextbookVocabDecks(context: context, courseName: courseName)

    print("Textbook seeding complete: \(inserted) chapters inserted, \(persisted) persisted")
    return true
}
/// Inserts one `CourseDeck` per textbook chapter, with its `VocabCard`s,
/// from the bundled `textbook_vocab.json`.
///
/// Chapters without a chapter number or with no cards are skipped, as are
/// cards missing a string `front` or `back`. A missing or unparseable file
/// and a failed save are both logged.
///
/// - Parameters:
///   - context: Context that receives the inserted rows.
///   - courseName: Course name stored on each deck and slugged into deck ids.
private static func seedTextbookVocabDecks(context: ModelContext, courseName: String) {
    let url = Bundle.main.url(forResource: "textbook_vocab", withExtension: "json")
        ?? Bundle.main.bundleURL.appendingPathComponent("textbook_vocab.json")
    guard let data = try? Data(contentsOf: url),
          let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
          let chaptersArr = json["chapters"] as? [[String: Any]]
    else {
        print("[DataLoader] textbook_vocab.json missing or unparseable — skipping textbook vocab decks")
        return
    }

    let courseSlug = courseName.lowercased()
        .replacingOccurrences(of: " ", with: "-")

    var deckCount = 0
    var cardCount = 0

    for chData in chaptersArr {
        guard let chNum = chData["chapter"] as? Int,
              let cards = chData["cards"] as? [[String: Any]],
              !cards.isEmpty else { continue }

        let deckId = "textbook_\(courseSlug)_ch\(chNum)"
        let title = "Chapter \(chNum) vocabulary"
        let deck = CourseDeck(
            id: deckId,
            weekNumber: chNum,
            title: title,
            cardCount: cards.count,
            courseName: courseName,
            isReversed: false
        )
        context.insert(deck)
        deckCount += 1

        for c in cards {
            guard let front = c["front"] as? String,
                  let back = c["back"] as? String else { continue }
            let card = VocabCard(front: front, back: back, deckId: deckId)
            card.deck = deck
            context.insert(card)
            cardCount += 1
        }
    }

    do {
        try context.save()
    } catch {
        // Report the failure rather than claiming the seed completed.
        print("[DataLoader] ERROR: textbook vocab save failed: \(error)")
        return
    }
    print("Textbook vocab seeding complete: \(deckCount) decks, \(cardCount) cards")
}
}