Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
156
Conjuga/Scripts/textbook/validate_vocab.swift
Normal file
156
Conjuga/Scripts/textbook/validate_vocab.swift
Normal file
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env swift
|
||||
// Validate every Spanish/English word in vocab_cards.json using NSSpellChecker.
|
||||
// For each flagged word, produce up to 3 candidate corrections.
|
||||
//
|
||||
// Usage: swift validate_vocab.swift <vocab_cards.json> <output_report.json>
|
||||
|
||||
import Foundation
|
||||
import AppKit
|
||||
|
||||
// Both the input deck path and the report destination must be supplied.
if CommandLine.arguments.count < 3 {
    print("Usage: swift validate_vocab.swift <vocab_cards.json> <output_report.json>")
    exit(1)
}

let inputURL = URL(fileURLWithPath: CommandLine.arguments[1])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])

// Read the deck, parse it as a JSON object, and pull out its "chapters" array.
// Any failure along the way (unreadable file, invalid JSON, unexpected shape)
// ends the run with the same message.
guard
    let data = try? Data(contentsOf: inputURL),
    let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
    let chapters = json["chapters"] as? [[String: Any]]
else {
    print("Could not load \(inputURL.path)")
    exit(1)
}

// System spell checker shared by every `checkWord` call.
let checker = NSSpellChecker.shared
|
||||
|
||||
// Tokenize — runs of letters (Unicode aware for Spanish accents).
//
// An apostrophe (straight or typographic) that sits directly between two
// letters is kept as part of the word, so English contractions such as
// "doesn't" reach the spell checker whole instead of as the fragments
// "doesn" and "t". Apostrophes used as quotes (not between two letters) still
// act as separators, as does every other non-letter scalar.
//
// - Parameter s: The text to split.
// - Returns: The words of `s` in order of appearance; empty when `s` contains
//   no letters.
func tokens(_ s: String) -> [String] {
    let letters = CharacterSet.letters
    let apostrophes: Set<Unicode.Scalar> = ["'", "\u{2019}"]
    let scalars = Array(s.unicodeScalars)

    var words: [String] = []
    var current = String.UnicodeScalarView()

    for (index, scalar) in scalars.enumerated() {
        // Only an apostrophe flanked by letters on both sides joins a word.
        let joinsLetters = apostrophes.contains(scalar)
            && index > 0
            && index + 1 < scalars.count
            && letters.contains(scalars[index - 1])
            && letters.contains(scalars[index + 1])

        if letters.contains(scalar) || joinsLetters {
            current.append(scalar)
        } else if !current.isEmpty {
            words.append(String(current))
            current.removeAll()
        }
    }

    // Flush the word that runs to the end of the input.
    if !current.isEmpty {
        words.append(String(current))
    }
    return words
}
|
||||
|
||||
// Minimal stopword sets. Words listed here are never sent to the spell
// checker; `checkWord` compares the lowercased token against the set it is
// given.

// Spanish: articles and contractions, prepositions and conjunctions,
// pronouns and possessives, then common forms of "ser" / "estar".
let stopES: Set<String> = [
    "el", "la", "los", "las", "un", "una", "unos", "unas", "del", "al",
    "de", "a", "en", "y", "o", "que", "no", "se", "con", "por", "para",
    "lo", "le", "su", "mi", "tu", "yo", "te", "me",
    "es", "son", "está", "están",
]

// English: articles, prepositions and conjunctions, forms of "to be",
// then possessives.
let stopEN: Set<String> = [
    "the", "a", "an",
    "to", "of", "in", "and", "or",
    "is", "are", "was", "were", "be", "been",
    "my", "his", "her", "our", "their", "your",
]
|
||||
|
||||
/// Spell-checks a single word with the shared `checker`.
///
/// - Parameters:
///   - w: The word to check.
///   - lang: Language identifier handed to the spell checker (the script
///     passes "es" or "en").
///   - stop: Lowercased words that are accepted without being checked.
/// - Returns: `nil` when the word is skipped or not reported as misspelled;
///   otherwise up to three suggested corrections, which may be an empty array
///   when the checker offers no guesses.
func checkWord(_ w: String, lang: String, stop: Set<String>) -> [String]? {
    // Tokens that are never checked: single characters, stopwords, and
    // anything containing a decimal digit.
    let isTooShort = w.count < 2
    let isStopword = stop.contains(w.lowercased())
    let hasDigit = w.rangeOfCharacter(from: .decimalDigits) != nil
    guard !isTooShort, !isStopword, !hasDigit else { return nil }

    let misspelled = checker.checkSpelling(
        of: w,
        startingAt: 0,
        language: lang,
        wrap: false,
        inSpellDocumentWithTag: 0,
        wordCount: nil
    )
    // A not-found location or a zero-length range is treated as "no misspelling".
    guard misspelled.location != NSNotFound, misspelled.length != 0 else { return nil }

    // Ask for guesses over the whole word, measured in UTF-16 units as NSRange expects.
    let wholeWord = NSRange(location: 0, length: (w as NSString).length)
    guard let candidates = checker.guesses(
        forWordRange: wholeWord,
        in: w,
        language: lang,
        inSpellDocumentWithTag: 0
    ) else {
        return []
    }
    return Array(candidates.prefix(3))
}
|
||||
|
||||
/// One flagged vocabulary card, as written to the report.
struct Flag: Encodable {
    /// Chapter number the card was read from.
    var chapter: Int
    /// Card text exactly as read: `front` is checked as Spanish, `back` as English.
    var front, back: String
    /// Words flagged on the front and on the back of the card, respectively.
    var badFront, badBack: [BadWord]
    /// The card's "sourceImage" value, or an empty string when absent.
    var sourceImage: String
}
|
||||
/// A single word the spell checker flagged, with its suggested corrections.
struct BadWord: Encodable {
    /// The flagged token as it appears on the card.
    var word: String

    /// Candidate corrections from the spell checker.
    var suggestions: [String]

    /// Which language the word was checked as: "es" or "en".
    var side: String
}
|
||||
|
||||
// Running totals that feed the final report.
var flags: [Flag] = []
var totalCards = 0
var totalBadES = 0
var totalBadEN = 0

for chapter in chapters {
    // Chapters without an integer "chapter" or a "cards" array are skipped.
    guard let number = chapter["chapter"] as? Int,
          let cards = chapter["cards"] as? [[String: Any]] else { continue }
    totalCards += cards.count

    for card in cards {
        // Missing or non-string fields are treated as empty text.
        let front = card["front"] as? String ?? ""
        let back = card["back"] as? String ?? ""
        let img = card["sourceImage"] as? String ?? ""

        // Fronts are checked as Spanish, backs as English.
        let badFront: [BadWord] = tokens(front).compactMap { word in
            checkWord(word, lang: "es", stop: stopES)
                .map { BadWord(word: word, suggestions: $0, side: "es") }
        }
        let badBack: [BadWord] = tokens(back).compactMap { word in
            checkWord(word, lang: "en", stop: stopEN)
                .map { BadWord(word: word, suggestions: $0, side: "en") }
        }
        totalBadES += badFront.count
        totalBadEN += badBack.count

        // Only cards with at least one flagged word make it into the report.
        if badFront.isEmpty && badBack.isEmpty { continue }
        flags.append(Flag(
            chapter: number,
            front: front,
            back: back,
            badFront: badFront,
            badBack: badBack,
            sourceImage: img
        ))
    }
}
|
||||
|
||||
/// Top-level shape of the JSON report written by this script.
struct Report: Encodable {
    /// In order: cards examined, cards with at least one flagged word,
    /// flagged words on Spanish fronts, flagged words on English backs.
    var totalCards, flaggedCards, flaggedSpanishWords, flaggedEnglishWords: Int
    /// One entry per flagged card.
    var flags: [Flag]
}
|
||||
// Assemble the report from the totals gathered while scanning the cards.
let report = Report(
    totalCards: totalCards,
    flaggedCards: flags.count,
    flaggedSpanishWords: totalBadES,
    flaggedEnglishWords: totalBadEN,
    flags: flags
)

// Pretty-printed with sorted keys so the report is stable between runs.
let encoder = JSONEncoder()
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
do {
    let data = try encoder.encode(report)
    try data.write(to: outputURL)

    // With zero cards the ratio would be 0/0, which prints as "nan%";
    // report 0.0% for an empty deck instead.
    let flaggedPercent = totalCards > 0
        ? Double(flags.count) / Double(totalCards) * 100.0
        : 0.0

    print("Cards: \(totalCards)")
    print("Flagged cards: \(flags.count) (\(flaggedPercent)%)")
    print("Flagged ES words: \(totalBadES)")
    print("Flagged EN words: \(totalBadEN)")
    print("Wrote \(outputURL.path)")
} catch {
    // Encoding or writing failed: report the error and exit with a failure status.
    print("Error writing output: \(error)")
    exit(1)
}
|
||||
Reference in New Issue
Block a user