#!/usr/bin/env swift
// Validate every Spanish/English word in vocab_cards.json using NSSpellChecker.
// For each flagged word, produce up to 3 candidate corrections.
//
// Usage: swift validate_vocab.swift <input.json> <output.json>

import Foundation
import AppKit

// MARK: - Command-line arguments

guard CommandLine.arguments.count >= 3 else {
    print("Usage: swift validate_vocab.swift <input.json> <output.json>")
    exit(1)
}

let inputURL = URL(fileURLWithPath: CommandLine.arguments[1])
let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])

// MARK: - Input

// The input must be a JSON object with a top-level "chapters" array of objects.
// Any read, parse, or shape failure aborts with the same message.
guard let data = try? Data(contentsOf: inputURL),
      let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
      let chapters = json["chapters"] as? [[String: Any]] else {
    print("Could not load \(inputURL.path)")
    exit(1)
}

let checker = NSSpellChecker.shared

// MARK: - Tokenizing

/// Splits `s` into maximal runs of Unicode letters (so accented Spanish
/// characters stay inside their word). Every non-letter scalar is a separator.
///
/// - Parameter s: The text to tokenize.
/// - Returns: The letter-only tokens in order of appearance; never contains
///   empty strings because `split` omits empty subsequences by default.
func tokens(_ s: String) -> [String] {
    let letters = CharacterSet.letters
    return s.unicodeScalars
        .split(whereSeparator: { !letters.contains($0) })
        .map { String(String.UnicodeScalarView($0)) }
}

// MARK: - Stopwords

// Minimal stopword sets: very common function words that are accepted
// without consulting the spell checker. Compared against the lowercased token.
let stopES: Set<String> = [
    "el", "la", "los", "las", "un", "una", "unos", "unas", "del", "al",
    "de", "a", "en", "y", "o", "que", "no", "se", "con", "por", "para",
    "lo", "le", "su", "mi", "tu", "yo", "te", "me", "es", "son",
    "está", "están",
]

let stopEN: Set<String> = [
    "the", "a", "an", "to", "of", "in", "and", "or", "is", "are",
    "was", "were", "be", "been", "my", "his", "her", "our", "their", "your",
]

// MARK: - Spell checking

/// Checks a single word with the shared `NSSpellChecker`.
///
/// Words shorter than two characters, stopwords, and words containing a
/// decimal digit are accepted without being checked.
///
/// - Parameters:
///   - w: The word to check.
///   - lang: Language identifier passed to the spell checker (the script uses
///     "es" and "en").
///   - stop: Lowercased words that are always accepted.
/// - Returns: `nil` if the word is accepted; otherwise up to 3 candidate
///   corrections (possibly an empty array when the checker has no guesses).
func checkWord(_ w: String, lang: String, stop: Set<String>) -> [String]? {
    if w.count < 2 { return nil }
    if stop.contains(w.lowercased()) { return nil }
    if w.rangeOfCharacter(from: .decimalDigits) != nil { return nil }

    let range = checker.checkSpelling(
        of: w,
        startingAt: 0,
        language: lang,
        wrap: false,
        inSpellDocumentWithTag: 0,
        wordCount: nil
    )
    // A not-found location or an empty range is treated as "no misspelling".
    if range.location == NSNotFound || range.length == 0 { return nil }

    // NSRange is expressed in UTF-16 units, hence the NSString length.
    let guesses = checker.guesses(
        forWordRange: NSRange(location: 0, length: (w as NSString).length),
        in: w,
        language: lang,
        inSpellDocumentWithTag: 0
    ) ?? []
    return Array(guesses.prefix(3))
}

// MARK: - Report model

/// One flagged word together with its candidate corrections.
struct BadWord: Encodable {
    var word: String
    var suggestions: [String]
    var side: String // "es" or "en"
}

/// One card that has at least one flagged word on either side.
struct Flag: Encodable {
    var chapter: Int
    var front: String
    var back: String
    var badFront: [BadWord]
    var badBack: [BadWord]
    var sourceImage: String
}

/// Top-level structure of the JSON report written to the output path.
struct Report: Encodable {
    var totalCards: Int
    var flaggedCards: Int
    var flaggedSpanishWords: Int
    var flaggedEnglishWords: Int
    var flags: [Flag]
}

/// Tokenizes `text` and returns a `BadWord` for every token that
/// `checkWord` flags. `lang` is used both as the spell-checker language and
/// as the `side` value of each result.
func misspelledWords(in text: String, lang: String, stop: Set<String>) -> [BadWord] {
    return tokens(text).compactMap { word -> BadWord? in
        guard let suggestions = checkWord(word, lang: lang, stop: stop) else {
            return nil
        }
        return BadWord(word: word, suggestions: suggestions, side: lang)
    }
}

// MARK: - Scan all cards

var flags: [Flag] = []
var totalCards = 0
var totalBadES = 0
var totalBadEN = 0

for ch in chapters {
    // Chapters without an integer "chapter" or a "cards" array are skipped.
    guard let chNum = ch["chapter"] as? Int,
          let cards = ch["cards"] as? [[String: Any]] else {
        continue
    }

    for card in cards {
        totalCards += 1

        // Missing or non-string fields are treated as empty text.
        let front = (card["front"] as? String) ?? ""
        let back = (card["back"] as? String) ?? ""
        let img = (card["sourceImage"] as? String) ?? ""

        // The front is checked as Spanish, the back as English.
        let badFront = misspelledWords(in: front, lang: "es", stop: stopES)
        let badBack = misspelledWords(in: back, lang: "en", stop: stopEN)
        totalBadES += badFront.count
        totalBadEN += badBack.count

        if !badFront.isEmpty || !badBack.isEmpty {
            flags.append(Flag(
                chapter: chNum,
                front: front,
                back: back,
                badFront: badFront,
                badBack: badBack,
                sourceImage: img
            ))
        }
    }
}

// MARK: - Write report

let report = Report(
    totalCards: totalCards,
    flaggedCards: flags.count,
    flaggedSpanishWords: totalBadES,
    flaggedEnglishWords: totalBadEN,
    flags: flags
)

let encoder = JSONEncoder()
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]

do {
    let data = try encoder.encode(report)
    try data.write(to: outputURL)

    // Guard the division so an input with no cards prints 0.0% instead of nan%.
    let flaggedPercent = totalCards == 0
        ? 0.0
        : Double(flags.count) / Double(totalCards) * 100.0

    print("Cards: \(totalCards)")
    print("Flagged cards: \(flags.count) (\(flaggedPercent)%)")
    print("Flagged ES words: \(totalBadES)")
    print("Flagged EN words: \(totalBadEN)")
    print("Wrote \(outputURL.path)")
} catch {
    print("Error writing output: \(error)")
    exit(1)
}