Switch audio session to .record-only, use nil tap format so the system picks a compatible format, and route through AVAudioEngine with a 4096 buffer. Avoids the mDataByteSize(0) assertion seen on some devices. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
154 lines
5.4 KiB
Swift
import Foundation
|
|
import Speech
|
|
import AVFoundation
|
|
|
|
/// Captures microphone audio and streams it to `SFSpeechRecognizer` for live
/// Spanish ("es-ES") pronunciation transcription.
///
/// Audio is routed through `AVAudioEngine` with a nil tap format so the system
/// picks a compatible format — this avoids the mDataByteSize(0) assertion seen
/// on some devices when a mismatched format is forced on the input tap.
@MainActor
@Observable
final class PronunciationService {

    /// True while the engine is capturing and a recognition task is live.
    var isRecording = false

    /// Latest (partial or final) transcription of the current recording.
    var transcript = ""

    /// Whether speech-recognition authorization has been granted.
    var isAuthorized = false

    private var recognizer: SFSpeechRecognizer?
    private var audioEngine: AVAudioEngine?
    private var request: SFSpeechAudioBufferRecognitionRequest?
    private var task: SFSpeechRecognitionTask?

    // Guards one-time recognizer creation: SFSpeechRecognizer(locale:) is not
    // free and may return nil for unsupported locales, so we resolve it once.
    private var recognizerResolved = false

    /// Requests speech-recognition authorization if it is not yet determined,
    /// updating `isAuthorized` on the main queue. No-op on the simulator.
    func requestAuthorization() {
        #if targetEnvironment(simulator)
        print("[PronunciationService] skipping speech auth on simulator")
        return
        #else
        // Check current status first to avoid an unnecessary system prompt.
        let currentStatus = SFSpeechRecognizer.authorizationStatus()
        if currentStatus == .authorized {
            isAuthorized = true
            return
        }
        if currentStatus == .denied || currentStatus == .restricted {
            isAuthorized = false
            return
        }

        // Only request if not determined yet — do it on a background queue
        // to avoid blocking the main thread, then update state on main.
        DispatchQueue.global(qos: .userInitiated).async {
            SFSpeechRecognizer.requestAuthorization { status in
                DispatchQueue.main.async { [weak self] in
                    self?.isAuthorized = (status == .authorized)
                    print("[PronunciationService] authorization status: \(status.rawValue)")
                }
            }
        }
        #endif
    }

    /// Lazily creates the es-ES recognizer exactly once (it may be nil if the
    /// locale is unsupported on this device).
    private func resolveRecognizerIfNeeded() {
        guard !recognizerResolved else { return }
        recognizerResolved = true
        recognizer = SFSpeechRecognizer(locale: Locale(identifier: "es-ES"))
    }

    /// Starts capturing microphone audio and streaming it to the recognizer.
    /// Partial transcripts are published to `transcript` as they arrive; the
    /// session tears itself down on error or when a final result is delivered.
    func startRecording() {
        guard isAuthorized else {
            print("[PronunciationService] not authorized")
            return
        }
        resolveRecognizerIfNeeded()
        guard let recognizer, recognizer.isAvailable else {
            print("[PronunciationService] recognizer unavailable")
            return
        }

        // Tear down any previous session before starting a new one.
        stopRecording()

        do {
            let audioSession = AVAudioSession.sharedInstance()
            // Record-only category with .measurement mode minimizes system
            // signal processing; .duckOthers lowers other apps' audio.
            try audioSession.setCategory(.record, mode: .measurement, options: [.duckOthers])
            try audioSession.setActive(true, options: .notifyOthersOnDeactivation)

            let request = SFSpeechAudioBufferRecognitionRequest()
            request.shouldReportPartialResults = true
            // Prefer on-device recognition when the device supports it.
            request.requiresOnDeviceRecognition = recognizer.supportsOnDeviceRecognition
            self.request = request

            let engine = AVAudioEngine()
            self.audioEngine = engine

            // Use a nil tap format — lets the system pick a compatible format
            // and avoids the mDataByteSize(0) assertion from format mismatches.
            engine.inputNode.installTap(onBus: 0, bufferSize: 4096, format: nil) { buffer, _ in
                request.append(buffer)
            }

            engine.prepare()
            try engine.start()

            transcript = ""
            isRecording = true

            task = recognizer.recognitionTask(with: request) { [weak self] result, error in
                DispatchQueue.main.async {
                    if let result {
                        self?.transcript = result.bestTranscription.formattedString
                    }
                    // Stop on any error or once the recognizer finalizes.
                    if error != nil || (result?.isFinal == true) {
                        self?.stopRecording()
                    }
                }
            }
        } catch {
            print("[PronunciationService] startRecording failed: \(error)")
            stopRecording()
        }
    }

    /// Stops capture, ends the recognition request, and releases all audio
    /// resources. Safe to call repeatedly; also invoked from the recognition
    /// callback on error/final result and defensively before each start.
    func stopRecording() {
        audioEngine?.stop()
        audioEngine?.inputNode.removeTap(onBus: 0)
        request?.endAudio()
        task?.cancel()
        task = nil
        request = nil
        audioEngine = nil
        // Release the record-only session so other apps' audio can resume
        // (deactivating an already-inactive session is a harmless no-op).
        try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
        isRecording = false
    }

    /// Compare spoken transcript against expected text, returns matched word ratio (0.0-1.0).
    ///
    /// Words are lowercased, split on whitespace, and stripped of leading/
    /// trailing punctuation. Matching is set-based: a repeated expected word
    /// counts as matched if it appears at least once anywhere in `spoken`.
    static func scoreMatch(expected: String, spoken: String) -> (score: Double, matches: [WordMatch]) {
        let expectedWords = expected.lowercased()
            .components(separatedBy: .whitespacesAndNewlines)
            .map { $0.trimmingCharacters(in: .punctuationCharacters) }
            .filter { !$0.isEmpty }

        let spokenWords = spoken.lowercased()
            .components(separatedBy: .whitespacesAndNewlines)
            .map { $0.trimmingCharacters(in: .punctuationCharacters) }
            .filter { !$0.isEmpty }

        let spokenSet = Set(spokenWords)
        var matches: [WordMatch] = []

        for word in expectedWords {
            matches.append(WordMatch(word: word, matched: spokenSet.contains(word)))
        }

        let matchCount = matches.filter(\.matched).count
        let score = expectedWords.isEmpty ? 0 : Double(matchCount) / Double(expectedWords.count)
        return (score, matches)
    }

    /// Per-word match result for UI highlighting. `id` is the word itself, so
    /// duplicate expected words share an identity — acceptable for display.
    struct WordMatch: Identifiable {
        let word: String
        let matched: Bool
        var id: String { word }
    }
}
|