Files
Spanish/Conjuga/Conjuga/Services/PronunciationService.swift
Trey t f809bc2a1d Fix speech recognition crash from audio format mismatch
Switch audio session to .record-only, use nil tap format so the system
picks a compatible format, and route through AVAudioEngine with a 4096
buffer. Avoids the mDataByteSize(0) assertion seen on some devices.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:22:45 -05:00

154 lines
5.4 KiB
Swift

import Foundation
import Speech
import AVFoundation
@MainActor
@Observable
final class PronunciationService {
    // MARK: - Observable state

    /// True while the audio engine is capturing and a recognition task is live.
    var isRecording = false
    /// Latest (possibly partial) transcription of the user's speech.
    var transcript = ""
    /// Whether speech-recognition authorization has been granted.
    var isAuthorized = false

    // MARK: - Recognition plumbing

    private var recognizer: SFSpeechRecognizer?
    private var audioEngine: AVAudioEngine?
    private var request: SFSpeechAudioBufferRecognitionRequest?
    private var task: SFSpeechRecognitionTask?
    /// Guards the one-time lazy creation of `recognizer`.
    private var recognizerResolved = false

    /// Requests speech-recognition authorization and mirrors the result into
    /// `isAuthorized`. Skipped entirely on the simulator. If the status is
    /// already determined we avoid re-prompting the user.
    func requestAuthorization() {
        #if targetEnvironment(simulator)
        print("[PronunciationService] skipping speech auth on simulator")
        return
        #else
        // Check current status first to avoid an unnecessary system prompt.
        switch SFSpeechRecognizer.authorizationStatus() {
        case .authorized:
            isAuthorized = true
            return
        case .denied, .restricted:
            isAuthorized = false
            return
        case .notDetermined:
            break
        @unknown default:
            break
        }
        // Status is .notDetermined, so prompt. The framework invokes this
        // completion handler asynchronously already — the original's extra
        // DispatchQueue.global hop was unnecessary. Publish on main, since
        // `isAuthorized` is @MainActor state.
        SFSpeechRecognizer.requestAuthorization { status in
            DispatchQueue.main.async { [weak self] in
                self?.isAuthorized = (status == .authorized)
                print("[PronunciationService] authorization status: \(status.rawValue)")
            }
        }
        #endif
    }

    /// Lazily creates the Spanish (es-ES) recognizer on first use.
    /// `recognizerResolved` prevents retrying creation every call if the
    /// locale is unsupported (SFSpeechRecognizer init returns nil then).
    private func resolveRecognizerIfNeeded() {
        guard !recognizerResolved else { return }
        recognizerResolved = true
        recognizer = SFSpeechRecognizer(locale: Locale(identifier: "es-ES"))
    }

    /// Starts microphone capture and streams audio into a live recognition
    /// task, publishing partial results into `transcript` as they arrive.
    /// No-ops (with a log) when unauthorized or the recognizer is unavailable.
    func startRecording() {
        guard isAuthorized else {
            print("[PronunciationService] not authorized")
            return
        }
        resolveRecognizerIfNeeded()
        guard let recognizer, recognizer.isAvailable else {
            print("[PronunciationService] recognizer unavailable")
            return
        }
        // Tear down any previous pipeline before building a new one.
        stopRecording()
        do {
            let audioSession = AVAudioSession.sharedInstance()
            // Record-only category; .measurement minimizes system signal
            // processing, .duckOthers lowers other audio instead of stopping it.
            try audioSession.setCategory(.record, mode: .measurement, options: [.duckOthers])
            try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
            request = SFSpeechAudioBufferRecognitionRequest()
            guard let request else { return }
            request.shouldReportPartialResults = true
            // Prefer on-device recognition when supported: works offline and
            // keeps audio local.
            request.requiresOnDeviceRecognition = recognizer.supportsOnDeviceRecognition
            audioEngine = AVAudioEngine()
            guard let audioEngine else { return }
            let inputNode = audioEngine.inputNode
            // A nil tap format lets the system choose a hardware-compatible
            // format, avoiding the mDataByteSize(0) assertion caused by
            // zero-length buffers from format mismatches on some devices.
            inputNode.installTap(onBus: 0, bufferSize: 4096, format: nil) { buffer, _ in
                request.append(buffer)
            }
            audioEngine.prepare()
            try audioEngine.start()
            transcript = ""
            isRecording = true
            task = recognizer.recognitionTask(with: request) { [weak self] result, error in
                // The recognizer calls back on an arbitrary queue; hop to the
                // main queue before touching @MainActor state.
                DispatchQueue.main.async {
                    if let result {
                        self?.transcript = result.bestTranscription.formattedString
                    }
                    if error != nil || (result?.isFinal == true) {
                        self?.stopRecording()
                    }
                }
            }
        } catch {
            print("[PronunciationService] startRecording failed: \(error)")
            stopRecording()
        }
    }

    /// Stops capture, tears down the recognition pipeline, and releases the
    /// audio session. Safe to call repeatedly and when not recording.
    func stopRecording() {
        // Remember whether a session was (or may have been) activated before
        // we clear state, so we only deactivate when there is something to
        // release and don't disturb unrelated app audio.
        let hadActiveSession = isRecording || audioEngine != nil
        audioEngine?.stop()
        audioEngine?.inputNode.removeTap(onBus: 0)
        request?.endAudio()
        task?.cancel()
        task = nil
        request = nil
        audioEngine = nil
        isRecording = false
        if hadActiveSession {
            // Fix: the original never deactivated the session, leaving the
            // record-only .record category active after recording ended.
            // .notifyOthersOnDeactivation lets ducked audio resume.
            try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
        }
    }

    /// Compares a spoken transcript against the expected text.
    ///
    /// Matching is case-insensitive, punctuation-insensitive, and
    /// order-independent: an expected word counts as matched if it appears
    /// anywhere among the spoken words.
    ///
    /// - Parameters:
    ///   - expected: The text the learner was asked to say.
    ///   - spoken: The recognized transcript.
    /// - Returns: The fraction of expected words found (0.0–1.0; 0 when
    ///   `expected` is empty) and a per-word match breakdown in expected order.
    static func scoreMatch(expected: String, spoken: String) -> (score: Double, matches: [WordMatch]) {
        // Lowercase, split on whitespace, strip punctuation, drop empties.
        func normalize(_ text: String) -> [String] {
            text.lowercased()
                .components(separatedBy: .whitespacesAndNewlines)
                .map { $0.trimmingCharacters(in: .punctuationCharacters) }
                .filter { !$0.isEmpty }
        }
        let expectedWords = normalize(expected)
        let spokenSet = Set(normalize(spoken))
        let matches = expectedWords.map { WordMatch(word: $0, matched: spokenSet.contains($0)) }
        let matchCount = matches.filter(\.matched).count
        let score = expectedWords.isEmpty ? 0 : Double(matchCount) / Double(expectedWords.count)
        return (score, matches)
    }

    /// One expected word and whether it was heard in the spoken transcript.
    struct WordMatch: Identifiable {
        let word: String
        let matched: Bool
        // Fix: the original used the word itself as the id, producing
        // duplicate Identifiable ids (and broken SwiftUI ForEach diffing)
        // whenever the expected text repeats a word. A per-instance UUID is
        // always unique; since it has a default value, the memberwise
        // initializer `WordMatch(word:matched:)` is unchanged for callers.
        let id = UUID()
    }
}