diff --git a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index 2c29478..8d8746b 100644 --- a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -7,7 +7,7 @@ "location" : "https://github.com/FluidInference/FluidAudio", "state" : { "branch" : "main", - "revision" : "052cbb27cf073a9407251d74ef3459ea258e41b3" + "revision" : "328036d255ef76b8d661eacc16ac108eb45f9218" } }, { diff --git a/VoiceInk/Services/ParakeetTranscriptionService.swift b/VoiceInk/Services/ParakeetTranscriptionService.swift index 1d2f7ea..59c60b5 100644 --- a/VoiceInk/Services/ParakeetTranscriptionService.swift +++ b/VoiceInk/Services/ParakeetTranscriptionService.swift @@ -1,4 +1,5 @@ import Foundation +import CoreML import AVFoundation import FluidAudio import os.log @@ -7,15 +8,13 @@ import os.log class ParakeetTranscriptionService: TranscriptionService { private var asrManager: AsrManager? + private var vadManager: VadManager? private let customModelsDirectory: URL? @Published var isModelLoaded = false - - // Logger for Parakeet transcription service private let logger = Logger(subsystem: "com.voiceink.app", category: "ParakeetTranscriptionService") init(customModelsDirectory: URL? = nil) { self.customModelsDirectory = customModelsDirectory - logger.notice("🦜 ParakeetTranscriptionService initialized with directory: \(customModelsDirectory?.path ?? "default")") } func loadModel() async throws { @@ -23,39 +22,16 @@ class ParakeetTranscriptionService: TranscriptionService { return } - logger.notice("🦜 Starting Parakeet model loading") - - do { - - asrManager = AsrManager(config: .default) - let models: AsrModels - if let customDirectory = customModelsDirectory { - logger.notice("🦜 Loading models from custom directory: \(customDirectory.path)") - models = try await AsrModels.downloadAndLoad(to: customDirectory) - } else { - logger.notice("🦜 Loading models from default directory") - models = try await AsrModels.downloadAndLoad() + if let customModelsDirectory { + do { + asrManager = AsrManager(config: .default) + let models = try await AsrModels.load(from: customModelsDirectory) + try await asrManager?.initialize(models: models) + isModelLoaded = true + } catch { + isModelLoaded = false + asrManager = nil } - - try await asrManager?.initialize(models: models) - isModelLoaded = true - logger.notice("🦜 Parakeet model loaded successfully") - - } catch let error as ASRError { - logger.notice("🦜 Parakeet-specific error loading model: \(error.localizedDescription)") - isModelLoaded = false - asrManager = nil - throw error - } catch let error as AsrModelsError { - logger.notice("🦜 Parakeet model management error loading model: \(error.localizedDescription)") - isModelLoaded = false - asrManager = nil - throw error - } catch { - logger.notice("🦜 Unexpected error loading Parakeet model: \(error.localizedDescription)") - isModelLoaded = false - asrManager = nil - throw error } } @@ -64,81 +40,57 @@ class ParakeetTranscriptionService: TranscriptionService { try await loadModel() } - guard let asrManager = asrManager else { - logger.notice("🦜 Parakeet manager is still nil after attempting to load the model.") - throw ASRError.notInitialized - } + guard let asrManager = asrManager else { + throw ASRError.notInitialized + } let audioSamples = try readAudioSamples(from: audioURL) - - // Validate audio data before VAD - guard !audioSamples.isEmpty else { - logger.notice("🦜 Audio is empty, skipping transcription.") - throw ASRError.invalidAudioData - } - // Use VAD to get speech segments - var speechAudio: [Float] = [] + let durationSeconds = Double(audioSamples.count) / 16000.0 + let isVADEnabled = UserDefaults.standard.object(forKey: "IsVADEnabled") as? Bool ?? true - if isVADEnabled { - if let modelPath = await VADModelManager.shared.getModelPath() { - if let vad = VoiceActivityDetector(modelPath: modelPath) { - let speechSegments = vad.process(audioSamples: audioSamples) - logger.notice("🦜 VAD detected \(speechSegments.count) speech segments.") + let speechAudio: [Float] + if durationSeconds < 20.0 || !isVADEnabled { + speechAudio = audioSamples + } else { + let vadConfig = VadConfig(threshold: 0.7) + if vadManager == nil, let customModelsDirectory { + do { + vadManager = try await VadManager( + config: vadConfig, + modelDirectory: customModelsDirectory.deletingLastPathComponent() + ) + } catch { + // Silent failure + } + } - let sampleRate = 16000 // Assuming 16kHz sample rate - for segment in speechSegments { - let startSample = Int(segment.start * Double(sampleRate)) - var endSample = Int(segment.end * Double(sampleRate)) - - // Cap endSample to the audio buffer size - if endSample > audioSamples.count { - endSample = audioSamples.count - } - - if startSample < endSample { - speechAudio.append(contentsOf: audioSamples[startSample..= 16000 else { - logger.notice("🦜 Audio too short for transcription after VAD: \(speechAudio.count) samples") - throw ASRError.invalidAudioData - } - + let result = try await asrManager.transcribe(speechAudio) - print(result.text) + + Task { + asrManager.cleanup() + isModelLoaded = false + logger.notice("🦜 Parakeet ASR models cleaned up from memory") + } - // Reset decoder state and cleanup after transcription to avoid blocking the transcription start - Task { - asrManager.cleanup() - isModelLoaded = false - logger.notice("🦜 Parakeet ASR models cleaned up from memory") - } - - // Check for empty results (vocabulary issue indicator) - if result.text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { - logger.notice("🦜 Warning: Empty transcription result for \(audioSamples.count) samples - possible vocabulary issue") - } - - var text = result.text + let text = result.text return text } @@ -146,12 +98,9 @@ class ParakeetTranscriptionService: TranscriptionService { private func readAudioSamples(from url: URL) throws -> [Float] { do { let data = try Data(contentsOf: url) - - // Check minimum file size for valid WAV header - guard data.count > 44 else { - logger.notice("🦜 Audio file too small (\(data.count) bytes), expected > 44 bytes") - throw ASRError.invalidAudioData - } + guard data.count > 44 else { + throw ASRError.invalidAudioData + } let floats = stride(from: 44, to: data.count, by: 2).map { return data[$0..<$0 + 2].withUnsafeBytes { @@ -161,10 +110,9 @@ class ParakeetTranscriptionService: TranscriptionService { } return floats - } catch { - logger.notice("🦜 Failed to read audio file: \(error.localizedDescription)") - throw ASRError.invalidAudioData - } + } catch { + throw ASRError.invalidAudioData + } } } diff --git a/VoiceInk/Whisper/WhisperState+Parakeet.swift b/VoiceInk/Whisper/WhisperState+Parakeet.swift index 8b9fe4b..2c19991 100644 --- a/VoiceInk/Whisper/WhisperState+Parakeet.swift +++ b/VoiceInk/Whisper/WhisperState+Parakeet.swift @@ -33,6 +33,14 @@ extension WhisperState { do { _ = try await AsrModels.downloadAndLoad(to: parakeetModelsDirectory) + + // Also download VAD model into the same parent directory as ASR models + let parentDir = parakeetModelsDirectory.deletingLastPathComponent() + _ = try await DownloadUtils.loadModels( + .vad, + modelNames: Array(ModelNames.VAD.requiredModels), + directory: parentDir + ) self.isParakeetModelDownloaded = true downloadProgress["parakeet-tdt-0.6b"] = 1.0 } catch {