From c0ed2dc78a4c6be8982a8126310082ee5ce91e3a Mon Sep 17 00:00:00 2001
From: Beingpax
Date: Sat, 6 Sep 2025 07:13:06 +0545
Subject: [PATCH] Improved VAD for Parakeet model

---
 .../ParakeetTranscriptionService.swift        | 24 ++++++++--
 VoiceInk/Services/VoiceActivityDetector.swift | 45 ++++++-------------
 2 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/VoiceInk/Services/ParakeetTranscriptionService.swift b/VoiceInk/Services/ParakeetTranscriptionService.swift
index cb39801..1d60491 100644
--- a/VoiceInk/Services/ParakeetTranscriptionService.swift
+++ b/VoiceInk/Services/ParakeetTranscriptionService.swift
@@ -78,11 +78,29 @@ class ParakeetTranscriptionService: TranscriptionService {
         }
 
         // Use VAD to get speech segments
-        let speechAudio: [Float]
+        var speechAudio: [Float] = []
         if let modelPath = await VADModelManager.shared.getModelPath() {
             if let vad = VoiceActivityDetector(modelPath: modelPath) {
-                speechAudio = vad.process(audioSamples: audioSamples)
-                logger.notice("🦜 VAD processed audio, resulting in \(speechAudio.count) samples.")
+                let speechSegments = vad.process(audioSamples: audioSamples)
+                logger.notice("🦜 VAD detected \(speechSegments.count) speech segments.")
+
+                let sampleRate = 16000 // Assuming 16kHz sample rate
+                for segment in speechSegments {
+                    let startSample = Int(segment.start * Double(sampleRate))
+                    var endSample = Int(segment.end * Double(sampleRate))
+
+                    // Cap endSample to the audio buffer size
+                    if endSample > audioSamples.count {
+                        endSample = audioSamples.count
+                    }
+
+                    if startSample < endSample {
+                        speechAudio.append(contentsOf: audioSamples[startSample..<endSample])
+                    }
+                }
diff --git a/VoiceInk/Services/VoiceActivityDetector.swift b/VoiceInk/Services/VoiceActivityDetector.swift
--- a/VoiceInk/Services/VoiceActivityDetector.swift
+++ b/VoiceInk/Services/VoiceActivityDetector.swift
-    func process(audioSamples: [Float]) -> [Float] {
+    /// Processes audio samples to detect speech segments and returns an array of (start: TimeInterval, end: TimeInterval) tuples.
+    func process(audioSamples: [Float]) -> [(start: TimeInterval, end: TimeInterval)] {
         // 1. Detect speech and get probabilities internally in the context
         let success = audioSamples.withUnsafeBufferPointer { buffer in
             whisper_vad_detect_speech(vadContext, buffer.baseAddress!, Int32(audioSamples.count))
@@ -100,10 +100,12 @@ class VoiceActivityDetector {
 
         // 2. Get segments from probabilities
         var vadParams = whisper_vad_default_params()
-        vadParams.threshold = 0.5
-        vadParams.min_speech_duration_ms = 250
-        vadParams.min_silence_duration_ms = 100
-        vadParams.speech_pad_ms = 30
+        vadParams.threshold = 0.45
+        vadParams.min_speech_duration_ms = 150
+        vadParams.min_silence_duration_ms = 750
+        vadParams.max_speech_duration_s = Float.greatestFiniteMagnitude // Use the largest representable Float value for no max duration
+        vadParams.speech_pad_ms = 100
+        vadParams.samples_overlap = 0.1 // Add samples_overlap parameter
 
         guard let segments = whisper_vad_segments_from_probs(vadContext, vadParams) else {
             logger.error("Failed to get VAD segments from probabilities.")
@@ -117,36 +119,15 @@ class VoiceActivityDetector {
 
         let nSegments = whisper_vad_segments_n_segments(segments)
         logger.notice("Detected \(nSegments) speech segments.")
 
-        // 3. Stitch audio segments together
-        var stitchedAudio = [Float]()
-        let sampleRate = 16000 // Assuming 16kHz sample rate
-
+        var speechSegments: [(start: TimeInterval, end: TimeInterval)] = []
         for i in 0..<nSegments {
-            if endSample > audioSamples.count {
-                logger.debug("Capping endSample from \(endSample, privacy: .public) to \(audioSamples.count, privacy: .public)")
-                endSample = audioSamples.count
-            }
-
-            if startSample < endSample {
-                stitchedAudio.append(contentsOf: audioSamples[startSample..<endSample])
-            }
-        if stitchedAudio.count > 0 {
-            logger.notice("Successfully stitched \(stitchedAudio.count) audio samples from speech segments.")
-        } else {
-            logger.notice("No speech segments found, returning empty audio.")
-        }
-
-        return stitchedAudio
+            speechSegments.append((start: startTime, end: endTime))
         }
 
-        whisper_vad_free_segments(segments)
-
-        return []
+        return speechSegments
     }
 
     /// Converts time in centiseconds to sample index.
-- 
2.39.5 (Apple Git-154)
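
A minimal consumer-side sketch of the changed API: VoiceActivityDetector.process(audioSamples:) now returns (start, end) TimeInterval values in seconds instead of stitched samples, so callers convert times to sample ranges themselves. The stitchSpeech helper name and the 16 kHz default below are illustrative assumptions mirroring the ParakeetTranscriptionService hunk above; they are not part of the patch.

import Foundation

/// Stitches speech-only audio out of a sample buffer using VAD segments
/// expressed as (start, end) times in seconds.
func stitchSpeech(from segments: [(start: TimeInterval, end: TimeInterval)],
                  audioSamples: [Float],
                  sampleRate: Int = 16_000) -> [Float] {
    var speech: [Float] = []
    for segment in segments {
        let startSample = Int(segment.start * Double(sampleRate))
        // Clamp the end index so speech padding never reads past the buffer.
        let endSample = min(Int(segment.end * Double(sampleRate)), audioSamples.count)
        if startSample < endSample {
            speech.append(contentsOf: audioSamples[startSample..<endSample])
        }
    }
    return speech
}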