Native Fluid Audio VAD

2025-09-19 19:24:02 +05:45 · 2025-09-19 19:24:02 +05:45 · 91734bda45
commit 91734bda45
parent 6e6773068f
7 changed files with 1162 additions and 13 deletions
--- a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
+++ b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
@ -7,7 +7,7 @@
      "location" : "https://github.com/FluidInference/FluidAudio",
      "state" : {
        "branch" : "main",
-        "revision" : "052cbb27cf073a9407251d74ef3459ea258e41b3"
+        "revision" : "1416b2f8d6be50d7aa47f32a3baeeb8669c375e9"
      }
    },
    {
--- a/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/analytics/coremldata.bin
+++ b/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/analytics/coremldata.bin
--- a/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/coremldata.bin
+++ b/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/coremldata.bin
--- a/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/metadata.json
+++ b/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/metadata.json
@ -0,0 +1,120 @@
+[
+  {
+    "shortDescription" : "Silero VAD Unified Model 256ms (STFT + Encoder + Decoder) with noisy-OR aggregation",
+    "metadataOutputVersion" : "3.0",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1]",
+        "name" : "vad_output",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 128)",
+        "shortDescription" : "",
+        "shape" : "[1, 128]",
+        "name" : "new_hidden_state",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 128)",
+        "shortDescription" : "",
+        "shape" : "[1, 128]",
+        "name" : "new_cell_state",
+        "type" : "MultiArray"
+      }
+    ],
+    "version" : "6.0.0",
+    "modelParameters" : [
+
+    ],
+    "author" : "Fluid Inference + Silero Team",
+    "specificationVersion" : 6,
+    "storagePrecision" : "Mixed (Float16, Float32)",
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 9,
+      "Lstm" : 8,
+      "SliceByIndex" : 41,
+      "Clip" : 32,
+      "Pow" : 16,
+      "Transpose" : 16,
+      "Sub" : 2,
+      "Relu" : 40,
+      "Squeeze" : 18,
+      "Cast" : 54,
+      "Sigmoid" : 8,
+      "Add" : 16,
+      "ExpandDims" : 26,
+      "Sqrt" : 8,
+      "Mul" : 7,
+      "Conv" : 48,
+      "Pad" : 8
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int32)",
+    "stateSchema" : [
+
+    ],
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "12.0",
+      "tvOS" : "15.0",
+      "visionOS" : "1.0",
+      "watchOS" : "8.0",
+      "iOS" : "15.0",
+      "macCatalyst" : "15.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 4160)",
+        "shortDescription" : "",
+        "shape" : "[1, 4160]",
+        "name" : "audio_input",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 128)",
+        "shortDescription" : "",
+        "shape" : "[1, 128]",
+        "name" : "hidden_state",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 128)",
+        "shortDescription" : "",
+        "shape" : "[1, 128]",
+        "name" : "cell_state",
+        "type" : "MultiArray"
+      }
+    ],
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2025-09-15",
+      "com.github.apple.coremltools.source" : "torch==2.7.0",
+      "com.github.apple.coremltools.version" : "9.0b1",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "generatedClassName" : "silero_vad_unified_256ms_v6_0_0",
+    "method" : "predict"
+  }
+]
--- a/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/model.mil
+++ b/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/model.mil
--- a/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/weights/weight.bin
+++ b/VoiceInk/Resources/models/silero-vad-unified-256ms-v6.0.0.mlmodelc/weights/weight.bin
--- a/VoiceInk/Services/ParakeetTranscriptionService.swift
+++ b/VoiceInk/Services/ParakeetTranscriptionService.swift
@ -1,4 +1,5 @@
 import Foundation
+import CoreML
 import AVFoundation
 import FluidAudio
 import os.log
@ -7,10 +8,9 @@ import os.log

 class ParakeetTranscriptionService: TranscriptionService {
    private var asrManager: AsrManager?
+    private var vadManager: VadManager?
    private let customModelsDirectory: URL?
    @Published var isModelLoaded = false
-    
-    // Logger for Parakeet transcription service
    private let logger = Logger(subsystem: "com.voiceink.app", category: "ParakeetTranscriptionService")
    
    init(customModelsDirectory: URL? = nil) {
@ -21,8 +21,6 @@ class ParakeetTranscriptionService: TranscriptionService {
        if isModelLoaded {
            return
        }
-
-		
        
        do {
         
@ -30,10 +28,11 @@ class ParakeetTranscriptionService: TranscriptionService {
            let models: AsrModels
 			if let customDirectory = customModelsDirectory {
 				logger.notice("🦜 Loading Parakeet models from: \(customDirectory.path)")
-				models = try await AsrModels.downloadAndLoad(to: customDirectory)
+				models = try await AsrModels.load(from: customDirectory)
 			} else {
 				logger.notice("🦜 Loading Parakeet models from default directory")
-				models = try await AsrModels.downloadAndLoad()
+				let defaultDir = AsrModels.defaultCacheDirectory()
+				models = try await AsrModels.load(from: defaultDir)
 			}
            
            try await asrManager?.initialize(models: models)
@ -60,13 +59,43 @@ class ParakeetTranscriptionService: TranscriptionService {
        
        let audioSamples = try readAudioSamples(from: audioURL)

-        // Use full audio for transcription
-        let speechAudio: [Float] = audioSamples
+        let sampleRate = 16000.0
+        let durationSeconds = Double(audioSamples.count) / sampleRate
+
+        let speechAudio: [Float]
+        if durationSeconds < 20.0 {
+            speechAudio = audioSamples
+        } else {
+            let vadConfig = VadConfig(threshold: 0.7)
+            if vadManager == nil {
+                if let bundledVadURL = Bundle.main.url(forResource: ModelNames.VAD.sileroVad, withExtension: "mlmodelc") {
+                    do {
+                        let bundledModel = try MLModel(contentsOf: bundledVadURL)
+                        vadManager = VadManager(config: vadConfig, vadModel: bundledModel)
+                    } catch {
+                    }
+                } else {
+                }
+            }
+
+            do {
+                if let vadManager {
+                    let segments = try await vadManager.segmentSpeechAudio(audioSamples)
+                    if segments.isEmpty {
+                        speechAudio = audioSamples
+                    } else {
+                        speechAudio = segments.flatMap { $0 }
+                    }
+                } else {
+                    speechAudio = audioSamples
+                }
+            } catch {
+                speechAudio = audioSamples
+            }
+        }

        let result = try await asrManager.transcribe(speechAudio)
 		
-        
-        // Reset decoder state and cleanup after transcription to avoid blocking the transcription start
 		Task {
 			asrManager.cleanup()
 			isModelLoaded = false
@ -81,8 +110,6 @@ class ParakeetTranscriptionService: TranscriptionService {
    private func readAudioSamples(from url: URL) throws -> [Float] {
        do {
            let data = try Data(contentsOf: url)
-            
-			// Check minimum file size for valid WAV header
 			guard data.count > 44 else {
 				throw ASRError.invalidAudioData
 			}