Native Fluid Audio VAD

This commit is contained in:
Beingpax 2025-09-19 19:24:02 +05:45
parent 6e6773068f
commit 91734bda45
7 changed files with 1162 additions and 13 deletions

View File

@ -7,7 +7,7 @@
"location" : "https://github.com/FluidInference/FluidAudio",
"state" : {
"branch" : "main",
"revision" : "052cbb27cf073a9407251d74ef3459ea258e41b3"
"revision" : "1416b2f8d6be50d7aa47f32a3baeeb8669c375e9"
}
},
{

View File

@ -0,0 +1,120 @@
[
{
"shortDescription" : "Silero VAD Unified Model 256ms (STFT + Encoder + Decoder) with noisy-OR aggregation",
"metadataOutputVersion" : "3.0",
"outputSchema" : [
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 1 × 1)",
"shortDescription" : "",
"shape" : "[1, 1, 1]",
"name" : "vad_output",
"type" : "MultiArray"
},
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 128)",
"shortDescription" : "",
"shape" : "[1, 128]",
"name" : "new_hidden_state",
"type" : "MultiArray"
},
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 128)",
"shortDescription" : "",
"shape" : "[1, 128]",
"name" : "new_cell_state",
"type" : "MultiArray"
}
],
"version" : "6.0.0",
"modelParameters" : [
],
"author" : "Fluid Inference + Silero Team",
"specificationVersion" : 6,
"storagePrecision" : "Mixed (Float16, Float32)",
"mlProgramOperationTypeHistogram" : {
"Concat" : 9,
"Lstm" : 8,
"SliceByIndex" : 41,
"Clip" : 32,
"Pow" : 16,
"Transpose" : 16,
"Sub" : 2,
"Relu" : 40,
"Squeeze" : 18,
"Cast" : 54,
"Sigmoid" : 8,
"Add" : 16,
"ExpandDims" : 26,
"Sqrt" : 8,
"Mul" : 7,
"Conv" : 48,
"Pad" : 8
},
"computePrecision" : "Mixed (Float16, Float32, Int32)",
"stateSchema" : [
],
"isUpdatable" : "0",
"availability" : {
"macOS" : "12.0",
"tvOS" : "15.0",
"visionOS" : "1.0",
"watchOS" : "8.0",
"iOS" : "15.0",
"macCatalyst" : "15.0"
},
"modelType" : {
"name" : "MLModelType_mlProgram"
},
"inputSchema" : [
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 4160)",
"shortDescription" : "",
"shape" : "[1, 4160]",
"name" : "audio_input",
"type" : "MultiArray"
},
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 128)",
"shortDescription" : "",
"shape" : "[1, 128]",
"name" : "hidden_state",
"type" : "MultiArray"
},
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 128)",
"shortDescription" : "",
"shape" : "[1, 128]",
"name" : "cell_state",
"type" : "MultiArray"
}
],
"userDefinedMetadata" : {
"com.github.apple.coremltools.conversion_date" : "2025-09-15",
"com.github.apple.coremltools.source" : "torch==2.7.0",
"com.github.apple.coremltools.version" : "9.0b1",
"com.github.apple.coremltools.source_dialect" : "TorchScript"
},
"generatedClassName" : "silero_vad_unified_256ms_v6_0_0",
"method" : "predict"
}
]

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,5 @@
import Foundation
import CoreML
import AVFoundation
import FluidAudio
import os.log
@ -7,10 +8,9 @@ import os.log
class ParakeetTranscriptionService: TranscriptionService {
private var asrManager: AsrManager?
private var vadManager: VadManager?
private let customModelsDirectory: URL?
@Published var isModelLoaded = false
// Logger for Parakeet transcription service
private let logger = Logger(subsystem: "com.voiceink.app", category: "ParakeetTranscriptionService")
init(customModelsDirectory: URL? = nil) {
@ -21,8 +21,6 @@ class ParakeetTranscriptionService: TranscriptionService {
if isModelLoaded {
return
}
do {
@ -30,10 +28,11 @@ class ParakeetTranscriptionService: TranscriptionService {
let models: AsrModels
if let customDirectory = customModelsDirectory {
logger.notice("🦜 Loading Parakeet models from: \(customDirectory.path)")
models = try await AsrModels.downloadAndLoad(to: customDirectory)
models = try await AsrModels.load(from: customDirectory)
} else {
logger.notice("🦜 Loading Parakeet models from default directory")
models = try await AsrModels.downloadAndLoad()
let defaultDir = AsrModels.defaultCacheDirectory()
models = try await AsrModels.load(from: defaultDir)
}
try await asrManager?.initialize(models: models)
@ -60,13 +59,43 @@ class ParakeetTranscriptionService: TranscriptionService {
let audioSamples = try readAudioSamples(from: audioURL)
// Use full audio for transcription
let speechAudio: [Float] = audioSamples
let sampleRate = 16000.0
let durationSeconds = Double(audioSamples.count) / sampleRate
let speechAudio: [Float]
if durationSeconds < 20.0 {
speechAudio = audioSamples
} else {
let vadConfig = VadConfig(threshold: 0.7)
if vadManager == nil {
if let bundledVadURL = Bundle.main.url(forResource: ModelNames.VAD.sileroVad, withExtension: "mlmodelc") {
do {
let bundledModel = try MLModel(contentsOf: bundledVadURL)
vadManager = VadManager(config: vadConfig, vadModel: bundledModel)
} catch {
}
} else {
}
}
do {
if let vadManager {
let segments = try await vadManager.segmentSpeechAudio(audioSamples)
if segments.isEmpty {
speechAudio = audioSamples
} else {
speechAudio = segments.flatMap { $0 }
}
} else {
speechAudio = audioSamples
}
} catch {
speechAudio = audioSamples
}
}
let result = try await asrManager.transcribe(speechAudio)
// Reset decoder state and cleanup after transcription to avoid blocking the transcription start
Task {
asrManager.cleanup()
isModelLoaded = false
@ -81,8 +110,6 @@ class ParakeetTranscriptionService: TranscriptionService {
private func readAudioSamples(from url: URL) throws -> [Float] {
do {
let data = try Data(contentsOf: url)
// Check minimum file size for valid WAV header
guard data.count > 44 else {
throw ASRError.invalidAudioData
}