Native Fluid Audio VAD
This commit is contained in:
parent
6e6773068f
commit
91734bda45
@ -7,7 +7,7 @@
|
||||
"location" : "https://github.com/FluidInference/FluidAudio",
|
||||
"state" : {
|
||||
"branch" : "main",
|
||||
"revision" : "052cbb27cf073a9407251d74ef3459ea258e41b3"
|
||||
"revision" : "1416b2f8d6be50d7aa47f32a3baeeb8669c375e9"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,120 @@
|
||||
[
|
||||
{
|
||||
"shortDescription" : "Silero VAD Unified Model 256ms (STFT + Encoder + Decoder) with noisy-OR aggregation",
|
||||
"metadataOutputVersion" : "3.0",
|
||||
"outputSchema" : [
|
||||
{
|
||||
"hasShapeFlexibility" : "0",
|
||||
"isOptional" : "0",
|
||||
"dataType" : "Float32",
|
||||
"formattedType" : "MultiArray (Float32 1 × 1 × 1)",
|
||||
"shortDescription" : "",
|
||||
"shape" : "[1, 1, 1]",
|
||||
"name" : "vad_output",
|
||||
"type" : "MultiArray"
|
||||
},
|
||||
{
|
||||
"hasShapeFlexibility" : "0",
|
||||
"isOptional" : "0",
|
||||
"dataType" : "Float32",
|
||||
"formattedType" : "MultiArray (Float32 1 × 128)",
|
||||
"shortDescription" : "",
|
||||
"shape" : "[1, 128]",
|
||||
"name" : "new_hidden_state",
|
||||
"type" : "MultiArray"
|
||||
},
|
||||
{
|
||||
"hasShapeFlexibility" : "0",
|
||||
"isOptional" : "0",
|
||||
"dataType" : "Float32",
|
||||
"formattedType" : "MultiArray (Float32 1 × 128)",
|
||||
"shortDescription" : "",
|
||||
"shape" : "[1, 128]",
|
||||
"name" : "new_cell_state",
|
||||
"type" : "MultiArray"
|
||||
}
|
||||
],
|
||||
"version" : "6.0.0",
|
||||
"modelParameters" : [
|
||||
|
||||
],
|
||||
"author" : "Fluid Inference + Silero Team",
|
||||
"specificationVersion" : 6,
|
||||
"storagePrecision" : "Mixed (Float16, Float32)",
|
||||
"mlProgramOperationTypeHistogram" : {
|
||||
"Concat" : 9,
|
||||
"Lstm" : 8,
|
||||
"SliceByIndex" : 41,
|
||||
"Clip" : 32,
|
||||
"Pow" : 16,
|
||||
"Transpose" : 16,
|
||||
"Sub" : 2,
|
||||
"Relu" : 40,
|
||||
"Squeeze" : 18,
|
||||
"Cast" : 54,
|
||||
"Sigmoid" : 8,
|
||||
"Add" : 16,
|
||||
"ExpandDims" : 26,
|
||||
"Sqrt" : 8,
|
||||
"Mul" : 7,
|
||||
"Conv" : 48,
|
||||
"Pad" : 8
|
||||
},
|
||||
"computePrecision" : "Mixed (Float16, Float32, Int32)",
|
||||
"stateSchema" : [
|
||||
|
||||
],
|
||||
"isUpdatable" : "0",
|
||||
"availability" : {
|
||||
"macOS" : "12.0",
|
||||
"tvOS" : "15.0",
|
||||
"visionOS" : "1.0",
|
||||
"watchOS" : "8.0",
|
||||
"iOS" : "15.0",
|
||||
"macCatalyst" : "15.0"
|
||||
},
|
||||
"modelType" : {
|
||||
"name" : "MLModelType_mlProgram"
|
||||
},
|
||||
"inputSchema" : [
|
||||
{
|
||||
"hasShapeFlexibility" : "0",
|
||||
"isOptional" : "0",
|
||||
"dataType" : "Float32",
|
||||
"formattedType" : "MultiArray (Float32 1 × 4160)",
|
||||
"shortDescription" : "",
|
||||
"shape" : "[1, 4160]",
|
||||
"name" : "audio_input",
|
||||
"type" : "MultiArray"
|
||||
},
|
||||
{
|
||||
"hasShapeFlexibility" : "0",
|
||||
"isOptional" : "0",
|
||||
"dataType" : "Float32",
|
||||
"formattedType" : "MultiArray (Float32 1 × 128)",
|
||||
"shortDescription" : "",
|
||||
"shape" : "[1, 128]",
|
||||
"name" : "hidden_state",
|
||||
"type" : "MultiArray"
|
||||
},
|
||||
{
|
||||
"hasShapeFlexibility" : "0",
|
||||
"isOptional" : "0",
|
||||
"dataType" : "Float32",
|
||||
"formattedType" : "MultiArray (Float32 1 × 128)",
|
||||
"shortDescription" : "",
|
||||
"shape" : "[1, 128]",
|
||||
"name" : "cell_state",
|
||||
"type" : "MultiArray"
|
||||
}
|
||||
],
|
||||
"userDefinedMetadata" : {
|
||||
"com.github.apple.coremltools.conversion_date" : "2025-09-15",
|
||||
"com.github.apple.coremltools.source" : "torch==2.7.0",
|
||||
"com.github.apple.coremltools.version" : "9.0b1",
|
||||
"com.github.apple.coremltools.source_dialect" : "TorchScript"
|
||||
},
|
||||
"generatedClassName" : "silero_vad_unified_256ms_v6_0_0",
|
||||
"method" : "predict"
|
||||
}
|
||||
]
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -1,4 +1,5 @@
|
||||
import Foundation
|
||||
import CoreML
|
||||
import AVFoundation
|
||||
import FluidAudio
|
||||
import os.log
|
||||
@ -7,10 +8,9 @@ import os.log
|
||||
|
||||
class ParakeetTranscriptionService: TranscriptionService {
|
||||
private var asrManager: AsrManager?
|
||||
private var vadManager: VadManager?
|
||||
private let customModelsDirectory: URL?
|
||||
@Published var isModelLoaded = false
|
||||
|
||||
// Logger for Parakeet transcription service
|
||||
private let logger = Logger(subsystem: "com.voiceink.app", category: "ParakeetTranscriptionService")
|
||||
|
||||
init(customModelsDirectory: URL? = nil) {
|
||||
@ -21,8 +21,6 @@ class ParakeetTranscriptionService: TranscriptionService {
|
||||
if isModelLoaded {
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
|
||||
do {
|
||||
|
||||
@ -30,10 +28,11 @@ class ParakeetTranscriptionService: TranscriptionService {
|
||||
let models: AsrModels
|
||||
if let customDirectory = customModelsDirectory {
|
||||
logger.notice("🦜 Loading Parakeet models from: \(customDirectory.path)")
|
||||
models = try await AsrModels.downloadAndLoad(to: customDirectory)
|
||||
models = try await AsrModels.load(from: customDirectory)
|
||||
} else {
|
||||
logger.notice("🦜 Loading Parakeet models from default directory")
|
||||
models = try await AsrModels.downloadAndLoad()
|
||||
let defaultDir = AsrModels.defaultCacheDirectory()
|
||||
models = try await AsrModels.load(from: defaultDir)
|
||||
}
|
||||
|
||||
try await asrManager?.initialize(models: models)
|
||||
@ -60,13 +59,43 @@ class ParakeetTranscriptionService: TranscriptionService {
|
||||
|
||||
let audioSamples = try readAudioSamples(from: audioURL)
|
||||
|
||||
// Use full audio for transcription
|
||||
let speechAudio: [Float] = audioSamples
|
||||
let sampleRate = 16000.0
|
||||
let durationSeconds = Double(audioSamples.count) / sampleRate
|
||||
|
||||
let speechAudio: [Float]
|
||||
if durationSeconds < 20.0 {
|
||||
speechAudio = audioSamples
|
||||
} else {
|
||||
let vadConfig = VadConfig(threshold: 0.7)
|
||||
if vadManager == nil {
|
||||
if let bundledVadURL = Bundle.main.url(forResource: ModelNames.VAD.sileroVad, withExtension: "mlmodelc") {
|
||||
do {
|
||||
let bundledModel = try MLModel(contentsOf: bundledVadURL)
|
||||
vadManager = VadManager(config: vadConfig, vadModel: bundledModel)
|
||||
} catch {
|
||||
}
|
||||
} else {
|
||||
}
|
||||
}
|
||||
|
||||
do {
|
||||
if let vadManager {
|
||||
let segments = try await vadManager.segmentSpeechAudio(audioSamples)
|
||||
if segments.isEmpty {
|
||||
speechAudio = audioSamples
|
||||
} else {
|
||||
speechAudio = segments.flatMap { $0 }
|
||||
}
|
||||
} else {
|
||||
speechAudio = audioSamples
|
||||
}
|
||||
} catch {
|
||||
speechAudio = audioSamples
|
||||
}
|
||||
}
|
||||
|
||||
let result = try await asrManager.transcribe(speechAudio)
|
||||
|
||||
|
||||
// Reset decoder state and cleanup after transcription to avoid blocking the transcription start
|
||||
Task {
|
||||
asrManager.cleanup()
|
||||
isModelLoaded = false
|
||||
@ -81,8 +110,6 @@ class ParakeetTranscriptionService: TranscriptionService {
|
||||
private func readAudioSamples(from url: URL) throws -> [Float] {
|
||||
do {
|
||||
let data = try Data(contentsOf: url)
|
||||
|
||||
// Check minimum file size for valid WAV header
|
||||
guard data.count > 44 else {
|
||||
throw ASRError.invalidAudioData
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user