Respect the VAD flag, download the VAD model, and update FluidAudio to the latest version

This commit is contained in:
Beingpax 2025-09-20 17:00:28 +05:45
parent 91734bda45
commit 97c6234fb3
8 changed files with 32 additions and 1158 deletions

View File

@ -7,7 +7,7 @@
"location" : "https://github.com/FluidInference/FluidAudio",
"state" : {
"branch" : "main",
"revision" : "1416b2f8d6be50d7aa47f32a3baeeb8669c375e9"
"revision" : "328036d255ef76b8d661eacc16ac108eb45f9218"
}
},
{

View File

@ -1,120 +0,0 @@
[
{
"shortDescription" : "Silero VAD Unified Model 256ms (STFT + Encoder + Decoder) with noisy-OR aggregation",
"metadataOutputVersion" : "3.0",
"outputSchema" : [
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 1 × 1)",
"shortDescription" : "",
"shape" : "[1, 1, 1]",
"name" : "vad_output",
"type" : "MultiArray"
},
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 128)",
"shortDescription" : "",
"shape" : "[1, 128]",
"name" : "new_hidden_state",
"type" : "MultiArray"
},
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 128)",
"shortDescription" : "",
"shape" : "[1, 128]",
"name" : "new_cell_state",
"type" : "MultiArray"
}
],
"version" : "6.0.0",
"modelParameters" : [
],
"author" : "Fluid Inference + Silero Team",
"specificationVersion" : 6,
"storagePrecision" : "Mixed (Float16, Float32)",
"mlProgramOperationTypeHistogram" : {
"Concat" : 9,
"Lstm" : 8,
"SliceByIndex" : 41,
"Clip" : 32,
"Pow" : 16,
"Transpose" : 16,
"Sub" : 2,
"Relu" : 40,
"Squeeze" : 18,
"Cast" : 54,
"Sigmoid" : 8,
"Add" : 16,
"ExpandDims" : 26,
"Sqrt" : 8,
"Mul" : 7,
"Conv" : 48,
"Pad" : 8
},
"computePrecision" : "Mixed (Float16, Float32, Int32)",
"stateSchema" : [
],
"isUpdatable" : "0",
"availability" : {
"macOS" : "12.0",
"tvOS" : "15.0",
"visionOS" : "1.0",
"watchOS" : "8.0",
"iOS" : "15.0",
"macCatalyst" : "15.0"
},
"modelType" : {
"name" : "MLModelType_mlProgram"
},
"inputSchema" : [
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 4160)",
"shortDescription" : "",
"shape" : "[1, 4160]",
"name" : "audio_input",
"type" : "MultiArray"
},
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 128)",
"shortDescription" : "",
"shape" : "[1, 128]",
"name" : "hidden_state",
"type" : "MultiArray"
},
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 128)",
"shortDescription" : "",
"shape" : "[1, 128]",
"name" : "cell_state",
"type" : "MultiArray"
}
],
"userDefinedMetadata" : {
"com.github.apple.coremltools.conversion_date" : "2025-09-15",
"com.github.apple.coremltools.source" : "torch==2.7.0",
"com.github.apple.coremltools.version" : "9.0b1",
"com.github.apple.coremltools.source_dialect" : "TorchScript"
},
"generatedClassName" : "silero_vad_unified_256ms_v6_0_0",
"method" : "predict"
}
]

View File

@ -21,30 +21,17 @@ class ParakeetTranscriptionService: TranscriptionService {
if isModelLoaded {
return
}
do {
asrManager = AsrManager(config: .default)
let models: AsrModels
if let customDirectory = customModelsDirectory {
logger.notice("🦜 Loading Parakeet models from: \(customDirectory.path)")
models = try await AsrModels.load(from: customDirectory)
} else {
logger.notice("🦜 Loading Parakeet models from default directory")
let defaultDir = AsrModels.defaultCacheDirectory()
models = try await AsrModels.load(from: defaultDir)
}
try await asrManager?.initialize(models: models)
isModelLoaded = true
logger.notice("🦜 Parakeet model loaded successfully")
} catch {
let description = (error as? LocalizedError)?.errorDescription ?? error.localizedDescription
logger.error("🦜 Failed to load Parakeet model: \(description)")
isModelLoaded = false
asrManager = nil
throw error
if let customModelsDirectory {
do {
asrManager = AsrManager(config: .default)
let models = try await AsrModels.load(from: customModelsDirectory)
try await asrManager?.initialize(models: models)
isModelLoaded = true
} catch {
isModelLoaded = false
asrManager = nil
}
}
}
@ -59,22 +46,23 @@ class ParakeetTranscriptionService: TranscriptionService {
let audioSamples = try readAudioSamples(from: audioURL)
let sampleRate = 16000.0
let durationSeconds = Double(audioSamples.count) / sampleRate
let durationSeconds = Double(audioSamples.count) / 16000.0
let isVADEnabled = UserDefaults.standard.object(forKey: "IsVADEnabled") as? Bool ?? true
let speechAudio: [Float]
if durationSeconds < 20.0 {
if durationSeconds < 20.0 || !isVADEnabled {
speechAudio = audioSamples
} else {
let vadConfig = VadConfig(threshold: 0.7)
if vadManager == nil {
if let bundledVadURL = Bundle.main.url(forResource: ModelNames.VAD.sileroVad, withExtension: "mlmodelc") {
do {
let bundledModel = try MLModel(contentsOf: bundledVadURL)
vadManager = VadManager(config: vadConfig, vadModel: bundledModel)
} catch {
}
} else {
if vadManager == nil, let customModelsDirectory {
do {
vadManager = try await VadManager(
config: vadConfig,
modelDirectory: customModelsDirectory.deletingLastPathComponent()
)
} catch {
// Silent failure
}
}

View File

@ -33,6 +33,14 @@ extension WhisperState {
do {
_ = try await AsrModels.downloadAndLoad(to: parakeetModelsDirectory)
// Also download VAD model into the same parent directory as ASR models
let parentDir = parakeetModelsDirectory.deletingLastPathComponent()
_ = try await DownloadUtils.loadModels(
.vad,
modelNames: Array(ModelNames.VAD.requiredModels),
directory: parentDir
)
self.isParakeetModelDownloaded = true
downloadProgress["parakeet-tdt-0.6b"] = 1.0
} catch {