Update app to support Parakeet V3 model

This commit is contained in:
Beingpax 2025-08-25 13:00:35 +05:45
parent 2708cc502a
commit 6a308b81bf
8 changed files with 99 additions and 51 deletions

View File

@ -7,12 +7,12 @@
objects = {
/* Begin PBXBuildFile section */
E10F06092E3F390600F7FBDC /* FluidAudio in Frameworks */ = {isa = PBXBuildFile; productRef = E10F06082E3F390600F7FBDC /* FluidAudio */; };
E17382402E4C7D0E001BAEBE /* whisper.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = E1B2DCAA2E3DE70A008DFD68 /* whisper.xcframework */; };
E17382412E4C7D0E001BAEBE /* whisper.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = E1B2DCAA2E3DE70A008DFD68 /* whisper.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
E1A261122CC143AC00B233D1 /* KeyboardShortcuts in Frameworks */ = {isa = PBXBuildFile; productRef = E1A261112CC143AC00B233D1 /* KeyboardShortcuts */; };
E1ADD45A2CC5352A00303ECB /* LaunchAtLogin in Frameworks */ = {isa = PBXBuildFile; productRef = E1ADD4592CC5352A00303ECB /* LaunchAtLogin */; };
E1ADD45F2CC544F100303ECB /* Sparkle in Frameworks */ = {isa = PBXBuildFile; productRef = E1ADD45E2CC544F100303ECB /* Sparkle */; };
E1C550882E5C391D00823A34 /* FluidAudio in Frameworks */ = {isa = PBXBuildFile; productRef = E1C550872E5C391D00823A34 /* FluidAudio */; };
E1D7EF992E35E16C00640029 /* MediaRemoteAdapter in Frameworks */ = {isa = PBXBuildFile; productRef = E1D7EF982E35E16C00640029 /* MediaRemoteAdapter */; };
E1D7EF9A2E35E19B00640029 /* MediaRemoteAdapter in Embed Frameworks */ = {isa = PBXBuildFile; productRef = E1D7EF982E35E16C00640029 /* MediaRemoteAdapter */; settings = {ATTRIBUTES = (CodeSignOnCopy, ); }; };
E1ECEC162E44591300DFFBA8 /* Zip in Frameworks */ = {isa = PBXBuildFile; productRef = E1ECEC152E44591300DFFBA8 /* Zip */; };
@ -83,9 +83,9 @@
files = (
E1ECEC162E44591300DFFBA8 /* Zip in Frameworks */,
E1ADD45A2CC5352A00303ECB /* LaunchAtLogin in Frameworks */,
E1C550882E5C391D00823A34 /* FluidAudio in Frameworks */,
E1D7EF992E35E16C00640029 /* MediaRemoteAdapter in Frameworks */,
E17382402E4C7D0E001BAEBE /* whisper.xcframework in Frameworks */,
E10F06092E3F390600F7FBDC /* FluidAudio in Frameworks */,
E1ADD45F2CC544F100303ECB /* Sparkle in Frameworks */,
E1A261122CC143AC00B233D1 /* KeyboardShortcuts in Frameworks */,
);
@ -163,8 +163,8 @@
E1ADD4592CC5352A00303ECB /* LaunchAtLogin */,
E1ADD45E2CC544F100303ECB /* Sparkle */,
E1D7EF982E35E16C00640029 /* MediaRemoteAdapter */,
E10F06082E3F390600F7FBDC /* FluidAudio */,
E1ECEC152E44591300DFFBA8 /* Zip */,
E1C550872E5C391D00823A34 /* FluidAudio */,
);
productName = VoiceInk;
productReference = E11473B02CBE0F0A00318EE4 /* VoiceInk.app */;
@ -253,8 +253,8 @@
E1ADD4582CC5352A00303ECB /* XCRemoteSwiftPackageReference "LaunchAtLogin-Modern" */,
E1ADD45D2CC544F100303ECB /* XCRemoteSwiftPackageReference "Sparkle" */,
E1D7EF972E35E16C00640029 /* XCRemoteSwiftPackageReference "mediaremote-adapter" */,
E10FFA112E3F37D100F7FBDC /* XCRemoteSwiftPackageReference "FluidAudio" */,
E1ECEC142E44590200DFFBA8 /* XCRemoteSwiftPackageReference "Zip" */,
E1C550862E5C391D00823A34 /* XCLocalSwiftPackageReference "../FluidAudio" */,
);
preferredProjectObjectVersion = 77;
productRefGroup = E11473B12CBE0F0A00318EE4 /* Products */;
@ -626,15 +626,14 @@
};
/* End XCConfigurationList section */
/* Begin XCRemoteSwiftPackageReference section */
E10FFA112E3F37D100F7FBDC /* XCRemoteSwiftPackageReference "FluidAudio" */ = {
isa = XCRemoteSwiftPackageReference;
repositoryURL = "https://github.com/FluidInference/FluidAudio";
requirement = {
branch = main;
kind = branch;
};
/* Begin XCLocalSwiftPackageReference section */
E1C550862E5C391D00823A34 /* XCLocalSwiftPackageReference "../FluidAudio" */ = {
isa = XCLocalSwiftPackageReference;
relativePath = ../FluidAudio;
};
/* End XCLocalSwiftPackageReference section */
/* Begin XCRemoteSwiftPackageReference section */
E1A261102CC143AC00B233D1 /* XCRemoteSwiftPackageReference "KeyboardShortcuts" */ = {
isa = XCRemoteSwiftPackageReference;
repositoryURL = "https://github.com/sindresorhus/KeyboardShortcuts";
@ -678,11 +677,6 @@
/* End XCRemoteSwiftPackageReference section */
/* Begin XCSwiftPackageProductDependency section */
E10F06082E3F390600F7FBDC /* FluidAudio */ = {
isa = XCSwiftPackageProductDependency;
package = E10FFA112E3F37D100F7FBDC /* XCRemoteSwiftPackageReference "FluidAudio" */;
productName = FluidAudio;
};
E1A261112CC143AC00B233D1 /* KeyboardShortcuts */ = {
isa = XCSwiftPackageProductDependency;
package = E1A261102CC143AC00B233D1 /* XCRemoteSwiftPackageReference "KeyboardShortcuts" */;
@ -698,6 +692,10 @@
package = E1ADD45D2CC544F100303ECB /* XCRemoteSwiftPackageReference "Sparkle" */;
productName = Sparkle;
};
E1C550872E5C391D00823A34 /* FluidAudio */ = {
isa = XCSwiftPackageProductDependency;
productName = FluidAudio;
};
E1D7EF982E35E16C00640029 /* MediaRemoteAdapter */ = {
isa = XCSwiftPackageProductDependency;
package = E1D7EF972E35E16C00640029 /* XCRemoteSwiftPackageReference "mediaremote-adapter" */;

View File

@ -1,15 +1,6 @@
{
"originHash" : "0b9379abd19d2f53581c233273d09235e935a8d2b1180cf253dd69baa2784b39",
"originHash" : "9616310154c7e55deebbc79c5d81e757c482b0338de08ed6e0c7d6522a9d34e9",
"pins" : [
{
"identity" : "fluidaudio",
"kind" : "remoteSourceControl",
"location" : "https://github.com/FluidInference/FluidAudio",
"state" : {
"branch" : "main",
"revision" : "2a3d6a948cb332b3fd8ae479a9942e33ade2cc9e"
}
},
{
"identity" : "keyboardshortcuts",
"kind" : "remoteSourceControl",

View File

@ -90,13 +90,13 @@ import Foundation
// Parakeet Model
ParakeetModel(
name: "parakeet-tdt-0.6b",
displayName: "Parakeet",
description: "NVIDIA's ASR model for lightning-fast english transcription.",
size: "600 MB",
displayName: "Parakeet V3",
description: "NVIDIA's ASR model V3 for lightning-fast multilingual transcription with multi-lingual(English + European) support.",
size: "500 MB",
speed: 0.99,
accuracy: 0.94,
ramUsage: 0.8,
supportedLanguages: getLanguageDictionary(isMultilingual: false, provider: .parakeet)
supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .parakeet)
),
// Local Models

View File

@ -41,6 +41,15 @@ struct ConfigurationView: View {
// State for prompt editing (similar to EnhancementSettingsView)
@State private var isEditingPrompt = false
@State private var selectedPromptForEdit: CustomPrompt?
/// Reports whether the manual language picker should be replaced for the
/// currently effective model.
///
/// Looks up the model whose `name` equals `effectiveModelName` in
/// `whisperState.allAvailableModels`.
///
/// - Returns: `true` when that model's provider is `.parakeet` or `.gemini`;
///   `false` when there is no effective model name, no model with that name,
///   or the provider is anything else.
private func languageSelectionDisabled() -> Bool {
    guard let activeName = effectiveModelName else {
        return false
    }

    let matchingModel = whisperState.allAvailableModels.first { candidate in
        candidate.name == activeName
    }
    guard let resolvedModel = matchingModel else {
        return false
    }

    let isParakeet = resolvedModel.provider == .parakeet
    let isGemini = resolvedModel.provider == .gemini
    return isParakeet || isGemini
}
// Whisper state for model selection
@EnvironmentObject private var whisperState: WhisperState
@ -376,9 +385,21 @@ struct ConfigurationView: View {
}
}
if let selectedModel = effectiveModelName,
let modelInfo = whisperState.allAvailableModels.first(where: { $0.name == selectedModel }),
modelInfo.isMultilingualModel {
if languageSelectionDisabled() {
HStack {
Text("Language")
.font(.subheadline)
.foregroundColor(.secondary)
Text("Autodetected")
.font(.subheadline)
.foregroundColor(.secondary)
Spacer()
}
} else if let selectedModel = effectiveModelName,
let modelInfo = whisperState.allAvailableModels.first(where: { $0.name == selectedModel }),
modelInfo.isMultilingualModel {
let languageBinding = Binding<String?>(
get: {

View File

@ -26,15 +26,8 @@ class ParakeetTranscriptionService: TranscriptionService {
logger.notice("🦜 Starting Parakeet model loading")
do {
let asrConfig = ASRConfig(
maxSymbolsPerFrame: 3,
realtimeMode: true,
chunkSizeMs: 1500,
tdtConfig: TdtConfig(
durations: [0, 1, 2, 3, 4],
maxSymbolsPerStep: 3
)
)
let tdtConfig = TdtConfig(maxSymbolsPerStep: 3, durationBins: [0, 1, 2, 3, 4])
let asrConfig = ASRConfig(tdtConfig: tdtConfig)
asrManager = AsrManager(config: asrConfig)
let models: AsrModels
@ -78,6 +71,9 @@ class ParakeetTranscriptionService: TranscriptionService {
throw ASRError.notInitialized
}
// Reset the decoder state before each transcription to ensure no state leaks from previous runs
try await asrManager.resetDecoderState(for: .microphone)
let audioSamples = try readAudioSamples(from: audioURL)
// Validate audio data before transcription
@ -110,7 +106,7 @@ class ParakeetTranscriptionService: TranscriptionService {
let data = try Data(contentsOf: url)
// Check minimum file size for valid WAV header
guard data.count > 44 else {
guard data.count > 44 else {
logger.notice("🦜 Audio file too small (\(data.count) bytes), expected > 44 bytes")
throw ASRError.invalidAudioData
}
@ -129,4 +125,4 @@ class ParakeetTranscriptionService: TranscriptionService {
}
}
}
}

View File

@ -33,6 +33,13 @@ struct LanguageSelectionView: View {
return currentModel.isMultilingualModel
}
/// Reports whether manual language selection is unavailable for the model
/// currently held in `whisperState.currentTranscriptionModel`.
///
/// - Returns: `true` when the current model's provider is `.parakeet` or
///   `.gemini`; `false` when no model is selected or the provider is
///   anything else.
private func languageSelectionDisabled() -> Bool {
    if let activeProvider = whisperState.currentTranscriptionModel?.provider {
        let isParakeet = activeProvider == .parakeet
        let isGemini = activeProvider == .gemini
        return isParakeet || isGemini
    }
    // No model selected: leave language selection enabled.
    return false
}
// Function to get current model's supported languages
private func getCurrentModelLanguages() -> [String: String] {
guard let currentModel = whisperState.currentTranscriptionModel else {
@ -69,7 +76,22 @@ struct LanguageSelectionView: View {
if let currentModel = whisperState.currentTranscriptionModel
{
if isMultilingualModel() {
if languageSelectionDisabled() {
VStack(alignment: .leading, spacing: 8) {
Text("Language: Autodetected")
.font(.subheadline)
.foregroundColor(.primary)
Text("Current model: \(currentModel.displayName)")
.font(.caption)
.foregroundColor(.secondary)
Text("The transcription language is automatically detected by the model.")
.font(.caption)
.foregroundColor(.secondary)
}
.disabled(true)
} else if isMultilingualModel() {
VStack(alignment: .leading, spacing: 8) {
Picker("Select Language", selection: $selectedLanguage) {
ForEach(
@ -134,7 +156,15 @@ struct LanguageSelectionView: View {
// New compact view for menu bar
private var menuItemView: some View {
Group {
if isMultilingualModel() {
if languageSelectionDisabled() {
Button {
// Do nothing, just showing info
} label: {
Text("Language: Autodetected")
.foregroundColor(.secondary)
}
.disabled(true)
} else if isMultilingualModel() {
Menu {
ForEach(
getCurrentModelLanguages().sorted(by: {

View File

@ -104,7 +104,8 @@ struct ParakeetModelCardRowView: View {
private var progressSection: some View {
Group {
if isDownloading {
ProgressView() // Indeterminate for now
let progress = whisperState.downloadProgress["parakeet-tdt-0.6b"] ?? 0.0
ProgressView(value: progress)
.progressViewStyle(LinearProgressViewStyle())
.frame(maxWidth: .infinity, alignment: .leading)
.padding(.top, 8)

View File

@ -22,13 +22,24 @@ extension WhisperState {
isDownloadingParakeet = true
downloadProgress["parakeet-tdt-0.6b"] = 0.0
// Start progress simulation
let timer = Timer.scheduledTimer(withTimeInterval: 1.2, repeats: true) { timer in
Task { @MainActor in
if let currentProgress = self.downloadProgress["parakeet-tdt-0.6b"], currentProgress < 0.9 {
self.downloadProgress["parakeet-tdt-0.6b"] = currentProgress + 0.0125
}
}
}
do {
_ = try await AsrModels.downloadAndLoad(to: parakeetModelsDirectory)
self.isParakeetModelDownloaded = true
downloadProgress["parakeet-tdt-0.6b"] = 1.0
} catch {
self.isParakeetModelDownloaded = false
}
timer.invalidate()
isDownloadingParakeet = false
downloadProgress["parakeet-tdt-0.6b"] = nil
@ -46,14 +57,14 @@ extension WhisperState {
// First try: app support directory + bundle path
let appSupportDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
.appendingPathComponent("com.prakashjoshipax.VoiceInk")
let parakeetModelDirectory = appSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v2-coreml")
let parakeetModelDirectory = appSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v3-coreml")
if FileManager.default.fileExists(atPath: parakeetModelDirectory.path) {
try FileManager.default.removeItem(at: parakeetModelDirectory)
} else {
// Second try: root of application support directory
let rootAppSupportDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
let rootParakeetModelDirectory = rootAppSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v2-coreml")
let rootParakeetModelDirectory = rootAppSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v3-coreml")
if FileManager.default.fileExists(atPath: rootParakeetModelDirectory.path) {
try FileManager.default.removeItem(at: rootParakeetModelDirectory)
@ -73,7 +84,7 @@ extension WhisperState {
func showParakeetModelInFinder() {
let appSupportDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
.appendingPathComponent("com.prakashjoshipax.VoiceInk")
let parakeetModelDirectory = appSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v2-coreml")
let parakeetModelDirectory = appSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v3-coreml")
if FileManager.default.fileExists(atPath: parakeetModelDirectory.path) {
NSWorkspace.shared.selectFile(parakeetModelDirectory.path, inFileViewerRootedAtPath: "")