From 6a308b81bf47143f521b03fbc6cd2b2bbf8bacd3 Mon Sep 17 00:00:00 2001 From: Beingpax Date: Mon, 25 Aug 2025 13:00:35 +0545 Subject: [PATCH] Update app to support Parakeet B3 model --- VoiceInk.xcodeproj/project.pbxproj | 32 ++++++++--------- .../xcshareddata/swiftpm/Package.resolved | 11 +----- VoiceInk/Models/PredefinedModels.swift | 8 ++--- VoiceInk/PowerMode/PowerModeConfigView.swift | 27 +++++++++++++-- .../ParakeetTranscriptionService.swift | 18 ++++------ .../AI Models/LanguageSelectionView.swift | 34 +++++++++++++++++-- .../AI Models/ParakeetModelCardRowView.swift | 3 +- VoiceInk/Whisper/WhisperState+Parakeet.swift | 17 ++++++++-- 8 files changed, 99 insertions(+), 51 deletions(-) diff --git a/VoiceInk.xcodeproj/project.pbxproj b/VoiceInk.xcodeproj/project.pbxproj index 1ee1943..2a79a33 100644 --- a/VoiceInk.xcodeproj/project.pbxproj +++ b/VoiceInk.xcodeproj/project.pbxproj @@ -7,12 +7,12 @@ objects = { /* Begin PBXBuildFile section */ - E10F06092E3F390600F7FBDC /* FluidAudio in Frameworks */ = {isa = PBXBuildFile; productRef = E10F06082E3F390600F7FBDC /* FluidAudio */; }; E17382402E4C7D0E001BAEBE /* whisper.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = E1B2DCAA2E3DE70A008DFD68 /* whisper.xcframework */; }; E17382412E4C7D0E001BAEBE /* whisper.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = E1B2DCAA2E3DE70A008DFD68 /* whisper.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; E1A261122CC143AC00B233D1 /* KeyboardShortcuts in Frameworks */ = {isa = PBXBuildFile; productRef = E1A261112CC143AC00B233D1 /* KeyboardShortcuts */; }; E1ADD45A2CC5352A00303ECB /* LaunchAtLogin in Frameworks */ = {isa = PBXBuildFile; productRef = E1ADD4592CC5352A00303ECB /* LaunchAtLogin */; }; E1ADD45F2CC544F100303ECB /* Sparkle in Frameworks */ = {isa = PBXBuildFile; productRef = E1ADD45E2CC544F100303ECB /* Sparkle */; }; + E1C550882E5C391D00823A34 /* FluidAudio in Frameworks */ = {isa = PBXBuildFile; productRef = E1C550872E5C391D00823A34 /* FluidAudio */; }; E1D7EF992E35E16C00640029 /* MediaRemoteAdapter in Frameworks */ = {isa = PBXBuildFile; productRef = E1D7EF982E35E16C00640029 /* MediaRemoteAdapter */; }; E1D7EF9A2E35E19B00640029 /* MediaRemoteAdapter in Embed Frameworks */ = {isa = PBXBuildFile; productRef = E1D7EF982E35E16C00640029 /* MediaRemoteAdapter */; settings = {ATTRIBUTES = (CodeSignOnCopy, ); }; }; E1ECEC162E44591300DFFBA8 /* Zip in Frameworks */ = {isa = PBXBuildFile; productRef = E1ECEC152E44591300DFFBA8 /* Zip */; }; @@ -83,9 +83,9 @@ files = ( E1ECEC162E44591300DFFBA8 /* Zip in Frameworks */, E1ADD45A2CC5352A00303ECB /* LaunchAtLogin in Frameworks */, + E1C550882E5C391D00823A34 /* FluidAudio in Frameworks */, E1D7EF992E35E16C00640029 /* MediaRemoteAdapter in Frameworks */, E17382402E4C7D0E001BAEBE /* whisper.xcframework in Frameworks */, - E10F06092E3F390600F7FBDC /* FluidAudio in Frameworks */, E1ADD45F2CC544F100303ECB /* Sparkle in Frameworks */, E1A261122CC143AC00B233D1 /* KeyboardShortcuts in Frameworks */, ); @@ -163,8 +163,8 @@ E1ADD4592CC5352A00303ECB /* LaunchAtLogin */, E1ADD45E2CC544F100303ECB /* Sparkle */, E1D7EF982E35E16C00640029 /* MediaRemoteAdapter */, - E10F06082E3F390600F7FBDC /* FluidAudio */, E1ECEC152E44591300DFFBA8 /* Zip */, + E1C550872E5C391D00823A34 /* FluidAudio */, ); productName = VoiceInk; productReference = E11473B02CBE0F0A00318EE4 /* VoiceInk.app */; @@ -253,8 +253,8 @@ E1ADD4582CC5352A00303ECB /* XCRemoteSwiftPackageReference "LaunchAtLogin-Modern" */, E1ADD45D2CC544F100303ECB /* XCRemoteSwiftPackageReference "Sparkle" */, E1D7EF972E35E16C00640029 /* XCRemoteSwiftPackageReference "mediaremote-adapter" */, - E10FFA112E3F37D100F7FBDC /* XCRemoteSwiftPackageReference "FluidAudio" */, E1ECEC142E44590200DFFBA8 /* XCRemoteSwiftPackageReference "Zip" */, + E1C550862E5C391D00823A34 /* XCLocalSwiftPackageReference "../FluidAudio" */, ); preferredProjectObjectVersion = 77; productRefGroup = E11473B12CBE0F0A00318EE4 /* Products */; @@ -626,15 +626,14 @@ }; /* End XCConfigurationList section */ -/* Begin XCRemoteSwiftPackageReference section */ - E10FFA112E3F37D100F7FBDC /* XCRemoteSwiftPackageReference "FluidAudio" */ = { - isa = XCRemoteSwiftPackageReference; - repositoryURL = "https://github.com/FluidInference/FluidAudio"; - requirement = { - branch = main; - kind = branch; - }; +/* Begin XCLocalSwiftPackageReference section */ + E1C550862E5C391D00823A34 /* XCLocalSwiftPackageReference "../FluidAudio" */ = { + isa = XCLocalSwiftPackageReference; + relativePath = ../FluidAudio; }; +/* End XCLocalSwiftPackageReference section */ + +/* Begin XCRemoteSwiftPackageReference section */ E1A261102CC143AC00B233D1 /* XCRemoteSwiftPackageReference "KeyboardShortcuts" */ = { isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/sindresorhus/KeyboardShortcuts"; @@ -678,11 +677,6 @@ /* End XCRemoteSwiftPackageReference section */ /* Begin XCSwiftPackageProductDependency section */ - E10F06082E3F390600F7FBDC /* FluidAudio */ = { - isa = XCSwiftPackageProductDependency; - package = E10FFA112E3F37D100F7FBDC /* XCRemoteSwiftPackageReference "FluidAudio" */; - productName = FluidAudio; - }; E1A261112CC143AC00B233D1 /* KeyboardShortcuts */ = { isa = XCSwiftPackageProductDependency; package = E1A261102CC143AC00B233D1 /* XCRemoteSwiftPackageReference "KeyboardShortcuts" */; @@ -698,6 +692,10 @@ package = E1ADD45D2CC544F100303ECB /* XCRemoteSwiftPackageReference "Sparkle" */; productName = Sparkle; }; + E1C550872E5C391D00823A34 /* FluidAudio */ = { + isa = XCSwiftPackageProductDependency; + productName = FluidAudio; + }; E1D7EF982E35E16C00640029 /* MediaRemoteAdapter */ = { isa = XCSwiftPackageProductDependency; package = E1D7EF972E35E16C00640029 /* XCRemoteSwiftPackageReference "mediaremote-adapter" */; diff --git a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index 57c2157..d951486 100644 --- a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -1,15 +1,6 @@ { - "originHash" : "0b9379abd19d2f53581c233273d09235e935a8d2b1180cf253dd69baa2784b39", + "originHash" : "9616310154c7e55deebbc79c5d81e757c482b0338de08ed6e0c7d6522a9d34e9", "pins" : [ - { - "identity" : "fluidaudio", - "kind" : "remoteSourceControl", - "location" : "https://github.com/FluidInference/FluidAudio", - "state" : { - "branch" : "main", - "revision" : "2a3d6a948cb332b3fd8ae479a9942e33ade2cc9e" - } - }, { "identity" : "keyboardshortcuts", "kind" : "remoteSourceControl", diff --git a/VoiceInk/Models/PredefinedModels.swift b/VoiceInk/Models/PredefinedModels.swift index d8d8f70..4deae01 100644 --- a/VoiceInk/Models/PredefinedModels.swift +++ b/VoiceInk/Models/PredefinedModels.swift @@ -90,13 +90,13 @@ import Foundation // Parakeet Model ParakeetModel( name: "parakeet-tdt-0.6b", - displayName: "Parakeet", - description: "NVIDIA's ASR model for lightning-fast english transcription.", - size: "600 MB", + displayName: "Parakeet V3", + description: "NVIDIA's ASR model V3 for lightning-fast multilingual transcription with multi-lingual(English + European) support.", + size: "500 MB", speed: 0.99, accuracy: 0.94, ramUsage: 0.8, - supportedLanguages: getLanguageDictionary(isMultilingual: false, provider: .parakeet) + supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .parakeet) ), // Local Models diff --git a/VoiceInk/PowerMode/PowerModeConfigView.swift b/VoiceInk/PowerMode/PowerModeConfigView.swift index fb35a73..a44566d 100644 --- a/VoiceInk/PowerMode/PowerModeConfigView.swift +++ b/VoiceInk/PowerMode/PowerModeConfigView.swift @@ -41,6 +41,15 @@ struct ConfigurationView: View { // State for prompt editing (similar to EnhancementSettingsView) @State private var isEditingPrompt = false @State private var selectedPromptForEdit: CustomPrompt? + + private func languageSelectionDisabled() -> Bool { + guard let selectedModelName = effectiveModelName, + let model = whisperState.allAvailableModels.first(where: { $0.name == selectedModelName }) + else { + return false + } + return model.provider == .parakeet || model.provider == .gemini + } // Whisper state for model selection @EnvironmentObject private var whisperState: WhisperState @@ -376,9 +385,21 @@ struct ConfigurationView: View { } } - if let selectedModel = effectiveModelName, - let modelInfo = whisperState.allAvailableModels.first(where: { $0.name == selectedModel }), - modelInfo.isMultilingualModel { + if languageSelectionDisabled() { + HStack { + Text("Language") + .font(.subheadline) + .foregroundColor(.secondary) + + Text("Autodetected") + .font(.subheadline) + .foregroundColor(.secondary) + + Spacer() + } + } else if let selectedModel = effectiveModelName, + let modelInfo = whisperState.allAvailableModels.first(where: { $0.name == selectedModel }), + modelInfo.isMultilingualModel { let languageBinding = Binding( get: { diff --git a/VoiceInk/Services/ParakeetTranscriptionService.swift b/VoiceInk/Services/ParakeetTranscriptionService.swift index 1173ab5..ae837fa 100644 --- a/VoiceInk/Services/ParakeetTranscriptionService.swift +++ b/VoiceInk/Services/ParakeetTranscriptionService.swift @@ -26,15 +26,8 @@ class ParakeetTranscriptionService: TranscriptionService { logger.notice("🦜 Starting Parakeet model loading") do { - let asrConfig = ASRConfig( - maxSymbolsPerFrame: 3, - realtimeMode: true, - chunkSizeMs: 1500, - tdtConfig: TdtConfig( - durations: [0, 1, 2, 3, 4], - maxSymbolsPerStep: 3 - ) - ) + let tdtConfig = TdtConfig(maxSymbolsPerStep: 3, durationBins: [0, 1, 2, 3, 4]) + let asrConfig = ASRConfig(tdtConfig: tdtConfig) asrManager = AsrManager(config: asrConfig) let models: AsrModels @@ -78,6 +71,9 @@ class ParakeetTranscriptionService: TranscriptionService { throw ASRError.notInitialized } + // Reset the decoder state before each transcription to ensure no state leaks from previous runs + try await asrManager.resetDecoderState(for: .microphone) + let audioSamples = try readAudioSamples(from: audioURL) // Validate audio data before transcription @@ -110,7 +106,7 @@ class ParakeetTranscriptionService: TranscriptionService { let data = try Data(contentsOf: url) // Check minimum file size for valid WAV header - guard data.count > 44 else { + guard data.count > 44 else { logger.notice("🦜 Audio file too small (\(data.count) bytes), expected > 44 bytes") throw ASRError.invalidAudioData } @@ -129,4 +125,4 @@ class ParakeetTranscriptionService: TranscriptionService { } } -} \ No newline at end of file +} diff --git a/VoiceInk/Views/AI Models/LanguageSelectionView.swift b/VoiceInk/Views/AI Models/LanguageSelectionView.swift index 140e11e..98112d5 100644 --- a/VoiceInk/Views/AI Models/LanguageSelectionView.swift +++ b/VoiceInk/Views/AI Models/LanguageSelectionView.swift @@ -33,6 +33,13 @@ struct LanguageSelectionView: View { return currentModel.isMultilingualModel } + private func languageSelectionDisabled() -> Bool { + guard let provider = whisperState.currentTranscriptionModel?.provider else { + return false + } + return provider == .parakeet || provider == .gemini + } + // Function to get current model's supported languages private func getCurrentModelLanguages() -> [String: String] { guard let currentModel = whisperState.currentTranscriptionModel else { @@ -69,7 +76,22 @@ struct LanguageSelectionView: View { if let currentModel = whisperState.currentTranscriptionModel { - if isMultilingualModel() { + if languageSelectionDisabled() { + VStack(alignment: .leading, spacing: 8) { + Text("Language: Autodetected") + .font(.subheadline) + .foregroundColor(.primary) + + Text("Current model: \(currentModel.displayName)") + .font(.caption) + .foregroundColor(.secondary) + + Text("The transcription language is automatically detected by the model.") + .font(.caption) + .foregroundColor(.secondary) + } + .disabled(true) + } else if isMultilingualModel() { VStack(alignment: .leading, spacing: 8) { Picker("Select Language", selection: $selectedLanguage) { ForEach( @@ -134,7 +156,15 @@ struct LanguageSelectionView: View { // New compact view for menu bar private var menuItemView: some View { Group { - if isMultilingualModel() { + if languageSelectionDisabled() { + Button { + // Do nothing, just showing info + } label: { + Text("Language: Autodetected") + .foregroundColor(.secondary) + } + .disabled(true) + } else if isMultilingualModel() { Menu { ForEach( getCurrentModelLanguages().sorted(by: { diff --git a/VoiceInk/Views/AI Models/ParakeetModelCardRowView.swift b/VoiceInk/Views/AI Models/ParakeetModelCardRowView.swift index 7f59955..784ff22 100644 --- a/VoiceInk/Views/AI Models/ParakeetModelCardRowView.swift +++ b/VoiceInk/Views/AI Models/ParakeetModelCardRowView.swift @@ -104,7 +104,8 @@ struct ParakeetModelCardRowView: View { private var progressSection: some View { Group { if isDownloading { - ProgressView() // Indeterminate for now + let progress = whisperState.downloadProgress["parakeet-tdt-0.6b"] ?? 0.0 + ProgressView(value: progress) .progressViewStyle(LinearProgressViewStyle()) .frame(maxWidth: .infinity, alignment: .leading) .padding(.top, 8) diff --git a/VoiceInk/Whisper/WhisperState+Parakeet.swift b/VoiceInk/Whisper/WhisperState+Parakeet.swift index 9746a49..8b9fe4b 100644 --- a/VoiceInk/Whisper/WhisperState+Parakeet.swift +++ b/VoiceInk/Whisper/WhisperState+Parakeet.swift @@ -22,13 +22,24 @@ extension WhisperState { isDownloadingParakeet = true downloadProgress["parakeet-tdt-0.6b"] = 0.0 + // Start progress simulation + let timer = Timer.scheduledTimer(withTimeInterval: 1.2, repeats: true) { timer in + Task { @MainActor in + if let currentProgress = self.downloadProgress["parakeet-tdt-0.6b"], currentProgress < 0.9 { + self.downloadProgress["parakeet-tdt-0.6b"] = currentProgress + 0.0125 + } + } + } + do { _ = try await AsrModels.downloadAndLoad(to: parakeetModelsDirectory) self.isParakeetModelDownloaded = true + downloadProgress["parakeet-tdt-0.6b"] = 1.0 } catch { self.isParakeetModelDownloaded = false } + timer.invalidate() isDownloadingParakeet = false downloadProgress["parakeet-tdt-0.6b"] = nil @@ -46,14 +57,14 @@ extension WhisperState { // First try: app support directory + bundle path let appSupportDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0] .appendingPathComponent("com.prakashjoshipax.VoiceInk") - let parakeetModelDirectory = appSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v2-coreml") + let parakeetModelDirectory = appSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v3-coreml") if FileManager.default.fileExists(atPath: parakeetModelDirectory.path) { try FileManager.default.removeItem(at: parakeetModelDirectory) } else { // Second try: root of application support directory let rootAppSupportDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0] - let rootParakeetModelDirectory = rootAppSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v2-coreml") + let rootParakeetModelDirectory = rootAppSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v3-coreml") if FileManager.default.fileExists(atPath: rootParakeetModelDirectory.path) { try FileManager.default.removeItem(at: rootParakeetModelDirectory) @@ -73,7 +84,7 @@ extension WhisperState { func showParakeetModelInFinder() { let appSupportDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0] .appendingPathComponent("com.prakashjoshipax.VoiceInk") - let parakeetModelDirectory = appSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v2-coreml") + let parakeetModelDirectory = appSupportDirectory.appendingPathComponent("parakeet-tdt-0.6b-v3-coreml") if FileManager.default.fileExists(atPath: parakeetModelDirectory.path) { NSWorkspace.shared.selectFile(parakeetModelDirectory.path, inFileViewerRootedAtPath: "")