From 3eebbc4e3bdb63585280f089e2db7dd5bb94b5ab Mon Sep 17 00:00:00 2001 From: Beingpax Date: Sun, 3 Aug 2025 12:44:13 +0545 Subject: [PATCH] Better Parakeet error handling --- VoiceInk.xcodeproj/project.pbxproj | 28 ++-- .../xcshareddata/swiftpm/Package.resolved | 19 ++- .../ParakeetTranscriptionService.swift | 156 +++++++----------- 3 files changed, 93 insertions(+), 110 deletions(-) diff --git a/VoiceInk.xcodeproj/project.pbxproj b/VoiceInk.xcodeproj/project.pbxproj index d6fe44b..fabb13b 100644 --- a/VoiceInk.xcodeproj/project.pbxproj +++ b/VoiceInk.xcodeproj/project.pbxproj @@ -7,7 +7,7 @@ objects = { /* Begin PBXBuildFile section */ - E12E7E972E3F109C006276F2 /* FluidAudio in Frameworks */ = {isa = PBXBuildFile; productRef = E12E7E962E3F109C006276F2 /* FluidAudio */; }; + E10F06092E3F390600F7FBDC /* FluidAudio in Frameworks */ = {isa = PBXBuildFile; productRef = E10F06082E3F390600F7FBDC /* FluidAudio */; }; E1A261122CC143AC00B233D1 /* KeyboardShortcuts in Frameworks */ = {isa = PBXBuildFile; productRef = E1A261112CC143AC00B233D1 /* KeyboardShortcuts */; }; E1ADD45A2CC5352A00303ECB /* LaunchAtLogin in Frameworks */ = {isa = PBXBuildFile; productRef = E1ADD4592CC5352A00303ECB /* LaunchAtLogin */; }; E1ADD45F2CC544F100303ECB /* Sparkle in Frameworks */ = {isa = PBXBuildFile; productRef = E1ADD45E2CC544F100303ECB /* Sparkle */; }; @@ -82,7 +82,7 @@ files = ( E1ADD45A2CC5352A00303ECB /* LaunchAtLogin in Frameworks */, E1D7EF992E35E16C00640029 /* MediaRemoteAdapter in Frameworks */, - E12E7E972E3F109C006276F2 /* FluidAudio in Frameworks */, + E10F06092E3F390600F7FBDC /* FluidAudio in Frameworks */, E1B2DCAB2E3DE70A008DFD68 /* whisper.xcframework in Frameworks */, E1ADD45F2CC544F100303ECB /* Sparkle in Frameworks */, E1A261122CC143AC00B233D1 /* KeyboardShortcuts in Frameworks */, @@ -162,7 +162,7 @@ E1ADD45E2CC544F100303ECB /* Sparkle */, E1F5FA792DA6CBF900B1FD8A /* Zip */, E1D7EF982E35E16C00640029 /* MediaRemoteAdapter */, - E12E7E962E3F109C006276F2 /* FluidAudio */, + E10F06082E3F390600F7FBDC /* FluidAudio */, ); productName = VoiceInk; productReference = E11473B02CBE0F0A00318EE4 /* VoiceInk.app */; @@ -252,7 +252,7 @@ E1ADD45D2CC544F100303ECB /* XCRemoteSwiftPackageReference "Sparkle" */, E1F5FA782DA6CBF900B1FD8A /* XCRemoteSwiftPackageReference "Zip" */, E1D7EF972E35E16C00640029 /* XCRemoteSwiftPackageReference "mediaremote-adapter" */, - E10A90B12E3F0E0400E5D6AF /* XCLocalSwiftPackageReference "../FluidAudio" */, + E10FFA112E3F37D100F7FBDC /* XCRemoteSwiftPackageReference "FluidAudio" */, ); preferredProjectObjectVersion = 77; productRefGroup = E11473B12CBE0F0A00318EE4 /* Products */; @@ -621,14 +621,15 @@ }; /* End XCConfigurationList section */ -/* Begin XCLocalSwiftPackageReference section */ - E10A90B12E3F0E0400E5D6AF /* XCLocalSwiftPackageReference "../FluidAudio" */ = { - isa = XCLocalSwiftPackageReference; - relativePath = ../FluidAudio; - }; -/* End XCLocalSwiftPackageReference section */ - /* Begin XCRemoteSwiftPackageReference section */ + E10FFA112E3F37D100F7FBDC /* XCRemoteSwiftPackageReference "FluidAudio" */ = { + isa = XCRemoteSwiftPackageReference; + repositoryURL = "https://github.com/FluidInference/FluidAudio"; + requirement = { + branch = main; + kind = branch; + }; + }; E1A261102CC143AC00B233D1 /* XCRemoteSwiftPackageReference "KeyboardShortcuts" */ = { isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/sindresorhus/KeyboardShortcuts"; @@ -663,7 +664,7 @@ }; E1F5FA782DA6CBF900B1FD8A /* XCRemoteSwiftPackageReference "Zip" */ = { isa = XCRemoteSwiftPackageReference; - repositoryURL = "https://github.com/marmelroy/Zip?tab=readme-ov-file"; + repositoryURL = "https://github.com/marmelroy/Zip"; requirement = { kind = upToNextMajorVersion; minimumVersion = 2.1.2; @@ -672,8 +673,9 @@ /* End XCRemoteSwiftPackageReference section */ /* Begin XCSwiftPackageProductDependency section */ - E12E7E962E3F109C006276F2 /* FluidAudio */ = { + E10F06082E3F390600F7FBDC /* FluidAudio */ = { isa = XCSwiftPackageProductDependency; + package = E10FFA112E3F37D100F7FBDC /* XCRemoteSwiftPackageReference "FluidAudio" */; productName = FluidAudio; }; E1A261112CC143AC00B233D1 /* KeyboardShortcuts */ = { diff --git a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index 9435b1e..57c2157 100644 --- a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -1,6 +1,15 @@ { - "originHash" : "ae3f634e8c4b39a1a80bcd04f018e2208c0491e42ee824cd94a92d7b88893420", + "originHash" : "0b9379abd19d2f53581c233273d09235e935a8d2b1180cf253dd69baa2784b39", "pins" : [ + { + "identity" : "fluidaudio", + "kind" : "remoteSourceControl", + "location" : "https://github.com/FluidInference/FluidAudio", + "state" : { + "branch" : "main", + "revision" : "2a3d6a948cb332b3fd8ae479a9942e33ade2cc9e" + } + }, { "identity" : "keyboardshortcuts", "kind" : "remoteSourceControl", @@ -33,14 +42,14 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/sparkle-project/Sparkle", "state" : { - "revision" : "0ca3004e98712ea2b39dd881d28448630cce1c99", - "version" : "2.7.0" + "revision" : "df074165274afaa39539c05d57b0832620775b11", + "version" : "2.7.1" } }, { - "identity" : "zip?tab=readme-ov-file", + "identity" : "zip", "kind" : "remoteSourceControl", - "location" : "https://github.com/marmelroy/Zip?tab=readme-ov-file", + "location" : "https://github.com/marmelroy/Zip", "state" : { "revision" : "67fa55813b9e7b3b9acee9c0ae501def28746d76", "version" : "2.1.2" diff --git a/VoiceInk/Services/ParakeetTranscriptionService.swift b/VoiceInk/Services/ParakeetTranscriptionService.swift index 2aaf2d5..1173ab5 100644 --- a/VoiceInk/Services/ParakeetTranscriptionService.swift +++ b/VoiceInk/Services/ParakeetTranscriptionService.swift @@ -46,27 +46,22 @@ class ParakeetTranscriptionService: TranscriptionService { models = try await AsrModels.downloadAndLoad() } - // Check vocabulary file before initialization - let vocabPath = getVocabularyPath() - let vocabExists = FileManager.default.fileExists(atPath: vocabPath.path) - logger.notice("🦜 Vocabulary file exists at \(vocabPath.lastPathComponent): \(vocabExists)") - - if vocabExists { - do { - let vocabData = try Data(contentsOf: vocabPath) - let vocabDict = try JSONSerialization.jsonObject(with: vocabData) as? [String: String] ?? [:] - logger.notice("🦜 Vocabulary loaded with \(vocabDict.count) entries") - } catch { - logger.notice("🦜 Failed to parse vocabulary file: \(error.localizedDescription)") - } - } - try await asrManager?.initialize(models: models) isModelLoaded = true logger.notice("🦜 Parakeet model loaded successfully") + } catch let error as ASRError { + logger.notice("🦜 Parakeet-specific error loading model: \(error.localizedDescription)") + isModelLoaded = false + asrManager = nil + throw error + } catch let error as AsrModelsError { + logger.notice("🦜 Parakeet model management error loading model: \(error.localizedDescription)") + isModelLoaded = false + asrManager = nil + throw error } catch { - logger.notice("🦜 Failed to load Parakeet model: \(error.localizedDescription)") + logger.notice("🦜 Unexpected error loading Parakeet model: \(error.localizedDescription)") isModelLoaded = false asrManager = nil throw error @@ -74,87 +69,64 @@ class ParakeetTranscriptionService: TranscriptionService { } func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { - do { - - if !isModelLoaded { - try await loadModel() - } - - guard let asrManager = asrManager else { - logger.notice("🦜 ASR manager is nil after model loading") - throw NSError(domain: "ParakeetTranscriptionService", code: -1, userInfo: [NSLocalizedDescriptionKey: "Failed to initialize ASR manager."]) - } - - logger.notice("🦜 Starting Parakeet transcription") - let audioSamples = try readAudioSamples(from: audioURL) - logger.notice("🦜 Audio samples loaded: \(audioSamples.count) samples") - - let result = try await asrManager.transcribe(audioSamples) - logger.notice("🦜 Parakeet transcription completed") - - // Check for empty results (vocabulary issue indicator) - if result.text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { - logger.notice("🦜 Warning: Empty transcription result for \(audioSamples.count) samples - possible vocabulary issue") - } - - if UserDefaults.standard.object(forKey: "IsTextFormattingEnabled") as? Bool ?? true { - return WhisperTextFormatter.format(result.text) - } - return result.text - } catch { - logger.notice("🦜 Parakeet transcription failed: \(error.localizedDescription)") - let errorMessage = error.localizedDescription - await MainActor.run { - NotificationManager.shared.showNotification( - title: "Transcription Failed: \(errorMessage)", - type: .error - ) - } - return "" + if asrManager == nil || !isModelLoaded { + try await loadModel() } + + guard let asrManager = asrManager else { + logger.notice("🦜 Parakeet manager is still nil after attempting to load the model.") + throw ASRError.notInitialized + } + + let audioSamples = try readAudioSamples(from: audioURL) + + // Validate audio data before transcription + guard audioSamples.count >= 16000 else { + logger.notice("🦜 Audio too short for transcription: \(audioSamples.count) samples") + throw ASRError.invalidAudioData + } + + let result = try await asrManager.transcribe(audioSamples) + + Task { + asrManager.cleanup() + isModelLoaded = false + logger.notice("🦜 Parakeet ASR models cleaned up from memory") + } + + // Check for empty results (vocabulary issue indicator) + if result.text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + logger.notice("🦜 Warning: Empty transcription result for \(audioSamples.count) samples - possible vocabulary issue") + } + + if UserDefaults.standard.object(forKey: "IsTextFormattingEnabled") as? Bool ?? true { + return WhisperTextFormatter.format(result.text) + } + return result.text } private func readAudioSamples(from url: URL) throws -> [Float] { - logger.notice("🦜 Reading audio file: \(url.lastPathComponent)") - let data = try Data(contentsOf: url) - logger.notice("🦜 Audio file size: \(data.count) bytes") - - // A basic check, assuming a more robust check happens elsewhere. - guard data.count > 44 else { - logger.notice("🦜 Warning: Audio file too small (\(data.count) bytes), expected > 44 bytes") - return [] - } - - let floats = stride(from: 44, to: data.count, by: 2).map { - return data[$0..<$0 + 2].withUnsafeBytes { - let short = Int16(littleEndian: $0.load(as: Int16.self)) - return max(-1.0, min(Float(short) / 32767.0, 1.0)) + do { + let data = try Data(contentsOf: url) + + // Check minimum file size for valid WAV header + guard data.count > 44 else { + logger.notice("🦜 Audio file too small (\(data.count) bytes), expected > 44 bytes") + throw ASRError.invalidAudioData } - } - - logger.notice("🦜 Processed audio: \(floats.count) samples from \(data.count) bytes") - - // Check if we have enough samples for transcription (minimum 16,000 samples = 1 second at 16kHz) - if floats.count < 16000 { - logger.notice("🦜 Warning: Audio too short (\(floats.count) samples), minimum 16,000 required") - } - - return floats - } - - // Helper function to get vocabulary path based on model directory - private func getVocabularyPath() -> URL { - if let customDirectory = customModelsDirectory { - return customDirectory.appendingPathComponent("parakeet_vocab.json") - } else { - let applicationSupportURL = FileManager.default.urls( - for: .applicationSupportDirectory, in: .userDomainMask - ).first! - return applicationSupportURL - .appendingPathComponent("FluidAudio", isDirectory: true) - .appendingPathComponent("Models", isDirectory: true) - .appendingPathComponent("parakeet-tdt-0.6b-v2-coreml", isDirectory: true) - .appendingPathComponent("parakeet_vocab.json") + + let floats = stride(from: 44, to: data.count, by: 2).map { + return data[$0..<$0 + 2].withUnsafeBytes { + let short = Int16(littleEndian: $0.load(as: Int16.self)) + return max(-1.0, min(Float(short) / 32767.0, 1.0)) + } + } + + return floats + } catch { + logger.notice("🦜 Failed to read audio file: \(error.localizedDescription)") + throw ASRError.invalidAudioData } } + } \ No newline at end of file