diff --git a/VoiceInk.xcodeproj/project.pbxproj b/VoiceInk.xcodeproj/project.pbxproj index c7e5704..7ba5594 100644 --- a/VoiceInk.xcodeproj/project.pbxproj +++ b/VoiceInk.xcodeproj/project.pbxproj @@ -471,21 +471,24 @@ COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 169; DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\""; - DEVELOPMENT_TEAM = V6J6A3VWY2; + DEVELOPMENT_TEAM = QP43ZA49TG; ENABLE_HARDENED_RUNTIME = YES; ENABLE_PREVIEWS = YES; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = VoiceInk/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = VoiceInk; INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.productivity"; + INFOPLIST_KEY_LSUIElement = NO; + INFOPLIST_KEY_NSAppleEventsUsageDescription = "VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations."; INFOPLIST_KEY_NSHumanReadableCopyright = ""; + INFOPLIST_KEY_NSMicrophoneUsageDescription = "VoiceInk needs access to your microphone to record audio for transcription."; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/../Frameworks", ); MACOSX_DEPLOYMENT_TARGET = 14.0; MARKETING_VERSION = 1.69; - PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk; + PRODUCT_BUNDLE_IDENTIFIER = "--com.jakeshore.VoiceInk-com.jakeshore.VoiceInk"; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)"; SWIFT_EMIT_LOC_STRINGS = YES; @@ -505,21 +508,24 @@ COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 169; DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\""; - DEVELOPMENT_TEAM = V6J6A3VWY2; + DEVELOPMENT_TEAM = QP43ZA49TG; ENABLE_HARDENED_RUNTIME = YES; ENABLE_PREVIEWS = YES; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = VoiceInk/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = VoiceInk; INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.productivity"; + INFOPLIST_KEY_LSUIElement = NO; + INFOPLIST_KEY_NSAppleEventsUsageDescription = 
"VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations."; INFOPLIST_KEY_NSHumanReadableCopyright = ""; + INFOPLIST_KEY_NSMicrophoneUsageDescription = "VoiceInk needs access to your microphone to record audio for transcription."; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/../Frameworks", ); MACOSX_DEPLOYMENT_TARGET = 14.0; MARKETING_VERSION = 1.69; - PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk; + PRODUCT_BUNDLE_IDENTIFIER = "--com.jakeshore.VoiceInk-com.jakeshore.VoiceInk"; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_ACTIVE_COMPILATION_CONDITIONS = "ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)"; SWIFT_EMIT_LOC_STRINGS = YES; diff --git a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index 8f05b55..c1dc954 100644 --- a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "93572b72309723585f9fa623350a6b09a152df9dec03f14a5b938629e0f677a0", + "originHash" : "144ae35ef0b62c92588dc767eb6b2d443797062688bf1347662bed55d75a7ec2", "pins" : [ { "identity" : "axswift", @@ -16,7 +16,7 @@ "location" : "https://github.com/FluidInference/FluidAudio", "state" : { "branch" : "main", - "revision" : "ddee663c4a9806d4f139943b0978b0f0a961587b" + "revision" : "11805437821b7e2efc044fc9c5b9b8ce88f6f29f" } }, { @@ -52,7 +52,7 @@ "location" : "https://github.com/ejbills/mediaremote-adapter", "state" : { "branch" : "master", - "revision" : "3529aa25023082a2ceadebcd2c9c4a9430ee96b9" + "revision" : "78aae86c03adab11a7b352211cc82381737cf854" } }, { @@ -69,8 +69,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/sparkle-project/Sparkle", "state" : { - "revision" : "9a1d2a19d3595fcf8d9c447173f9a1687b3dcadb", - "version" : "2.8.0" + "revision" : 
"5581748cef2bae787496fe6d61139aebe0a451f6", + "version" : "2.8.1" } }, { diff --git a/VoiceInk/CoreAudioRecorder.swift b/VoiceInk/CoreAudioRecorder.swift index 09a08fe..49b4a67 100644 --- a/VoiceInk/CoreAudioRecorder.swift +++ b/VoiceInk/CoreAudioRecorder.swift @@ -48,6 +48,9 @@ final class CoreAudioRecorder { private var renderBuffer: UnsafeMutablePointer? private var renderBufferSize: UInt32 = 0 + // Streaming callback for real-time audio processing (called from audio thread) + var streamingAudioCallback: ((_ samples: UnsafePointer, _ frameCount: UInt32, _ sampleRate: Double, _ channelCount: UInt32) -> Void)? + // MARK: - Initialization init() {} @@ -541,7 +544,6 @@ final class CoreAudioRecorder { inBusNumber: UInt32, inNumberFrames: UInt32 ) -> OSStatus { - guard let audioUnit = audioUnit, isRecording, let renderBuf = renderBuffer else { return noErr } @@ -581,6 +583,11 @@ final class CoreAudioRecorder { return status } + // Call streaming callback with raw audio samples (for real-time transcription) + if let callback = streamingAudioCallback { + callback(renderBuf, inNumberFrames, deviceFormat.mSampleRate, channelCount) + } + // Calculate audio meters from input buffer calculateMeters(from: &bufferList, frameCount: inNumberFrames) diff --git a/VoiceInk/CursorPaster.swift b/VoiceInk/CursorPaster.swift index c8954dd..1535ca5 100644 --- a/VoiceInk/CursorPaster.swift +++ b/VoiceInk/CursorPaster.swift @@ -1,12 +1,31 @@ import Foundation import AppKit +import os.log class CursorPaster { + private static let logger = Logger(subsystem: "com.jakeshore.VoiceInk", category: "CursorPaster") + + // MARK: - Streaming Mode + // When streaming is active, we skip clipboard save/restore to avoid conflicts + // with rapid consecutive paste operations + private static var isStreamingMode: Bool = false + + /// Enable or disable streaming mode. When enabled, clipboard save/restore is skipped + /// to prevent race conditions during rapid streaming text updates. 
+ static func setStreamingMode(_ enabled: Bool) { + isStreamingMode = enabled + logger.notice("📋 Streaming mode \(enabled ? "enabled" : "disabled")") + } static func pasteAtCursor(_ text: String) { + logger.notice("📋 pasteAtCursor called with \(text.count) chars: '\(text.prefix(50))...'") + logger.notice("📋 AXIsProcessTrusted = \(AXIsProcessTrusted())") let pasteboard = NSPasteboard.general - // Default to true if not explicitly set by user - let shouldRestoreClipboard = UserDefaults.standard.object(forKey: "restoreClipboardAfterPaste") as? Bool ?? true + + // During streaming mode, skip clipboard save/restore to avoid race conditions + // with rapid consecutive paste operations + let userWantsRestore = UserDefaults.standard.object(forKey: "restoreClipboardAfterPaste") as? Bool ?? true + let shouldRestoreClipboard = userWantsRestore && !isStreamingMode var savedContents: [(NSPasteboard.PasteboardType, Data)] = [] @@ -67,25 +86,29 @@ class CursorPaster { } private static func pasteUsingCommandV() { + logger.notice("📋 pasteUsingCommandV called") guard AXIsProcessTrusted() else { + logger.error("❌ pasteUsingCommandV: AXIsProcessTrusted() returned false!") return } - + let source = CGEventSource(stateID: .hidSystemState) - + let cmdDown = CGEvent(keyboardEventSource: source, virtualKey: 0x37, keyDown: true) let vDown = CGEvent(keyboardEventSource: source, virtualKey: 0x09, keyDown: true) let vUp = CGEvent(keyboardEventSource: source, virtualKey: 0x09, keyDown: false) let cmdUp = CGEvent(keyboardEventSource: source, virtualKey: 0x37, keyDown: false) - + cmdDown?.flags = .maskCommand vDown?.flags = .maskCommand vUp?.flags = .maskCommand - + cmdUp?.flags = .maskCommand // Fix: cmdUp also needs .maskCommand flag + cmdDown?.post(tap: .cghidEventTap) vDown?.post(tap: .cghidEventTap) vUp?.post(tap: .cghidEventTap) cmdUp?.post(tap: .cghidEventTap) + logger.notice("📋 pasteUsingCommandV: Posted Cmd+V events") } // Simulate pressing the Return / Enter key @@ -97,4 +120,32 @@ 
class CursorPaster { enterDown?.post(tap: .cghidEventTap) enterUp?.post(tap: .cghidEventTap) } + + /// Deletes the specified number of characters by simulating backspace key presses + /// Includes inter-key delays to ensure reliable deletion across all applications + static func deleteCharacters(count: Int) { + logger.notice("📋 deleteCharacters called with count=\(count)") + guard AXIsProcessTrusted() else { + logger.error("❌ deleteCharacters: AXIsProcessTrusted() returned false!") + return + } + guard count > 0 else { return } + + let source = CGEventSource(stateID: .hidSystemState) + let backspaceKeyCode: CGKeyCode = 0x33 // Backspace key + + for i in 0.. - SUEnableInstallerLauncherService - - SUFeedURL - https://beingpax.github.io/VoiceInk/appcast.xml - SUPublicEDKey - rLRdZIjK3gHKfqNlAF9nT7FbjwSvwkJ8BVn0v2mD1Mo= - LSUIElement - - SUEnableAutomaticChecks - - NSMicrophoneUsageDescription - VoiceInk needs access to your microphone to record audio for transcription. - NSAppleEventsUsageDescription - VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations. - NSScreenCaptureUsageDescription - VoiceInk needs screen recording access to understand context from your screen for improved transcription accuracy. CFBundleDocumentTypes - CFBundleTypeName - Audio/Video File - CFBundleTypeRole - Viewer - LSHandlerRank - Alternate - LSItemContentTypes - - public.audio - public.movie - CFBundleTypeExtensions wav @@ -44,7 +17,28 @@ flac caf + CFBundleTypeName + Audio/Video File + CFBundleTypeRole + Viewer + LSHandlerRank + Alternate + LSItemContentTypes + + public.audio + public.movie + + NSScreenCaptureUsageDescription + VoiceInk needs screen recording access to understand context from your screen for improved transcription accuracy. 
+ SUEnableAutomaticChecks + + SUEnableInstallerLauncherService + + SUFeedURL + https://beingpax.github.io/VoiceInk/appcast.xml + SUPublicEDKey + rLRdZIjK3gHKfqNlAF9nT7FbjwSvwkJ8BVn0v2mD1Mo= diff --git a/VoiceInk/Models/LicenseViewModel.swift b/VoiceInk/Models/LicenseViewModel.swift index e867d52..32d1e05 100644 --- a/VoiceInk/Models/LicenseViewModel.swift +++ b/VoiceInk/Models/LicenseViewModel.swift @@ -19,9 +19,11 @@ class LicenseViewModel: ObservableObject { private let polarService = PolarService() private let userDefaults = UserDefaults.standard private let licenseManager = LicenseManager.shared + private var isInitializing = true init() { loadLicenseState() + isInitializing = false } func startTrial() { @@ -29,7 +31,10 @@ class LicenseViewModel: ObservableObject { if licenseManager.trialStartDate == nil { licenseManager.trialStartDate = Date() licenseState = .trial(daysRemaining: trialPeriodDays) - NotificationCenter.default.post(name: .licenseStatusChanged, object: nil) + // Don't post notification during initialization to prevent recursive loop + if !isInitializing { + NotificationCenter.default.post(name: .licenseStatusChanged, object: nil) + } } } diff --git a/VoiceInk/PlaybackController.swift b/VoiceInk/PlaybackController.swift index 54b280f..5a6b7a5 100644 --- a/VoiceInk/PlaybackController.swift +++ b/VoiceInk/PlaybackController.swift @@ -40,7 +40,7 @@ class PlaybackController: ObservableObject { private func setupMediaControllerCallbacks() { mediaController.onTrackInfoReceived = { [weak self] trackInfo in - self?.isMediaPlaying = trackInfo.payload.isPlaying ?? false + self?.isMediaPlaying = trackInfo?.payload.isPlaying ?? false self?.lastKnownTrackInfo = trackInfo } diff --git a/VoiceInk/Recorder.swift b/VoiceInk/Recorder.swift index ed95f78..ed27eb8 100644 --- a/VoiceInk/Recorder.swift +++ b/VoiceInk/Recorder.swift @@ -18,6 +18,9 @@ class Recorder: NSObject, ObservableObject { private var audioMeterUpdateTask: Task? 
private var audioRestorationTask: Task? private var hasDetectedAudioInCurrentSession = false + + /// Stored streaming callback - applied when CoreAudioRecorder is created + private var pendingStreamingCallback: ((_ samples: UnsafePointer, _ frameCount: UInt32, _ sampleRate: Double, _ channelCount: UInt32) -> Void)? enum RecorderError: Error { case couldNotStartRecording @@ -127,6 +130,12 @@ class Recorder: NSObject, ObservableObject { let coreAudioRecorder = CoreAudioRecorder() recorder = coreAudioRecorder + // Apply any pending streaming callback that was set before recording started + if let callback = pendingStreamingCallback { + coreAudioRecorder.streamingAudioCallback = callback + logger.notice("🎙️ Applied pending streaming callback to recorder") + } + try coreAudioRecorder.startRecording(toOutputFile: url, deviceID: deviceID) audioRestorationTask?.cancel() @@ -179,6 +188,7 @@ class Recorder: NSObject, ObservableObject { func stopRecording() { audioLevelCheckTask?.cancel() audioMeterUpdateTask?.cancel() + recorder?.streamingAudioCallback = nil // Clear streaming callback recorder?.stopRecording() recorder = nil audioMeter = AudioMeter(averagePower: 0, peakPower: 0) @@ -190,6 +200,15 @@ class Recorder: NSObject, ObservableObject { deviceManager.isRecordingActive = false } + /// Sets a callback to receive real-time audio samples for streaming transcription. + /// The callback is invoked on the audio thread - do not perform blocking operations. + /// Note: The callback is stored and applied when recording starts (CoreAudioRecorder is created lazily). + func setStreamingAudioCallback(_ callback: ((_ samples: UnsafePointer, _ frameCount: UInt32, _ sampleRate: Double, _ channelCount: UInt32) -> Void)?) 
{ + pendingStreamingCallback = callback + // Also apply immediately if recorder already exists + recorder?.streamingAudioCallback = callback + } + private func handleRecordingError(_ error: Error) async { logger.error("❌ Recording error occurred: \(error.localizedDescription)") diff --git a/VoiceInk/Services/ParakeetTranscriptionService.swift b/VoiceInk/Services/ParakeetTranscriptionService.swift index 82927a5..ada1a31 100644 --- a/VoiceInk/Services/ParakeetTranscriptionService.swift +++ b/VoiceInk/Services/ParakeetTranscriptionService.swift @@ -21,6 +21,17 @@ class ParakeetTranscriptionService: TranscriptionService { private var activeVersion: AsrModelVersion? private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink.parakeet", category: "ParakeetTranscriptionService") + init() { + logger.notice("🆕 ParakeetTranscriptionService initialized (v4 - raw audio, no preprocessing)") + } + + // MARK: - Streaming Properties (using StreamingEouAsrManager for low-latency 160ms chunks) + private var streamingEouManager: StreamingEouAsrManager? + private var streamingTask: Task? + private var streamingContinuation: AsyncStream.Continuation? + private var streamAudioCallCount = 0 + private var lastPartialTranscript: String = "" + private func version(for model: any TranscriptionModel) -> AsrModelVersion { model.name.lowercased().contains("v2") ? .v2 : .v3 } @@ -121,4 +132,190 @@ class ParakeetTranscriptionService: TranscriptionService { vadManager = nil activeVersion = nil } + + // MARK: - Streaming Transcription (Low-Latency EOU Mode) + + /// Gets the directory for EOU streaming models + private func getEouModelsDirectory() -> URL { + let applicationSupportURL = FileManager.default.urls( + for: .applicationSupportDirectory, in: .userDomainMask + ).first! 
+ let appDirectory = applicationSupportURL.appendingPathComponent("FluidAudio", isDirectory: true) + return appDirectory.appendingPathComponent("Models/parakeet-eou-streaming/160ms", isDirectory: true) + } + + /// Downloads EOU models if not already present + private func ensureEouModelsDownloaded() async throws -> URL { + let modelsDir = getEouModelsDirectory() + let encoderPath = modelsDir.appendingPathComponent("streaming_encoder.mlmodelc") + + if !FileManager.default.fileExists(atPath: encoderPath.path) { + logger.notice("🎙️ Downloading Parakeet EOU 160ms models for streaming preview...") + let baseDir = modelsDir.deletingLastPathComponent().deletingLastPathComponent() + try await DownloadUtils.downloadRepo(.parakeetEou160, to: baseDir) + logger.notice("🎙️ EOU 160ms models downloaded successfully") + } + + return modelsDir + } + + /// Starts a streaming transcription session using StreamingEouAsrManager for near-instant results. + /// Uses 160ms chunks for lowest latency (~160ms between updates). + /// Returns an AsyncStream that emits transcription text updates as they arrive. 
+ func startStreaming(model: ParakeetModel) async throws -> AsyncStream { + logger.notice("🎙️ Starting low-latency EOU streaming transcription") + + // Reset state + streamAudioCallCount = 0 + lastPartialTranscript = "" + + // Download EOU models if needed + let modelsDir = try await ensureEouModelsDownloaded() + + // Create StreamingEouAsrManager with 160ms chunks for lowest latency preview + // In HYBRID mode: streaming is just for visual feedback, batch provides accuracy + // EOU debounce of 1280ms means end-of-utterance detection after ~1.3s of silence + let manager = StreamingEouAsrManager(chunkSize: .ms160, eouDebounceMs: 1280) + streamingEouManager = manager + + // Load Parakeet EOU models + try await manager.loadModels(modelDir: modelsDir) + + logger.notice("🎙️ EOU streaming preview started with 160ms chunks (batch will provide accuracy)") + + // Create stream using makeStream for proper continuation management + let (stream, continuation) = AsyncStream.makeStream() + self.streamingContinuation = continuation + + // Set up partial callback BEFORE returning the stream (fixes race condition) + await manager.setPartialCallback { [weak self] partialText in + guard let self = self else { return } + let trimmed = partialText.trimmingCharacters(in: .whitespaces) + if !trimmed.isEmpty && trimmed != self.lastPartialTranscript { + self.lastPartialTranscript = trimmed + self.logger.notice("🎙️ Partial update: '\(trimmed.prefix(50))...'") + continuation.yield(trimmed) + } + } + + // Note: Removed onTermination callback that called cancelStreaming() + // This was causing a race condition where the manager was nullified + // before finishStreaming() could call manager.finish() + // Cleanup is handled by finishStreaming()'s defer block instead + + logger.notice("🎙️ Callback registered, streaming ready") + return stream + } + + /// Feeds raw audio samples to the streaming EOU transcription engine. 
+ /// Called from the audio thread - creates AVAudioPCMBuffer and forwards to manager. + /// SDK handles resampling to 16kHz internally. No preprocessing applied (research shows it hurts accuracy). + func streamAudio(samples: UnsafePointer, frameCount: UInt32, sampleRate: Double, channels: UInt32) { + streamAudioCallCount += 1 + + // Create buffer at original sample rate + // SDK's process() method handles resampling to 16kHz internally via AudioConverter + guard let audioBuffer = createOriginalFormatBuffer(samples: samples, frameCount: frameCount, sampleRate: sampleRate, channels: channels) else { + if streamAudioCallCount <= 5 { + logger.warning("Failed to create audio buffer at chunk #\(self.streamAudioCallCount)") + } + return + } + + guard streamingEouManager != nil else { + return + } + + // StreamingEouAsrManager.process is an actor method, dispatch to avoid blocking audio thread + Task.detached { [weak self, audioBuffer] in + do { + _ = try await self?.streamingEouManager?.process(audioBuffer: audioBuffer) + } catch { + self?.logger.warning("EOU process error: \(error.localizedDescription)") + } + } + } + + /// Creates a MONO AVAudioPCMBuffer from interleaved input samples. + /// No preprocessing - research shows gain control and noise reduction HURT ASR accuracy. + /// Just converts stereo to mono if needed, passes raw audio otherwise. + private func createOriginalFormatBuffer(samples: UnsafePointer, frameCount: UInt32, sampleRate: Double, channels: UInt32) -> AVAudioPCMBuffer? 
{ + // Create MONO non-interleaved format - simplest format for ASR + guard let format = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: sampleRate, + channels: 1, // Output is MONO + interleaved: false + ) else { + return nil + } + + guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else { + return nil + } + + buffer.frameLength = frameCount + + guard let monoData = buffer.floatChannelData?[0] else { + return nil + } + + let channelCount = Int(channels) + let frames = Int(frameCount) + + if channelCount == 1 { + // Already mono - direct copy (no gain, no processing) + for frame in 0.. String { + defer { + streamingTask?.cancel() + streamingTask = nil + streamingContinuation?.finish() + streamingContinuation = nil + streamingEouManager = nil + lastPartialTranscript = "" + } + + guard let manager = streamingEouManager else { + return "" + } + let finalText = try await manager.finish() + logger.notice("🎙️ EOU streaming finished with \(finalText.count) characters") + return finalText + } + + /// Cancels the streaming session without returning results. + func cancelStreaming() async { + streamingTask?.cancel() + streamingTask = nil + streamingContinuation?.finish() + streamingContinuation = nil + + if let manager = streamingEouManager { + await manager.reset() + streamingEouManager = nil + lastPartialTranscript = "" + logger.notice("🎙️ Cancelled EOU streaming transcription") + } + } + } diff --git a/VoiceInk/Whisper/WhisperState.swift b/VoiceInk/Whisper/WhisperState.swift index aedb491..0aae02e 100644 --- a/VoiceInk/Whisper/WhisperState.swift +++ b/VoiceInk/Whisper/WhisperState.swift @@ -28,6 +28,11 @@ class WhisperState: NSObject, ObservableObject { @Published var miniRecorderError: String? @Published var shouldCancelRecording = false + // MARK: - Streaming Transcription Properties + private var streamingUpdateTask: Task? 
+ private var lastStreamedText: String = "" + private var isStreamingActive: Bool = false + @Published var recorderType: String = UserDefaults.standard.string(forKey: "RecorderType") ?? "mini" { didSet { @@ -100,7 +105,12 @@ class WhisperState: NSObject, ObservableObject { // For model progress tracking @Published var downloadProgress: [String: Double] = [:] @Published var parakeetDownloadStates: [String: Bool] = [:] - + + /// Returns true if the current transcription model supports streaming (Parakeet only) + var isStreamingSupported: Bool { + currentTranscriptionModel?.provider == .parakeet + } + init(modelContext: ModelContext, enhancementService: AIEnhancementService? = nil) { self.modelContext = modelContext let appSupportDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0] @@ -141,28 +151,41 @@ class WhisperState: NSObject, ObservableObject { func toggleRecord(powerModeId: UUID? = nil) async { if recordingState == .recording { await recorder.stopRecording() - if let recordedFile { - if !shouldCancelRecording { - let audioAsset = AVURLAsset(url: recordedFile) - let duration = (try? CMTimeGetSeconds(await audioAsset.load(.duration))) ?? 0.0 - let transcription = Transcription( - text: "", - duration: duration, - audioFileURL: recordedFile.absoluteString, - transcriptionStatus: .pending - ) - modelContext.insert(transcription) - try? 
modelContext.save() - NotificationCenter.default.post(name: .transcriptionCreated, object: transcription) - - await transcribeAudio(on: transcription) - } else { - await MainActor.run { - recordingState = .idle - } - await cleanupModelResources() + // Handle cancellation - clean up streaming if active + if shouldCancelRecording { + if isStreamingActive { + await cancelStreamingTranscription() } + await MainActor.run { + recordingState = .idle + } + await cleanupModelResources() + return + } + + // Handle streaming transcription completion + if isStreamingActive { + await handleStreamingCompletion() + return + } + + // Non-streaming (batch) transcription + if let recordedFile { + let audioAsset = AVURLAsset(url: recordedFile) + let duration = (try? CMTimeGetSeconds(await audioAsset.load(.duration))) ?? 0.0 + + let transcription = Transcription( + text: "", + duration: duration, + audioFileURL: recordedFile.absoluteString, + transcriptionStatus: .pending + ) + modelContext.insert(transcription) + try? modelContext.save() + NotificationCenter.default.post(name: .transcriptionCreated, object: transcription) + + await transcribeAudio(on: transcription) } else { logger.error("❌ No recorded file found after stopping recording") await MainActor.run { @@ -189,7 +212,16 @@ class WhisperState: NSObject, ObservableObject { let permanentURL = self.recordingsDirectory.appendingPathComponent(fileName) self.recordedFile = permanentURL + // IMPORTANT: Set up streaming BEFORE starting recording to avoid losing early audio + // Check if we're using a Parakeet model and set up streaming first + let isParakeetModel = self.currentTranscriptionModel is ParakeetModel + if isParakeetModel { + self.logger.notice("🎙️ Detected Parakeet model, setting up streaming BEFORE recording...") + await self.startStreamingTranscription() + } + try await self.recorder.startRecording(toOutputFile: permanentURL) + self.logger.notice("🎙️ Recording started\(isParakeetModel ? 
" (streaming already active)" : "")") await MainActor.run { self.recordingState = .recording @@ -202,9 +234,19 @@ class WhisperState: NSObject, ObservableObject { // Load model and capture context in background without blocking Task.detached { [weak self] in - guard let self = self else { return } + guard let self = self else { + print("⚠️ Self was deallocated in Task.detached!") + return + } + + // Debug: Check what model type we have + let modelType = await type(of: self.currentTranscriptionModel) + let modelName = await self.currentTranscriptionModel?.displayName ?? "nil" + print("🔍 DEBUG: Model type = \(modelType), name = \(modelName)") + print("🔍 DEBUG: Is ParakeetModel? \(await self.currentTranscriptionModel is ParakeetModel)") // Only load model if it's a local model and not already loaded + // Note: Parakeet streaming is now set up BEFORE recording starts (above) if let model = await self.currentTranscriptionModel, model.provider == .local { if let localWhisperModel = await self.availableModels.first(where: { $0.name == model.name }), await self.whisperContext == nil { @@ -214,8 +256,10 @@ class WhisperState: NSObject, ObservableObject { await self.logger.error("❌ Model loading failed: \(error.localizedDescription)") } } - } else if let parakeetModel = await self.currentTranscriptionModel as? ParakeetModel { - try? await self.serviceRegistry.parakeetTranscriptionService.loadModel(for: parakeetModel) + } else if !(await self.currentTranscriptionModel is ParakeetModel) { + // Non-Parakeet, non-local models - just log + let modelDesc = await self.currentTranscriptionModel?.displayName ?? 
"nil" + await self.logger.notice("🎙️ Model is not local or Parakeet: \(modelDesc)") } if let enhancementService = await self.enhancementService { @@ -244,7 +288,320 @@ class WhisperState: NSObject, ObservableObject { private func requestRecordPermission(response: @escaping (Bool) -> Void) { response(true) } - + + // MARK: - Streaming Transcription Methods + + /// Starts streaming transcription for Parakeet models + private func startStreamingTranscription() async { + guard let parakeetModel = currentTranscriptionModel as? ParakeetModel else { return } + + // Capture direct reference to the service to avoid @MainActor isolation issues in audio callback + let parakeetService = serviceRegistry.parakeetTranscriptionService + + // Set up audio callback BEFORE starting streaming to avoid losing early audio + // Note: callback runs on audio thread, so we capture parakeetService directly + // Audio will be silently dropped until manager is created (streamAudio has a guard) + logger.notice("🎙️ Setting up streaming audio callback") + recorder.setStreamingAudioCallback { samples, frameCount, sampleRate, channels in + parakeetService.streamAudio( + samples: samples, + frameCount: frameCount, + sampleRate: sampleRate, + channels: channels + ) + } + + do { + let transcriptStream = try await parakeetService.startStreaming(model: parakeetModel) + + isStreamingActive = true + lastStreamedText = "" + + // Enable streaming mode in CursorPaster to skip clipboard save/restore + // This prevents race conditions during rapid paste operations + CursorPaster.setStreamingMode(true) + + // Start task to handle streaming updates + logger.notice("🎙️ Starting streaming update task...") + streamingUpdateTask = Task { + self.logger.notice("🎙️ Streaming update task running, waiting for transcripts...") + for await text in transcriptStream { + self.logger.notice("🎙️ Got transcript from stream: '\(text.prefix(30))...'") + await self.handleStreamingUpdate(text) + } + self.logger.notice("🎙️ Streaming 
update task ended") + } + + logger.notice("🎙️ Started streaming transcription - all setup complete") + } catch { + logger.error("❌ Failed to start streaming transcription: \(error.localizedDescription)") + isStreamingActive = false + } + } + + /// Handles incoming streaming transcription updates by pasting text to active app + /// Optimized to use differential updates when possible to reduce flicker + private func handleStreamingUpdate(_ newText: String) async { + guard isStreamingActive else { return } + + await MainActor.run { + let oldText = self.lastStreamedText + + // Optimization: If new text starts with old text, just append the delta + // This is the common case during continuous speech and avoids flicker + if newText.hasPrefix(oldText) && !oldText.isEmpty { + let deltaText = String(newText.dropFirst(oldText.count)) + if !deltaText.isEmpty { + self.lastStreamedText = newText + CursorPaster.pasteAtCursor(deltaText) + self.logger.notice("🎙️ Appended delta: '\(deltaText.prefix(30))...'") + } + return + } + + // Full replacement needed (model corrected itself or first update) + let charsToDelete = oldText.count + + // Step 1: Delete previously streamed text + if charsToDelete > 0 { + CursorPaster.deleteCharacters(count: charsToDelete) + } + + // Step 2: Wait for deletions to complete before pasting + let deleteWaitTime = max(0.02, Double(charsToDelete) * 0.002) // ~2ms per char, min 20ms + + DispatchQueue.main.asyncAfter(deadline: .now() + deleteWaitTime) { [weak self] in + guard let self = self, self.isStreamingActive else { return } + + self.lastStreamedText = newText + CursorPaster.pasteAtCursor(newText) + self.logger.notice("🎙️ Full replacement: '\(newText.prefix(30))...'") + } + } + } + + /// Finishes streaming and returns the final transcription text + private func finishStreamingTranscription() async -> String? 
{
        // Nothing to finish if streaming never started (or was already torn down).
        guard isStreamingActive else { return nil }

        // Stop receiving updates
        streamingUpdateTask?.cancel()
        streamingUpdateTask = nil

        // Clear the audio callback
        recorder.setStreamingAudioCallback(nil)

        // Get final text
        var finalText: String
        do {
            finalText = try await serviceRegistry.parakeetTranscriptionService.finishStreaming()
            // If EOU returns empty but we have streamed text, use that as fallback
            if finalText.isEmpty && !self.lastStreamedText.isEmpty {
                logger.warning("⚠️ EOU returned empty, using lastStreamedText fallback (\(self.lastStreamedText.count) chars)")
                finalText = self.lastStreamedText
            }
        } catch {
            // NOTE(review): errors from finishStreaming() are logged but otherwise
            // swallowed; the last streamed preview becomes the "final" text.
            logger.error("❌ Failed to finish streaming: \(error.localizedDescription)")
            finalText = self.lastStreamedText // Fall back to last streamed text
        }

        // Delete the streamed preview text (will be replaced by batch transcription in hybrid mode)
        await MainActor.run {
            if !self.lastStreamedText.isEmpty {
                CursorPaster.deleteCharacters(count: self.lastStreamedText.count)
            }
        }

        self.isStreamingActive = false
        self.lastStreamedText = ""

        // Disable streaming mode - clipboard operations can resume normally
        CursorPaster.setStreamingMode(false)

        logger.notice("🎙️ Finished streaming transcription: \(finalText.count) characters")
        return finalText
    }

    /// Cancels streaming transcription: stops the update task and audio callback,
    /// tells the service to abort, deletes any pasted preview text, and resets
    /// streaming state. Unlike finishStreamingTranscription(), no text is returned.
    private func cancelStreamingTranscription() async {
        guard isStreamingActive else { return }

        streamingUpdateTask?.cancel()
        streamingUpdateTask = nil
        recorder.setStreamingAudioCallback(nil)

        await serviceRegistry.parakeetTranscriptionService.cancelStreaming()

        // Delete any streamed text
        await MainActor.run {
            if !lastStreamedText.isEmpty {
                CursorPaster.deleteCharacters(count: lastStreamedText.count)
            }
        }

        isStreamingActive = false
        lastStreamedText = ""

        // Disable streaming mode - clipboard operations can resume normally
        CursorPaster.setStreamingMode(false)

        logger.notice("🎙️ Cancelled streaming transcription")
    }

    /// Handles completion of streaming transcription using HYBRID approach:
    /// 1. Streaming provided real-time preview (low accuracy, fast)
    /// 2. Now run BATCH transcription for accurate final result
    ///
    /// Pipeline: tear down streaming → play stop sound → create a pending
    /// Transcription record → batch-transcribe (preferring a downloaded Whisper
    /// large-v3-turbo over Parakeet) → filter/format/word-replace → optional AI
    /// enhancement → save + notify → paste final text at the cursor.
    private func handleStreamingCompletion() async {
        guard let recordedFile = recordedFile else {
            await MainActor.run {
                recordingState = .idle
            }
            return
        }

        // Step 1: Clean up streaming and delete the preview text
        // We discard the streaming result and use batch transcription for accuracy
        _ = await finishStreamingTranscription()

        // If there was streamed text, it's already been deleted by finishStreamingTranscription()
        // Now we'll paste the accurate batch result

        // Play stop sound
        Task {
            let isSystemMuteEnabled = UserDefaults.standard.bool(forKey: "isSystemMuteEnabled")
            if isSystemMuteEnabled {
                // Brief delay so the stop sound isn't clipped while system audio un-mutes.
                try? await Task.sleep(nanoseconds: 200_000_000)
            }
            await MainActor.run {
                SoundManager.shared.playStopSound()
            }
        }

        // Step 2: Switch to transcribing state for batch processing
        await MainActor.run {
            recordingState = .transcribing
        }

        logger.notice("🎙️ HYBRID: Streaming preview done, now running accurate batch transcription...")

        // Get audio duration
        let audioAsset = AVURLAsset(url: recordedFile)
        let duration = (try? CMTimeGetSeconds(await audioAsset.load(.duration))) ?? 0.0

        // Create transcription record
        let transcription = Transcription(
            text: "",
            duration: duration,
            audioFileURL: recordedFile.absoluteString,
            transcriptionStatus: .pending
        )
        modelContext.insert(transcription)
        // NOTE(review): save errors are ignored via try? here and below — confirm
        // that losing a record on a failed save is acceptable.
        try? modelContext.save()
        NotificationCenter.default.post(name: .transcriptionCreated, object: transcription)

        // Step 3: Run BATCH transcription for accurate result
        // HYBRID MODE: Prefer Whisper for accuracy (2.7% WER) over Parakeet (6.05% WER)
        var text: String
        do {
            guard let model = currentTranscriptionModel else {
                throw WhisperStateError.transcriptionFailed
            }

            // Check if we should prefer Whisper for better accuracy
            var transcriptionModel: any TranscriptionModel = model
            var usedWhisper = false

            if model is ParakeetModel {
                // Parakeet was selected for streaming, but check if Whisper is available for better batch accuracy
                // Look for Whisper large-v3-turbo in available models (2.7% WER vs Parakeet's 6.05%)
                if let turboModel = allAvailableModels.first(where: {
                    $0.provider == .local && $0.name.contains("large-v3-turbo")
                }) {
                    // Check if this model is actually downloaded
                    let isDownloaded = availableModels.contains(where: { $0.name == turboModel.name })
                    if isDownloaded {
                        transcriptionModel = turboModel
                        usedWhisper = true
                        logger.notice("🎙️ HYBRID: Using Whisper turbo for accuracy: \(turboModel.name)")
                    }
                }
            }

            text = try await serviceRegistry.transcribe(audioURL: recordedFile, model: transcriptionModel)
            logger.notice("🎙️ HYBRID: Batch transcription complete\(usedWhisper ? " (Whisper)" : ""): \(text.prefix(50))...")
        } catch {
            // On failure: persist the failed state, reset UI, and bail out.
            logger.error("❌ Batch transcription failed: \(error.localizedDescription)")
            transcription.text = "Transcription Failed: \(error.localizedDescription)"
            transcription.transcriptionStatus = TranscriptionStatus.failed.rawValue
            try? modelContext.save()
            await MainActor.run {
                recordingState = .idle
            }
            await dismissMiniRecorder()
            return
        }

        // Step 4: Apply post-processing pipeline
        text = TranscriptionOutputFilter.filter(text)

        // Formatting defaults to ON when the preference has never been set.
        let shouldFormatText = UserDefaults.standard.object(forKey: "EnableTextFormatting") as? Bool ?? true
        if shouldFormatText {
            text = WhisperTextFormatter.format(text)
        }

        text = WordReplacementService.shared.applyReplacements(to: text, using: modelContext)

        // Update transcription record
        transcription.text = text
        // NOTE(review): this records the originally selected model's display name
        // even when the hybrid path actually used Whisper turbo for the batch
        // pass (transcriptionModel above) — confirm this is intended.
        transcription.transcriptionModelName = currentTranscriptionModel?.displayName

        // AI Enhancement (if enabled)
        var enhancedText: String?
        if let enhancementService = enhancementService,
           enhancementService.isEnhancementEnabled,
           enhancementService.isConfigured {
            await MainActor.run {
                recordingState = .enhancing
            }

            do {
                let (enhanced, enhancementDuration, promptName) = try await enhancementService.enhance(text)
                enhancedText = enhanced
                transcription.enhancedText = enhanced
                transcription.enhancementDuration = enhancementDuration
                transcription.promptName = promptName
            } catch {
                // Enhancement is best-effort; fall back to the un-enhanced text.
                logger.error("❌ Enhancement failed: \(error.localizedDescription)")
            }
        }

        // Mark transcription as complete
        transcription.transcriptionStatus = TranscriptionStatus.completed.rawValue
        try? modelContext.save()

        NotificationCenter.default.post(name: .transcriptionCompleted, object: transcription)

        // Step 5: Paste the accurate final text
        let finalText = enhancedText ?? text
        await MainActor.run {
            // Small delay before pasting — presumably to let focus/clipboard
            // settle after the recorder UI changes state; confirm.
            DispatchQueue.main.asyncAfter(deadline: .now() + 0.05) {
                CursorPaster.pasteAtCursor(finalText + " ")

                // Auto-send if Power Mode enabled
                let powerMode = PowerModeManager.shared
                if let activeConfig = powerMode.currentActiveConfiguration,
                   activeConfig.isAutoSendEnabled {
                    CursorPaster.pressEnter()
                }
            }
        }

        await MainActor.run {
            recordingState = .idle
        }
        await dismissMiniRecorder()
    }
    private func transcribeAudio(on transcription: Transcription) async {
        guard let urlString = transcription.audioFileURL,
              let url = URL(string: urlString) else {
            logger.error("❌ Invalid audio file URL in transcription object.")
diff --git a/default.profraw b/default.profraw
new file mode 100644
index 0000000..e69de29