Add hybrid streaming transcription for improved accuracy
- Implement real-time streaming preview using Parakeet EOU (160ms chunks)
- Add batch transcription on completion for an accurate final result
- Prefer Whisper large-v3-turbo (2.7% WER) over Parakeet (6.05% WER) when available
- Remove audio preprocessing that hurts ASR accuracy (gain control, noise reduction)
- Add streaming audio callback support in Recorder and CoreAudioRecorder
- Pass raw audio through unchanged; the SDK handles resampling internally

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
652859414c
commit
de1c1e51aa
@ -471,21 +471,24 @@
|
|||||||
COMBINE_HIDPI_IMAGES = YES;
|
COMBINE_HIDPI_IMAGES = YES;
|
||||||
CURRENT_PROJECT_VERSION = 169;
|
CURRENT_PROJECT_VERSION = 169;
|
||||||
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
|
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
|
||||||
DEVELOPMENT_TEAM = V6J6A3VWY2;
|
DEVELOPMENT_TEAM = QP43ZA49TG;
|
||||||
ENABLE_HARDENED_RUNTIME = YES;
|
ENABLE_HARDENED_RUNTIME = YES;
|
||||||
ENABLE_PREVIEWS = YES;
|
ENABLE_PREVIEWS = YES;
|
||||||
GENERATE_INFOPLIST_FILE = YES;
|
GENERATE_INFOPLIST_FILE = YES;
|
||||||
INFOPLIST_FILE = VoiceInk/Info.plist;
|
INFOPLIST_FILE = VoiceInk/Info.plist;
|
||||||
INFOPLIST_KEY_CFBundleDisplayName = VoiceInk;
|
INFOPLIST_KEY_CFBundleDisplayName = VoiceInk;
|
||||||
INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.productivity";
|
INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.productivity";
|
||||||
|
INFOPLIST_KEY_LSUIElement = NO;
|
||||||
|
INFOPLIST_KEY_NSAppleEventsUsageDescription = "VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations.";
|
||||||
INFOPLIST_KEY_NSHumanReadableCopyright = "";
|
INFOPLIST_KEY_NSHumanReadableCopyright = "";
|
||||||
|
INFOPLIST_KEY_NSMicrophoneUsageDescription = "VoiceInk needs access to your microphone to record audio for transcription.";
|
||||||
LD_RUNPATH_SEARCH_PATHS = (
|
LD_RUNPATH_SEARCH_PATHS = (
|
||||||
"$(inherited)",
|
"$(inherited)",
|
||||||
"@executable_path/../Frameworks",
|
"@executable_path/../Frameworks",
|
||||||
);
|
);
|
||||||
MACOSX_DEPLOYMENT_TARGET = 14.0;
|
MACOSX_DEPLOYMENT_TARGET = 14.0;
|
||||||
MARKETING_VERSION = 1.69;
|
MARKETING_VERSION = 1.69;
|
||||||
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
|
PRODUCT_BUNDLE_IDENTIFIER = "--com.jakeshore.VoiceInk-com.jakeshore.VoiceInk";
|
||||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||||
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";
|
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";
|
||||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||||
@ -505,21 +508,24 @@
|
|||||||
COMBINE_HIDPI_IMAGES = YES;
|
COMBINE_HIDPI_IMAGES = YES;
|
||||||
CURRENT_PROJECT_VERSION = 169;
|
CURRENT_PROJECT_VERSION = 169;
|
||||||
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
|
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
|
||||||
DEVELOPMENT_TEAM = V6J6A3VWY2;
|
DEVELOPMENT_TEAM = QP43ZA49TG;
|
||||||
ENABLE_HARDENED_RUNTIME = YES;
|
ENABLE_HARDENED_RUNTIME = YES;
|
||||||
ENABLE_PREVIEWS = YES;
|
ENABLE_PREVIEWS = YES;
|
||||||
GENERATE_INFOPLIST_FILE = YES;
|
GENERATE_INFOPLIST_FILE = YES;
|
||||||
INFOPLIST_FILE = VoiceInk/Info.plist;
|
INFOPLIST_FILE = VoiceInk/Info.plist;
|
||||||
INFOPLIST_KEY_CFBundleDisplayName = VoiceInk;
|
INFOPLIST_KEY_CFBundleDisplayName = VoiceInk;
|
||||||
INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.productivity";
|
INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.productivity";
|
||||||
|
INFOPLIST_KEY_LSUIElement = NO;
|
||||||
|
INFOPLIST_KEY_NSAppleEventsUsageDescription = "VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations.";
|
||||||
INFOPLIST_KEY_NSHumanReadableCopyright = "";
|
INFOPLIST_KEY_NSHumanReadableCopyright = "";
|
||||||
|
INFOPLIST_KEY_NSMicrophoneUsageDescription = "VoiceInk needs access to your microphone to record audio for transcription.";
|
||||||
LD_RUNPATH_SEARCH_PATHS = (
|
LD_RUNPATH_SEARCH_PATHS = (
|
||||||
"$(inherited)",
|
"$(inherited)",
|
||||||
"@executable_path/../Frameworks",
|
"@executable_path/../Frameworks",
|
||||||
);
|
);
|
||||||
MACOSX_DEPLOYMENT_TARGET = 14.0;
|
MACOSX_DEPLOYMENT_TARGET = 14.0;
|
||||||
MARKETING_VERSION = 1.69;
|
MARKETING_VERSION = 1.69;
|
||||||
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
|
PRODUCT_BUNDLE_IDENTIFIER = "--com.jakeshore.VoiceInk-com.jakeshore.VoiceInk";
|
||||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||||
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";
|
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";
|
||||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"originHash" : "93572b72309723585f9fa623350a6b09a152df9dec03f14a5b938629e0f677a0",
|
"originHash" : "144ae35ef0b62c92588dc767eb6b2d443797062688bf1347662bed55d75a7ec2",
|
||||||
"pins" : [
|
"pins" : [
|
||||||
{
|
{
|
||||||
"identity" : "axswift",
|
"identity" : "axswift",
|
||||||
@ -16,7 +16,7 @@
|
|||||||
"location" : "https://github.com/FluidInference/FluidAudio",
|
"location" : "https://github.com/FluidInference/FluidAudio",
|
||||||
"state" : {
|
"state" : {
|
||||||
"branch" : "main",
|
"branch" : "main",
|
||||||
"revision" : "ddee663c4a9806d4f139943b0978b0f0a961587b"
|
"revision" : "11805437821b7e2efc044fc9c5b9b8ce88f6f29f"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -52,7 +52,7 @@
|
|||||||
"location" : "https://github.com/ejbills/mediaremote-adapter",
|
"location" : "https://github.com/ejbills/mediaremote-adapter",
|
||||||
"state" : {
|
"state" : {
|
||||||
"branch" : "master",
|
"branch" : "master",
|
||||||
"revision" : "3529aa25023082a2ceadebcd2c9c4a9430ee96b9"
|
"revision" : "78aae86c03adab11a7b352211cc82381737cf854"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -69,8 +69,8 @@
|
|||||||
"kind" : "remoteSourceControl",
|
"kind" : "remoteSourceControl",
|
||||||
"location" : "https://github.com/sparkle-project/Sparkle",
|
"location" : "https://github.com/sparkle-project/Sparkle",
|
||||||
"state" : {
|
"state" : {
|
||||||
"revision" : "9a1d2a19d3595fcf8d9c447173f9a1687b3dcadb",
|
"revision" : "5581748cef2bae787496fe6d61139aebe0a451f6",
|
||||||
"version" : "2.8.0"
|
"version" : "2.8.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@ -48,6 +48,9 @@ final class CoreAudioRecorder {
|
|||||||
private var renderBuffer: UnsafeMutablePointer<Float32>?
|
private var renderBuffer: UnsafeMutablePointer<Float32>?
|
||||||
private var renderBufferSize: UInt32 = 0
|
private var renderBufferSize: UInt32 = 0
|
||||||
|
|
||||||
|
// Streaming callback for real-time audio processing (called from audio thread)
|
||||||
|
var streamingAudioCallback: ((_ samples: UnsafePointer<Float32>, _ frameCount: UInt32, _ sampleRate: Double, _ channelCount: UInt32) -> Void)?
|
||||||
|
|
||||||
// MARK: - Initialization
|
// MARK: - Initialization
|
||||||
|
|
||||||
init() {}
|
init() {}
|
||||||
@ -541,7 +544,6 @@ final class CoreAudioRecorder {
|
|||||||
inBusNumber: UInt32,
|
inBusNumber: UInt32,
|
||||||
inNumberFrames: UInt32
|
inNumberFrames: UInt32
|
||||||
) -> OSStatus {
|
) -> OSStatus {
|
||||||
|
|
||||||
guard let audioUnit = audioUnit, isRecording, let renderBuf = renderBuffer else {
|
guard let audioUnit = audioUnit, isRecording, let renderBuf = renderBuffer else {
|
||||||
return noErr
|
return noErr
|
||||||
}
|
}
|
||||||
@ -581,6 +583,11 @@ final class CoreAudioRecorder {
|
|||||||
return status
|
return status
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Call streaming callback with raw audio samples (for real-time transcription)
|
||||||
|
if let callback = streamingAudioCallback {
|
||||||
|
callback(renderBuf, inNumberFrames, deviceFormat.mSampleRate, channelCount)
|
||||||
|
}
|
||||||
|
|
||||||
// Calculate audio meters from input buffer
|
// Calculate audio meters from input buffer
|
||||||
calculateMeters(from: &bufferList, frameCount: inNumberFrames)
|
calculateMeters(from: &bufferList, frameCount: inNumberFrames)
|
||||||
|
|
||||||
|
|||||||
@ -1,12 +1,31 @@
|
|||||||
import Foundation
|
import Foundation
|
||||||
import AppKit
|
import AppKit
|
||||||
|
import os.log
|
||||||
|
|
||||||
class CursorPaster {
|
class CursorPaster {
|
||||||
|
private static let logger = Logger(subsystem: "com.jakeshore.VoiceInk", category: "CursorPaster")
|
||||||
|
|
||||||
|
// MARK: - Streaming Mode
|
||||||
|
// When streaming is active, we skip clipboard save/restore to avoid conflicts
|
||||||
|
// with rapid consecutive paste operations
|
||||||
|
private static var isStreamingMode: Bool = false
|
||||||
|
|
||||||
|
/// Turns streaming mode on or off and logs the new state.
///
/// The flag is read by `pasteAtCursor(_:)`: while it is `true`, the clipboard is not
/// saved/restored around a paste, which avoids races between rapid consecutive pastes.
///
/// - Parameter enabled: `true` to enter streaming mode, `false` to leave it.
static func setStreamingMode(_ enabled: Bool) {
    let stateDescription = enabled ? "enabled" : "disabled"
    isStreamingMode = enabled
    logger.notice("📋 Streaming mode \(stateDescription)")
}
|
||||||
|
|
||||||
static func pasteAtCursor(_ text: String) {
|
static func pasteAtCursor(_ text: String) {
|
||||||
|
logger.notice("📋 pasteAtCursor called with \(text.count) chars: '\(text.prefix(50))...'")
|
||||||
|
logger.notice("📋 AXIsProcessTrusted = \(AXIsProcessTrusted())")
|
||||||
let pasteboard = NSPasteboard.general
|
let pasteboard = NSPasteboard.general
|
||||||
// Default to true if not explicitly set by user
|
|
||||||
let shouldRestoreClipboard = UserDefaults.standard.object(forKey: "restoreClipboardAfterPaste") as? Bool ?? true
|
// During streaming mode, skip clipboard save/restore to avoid race conditions
|
||||||
|
// with rapid consecutive paste operations
|
||||||
|
let userWantsRestore = UserDefaults.standard.object(forKey: "restoreClipboardAfterPaste") as? Bool ?? true
|
||||||
|
let shouldRestoreClipboard = userWantsRestore && !isStreamingMode
|
||||||
|
|
||||||
var savedContents: [(NSPasteboard.PasteboardType, Data)] = []
|
var savedContents: [(NSPasteboard.PasteboardType, Data)] = []
|
||||||
|
|
||||||
@ -67,25 +86,29 @@ class CursorPaster {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static func pasteUsingCommandV() {
|
private static func pasteUsingCommandV() {
|
||||||
|
logger.notice("📋 pasteUsingCommandV called")
|
||||||
guard AXIsProcessTrusted() else {
|
guard AXIsProcessTrusted() else {
|
||||||
|
logger.error("❌ pasteUsingCommandV: AXIsProcessTrusted() returned false!")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
let source = CGEventSource(stateID: .hidSystemState)
|
let source = CGEventSource(stateID: .hidSystemState)
|
||||||
|
|
||||||
let cmdDown = CGEvent(keyboardEventSource: source, virtualKey: 0x37, keyDown: true)
|
let cmdDown = CGEvent(keyboardEventSource: source, virtualKey: 0x37, keyDown: true)
|
||||||
let vDown = CGEvent(keyboardEventSource: source, virtualKey: 0x09, keyDown: true)
|
let vDown = CGEvent(keyboardEventSource: source, virtualKey: 0x09, keyDown: true)
|
||||||
let vUp = CGEvent(keyboardEventSource: source, virtualKey: 0x09, keyDown: false)
|
let vUp = CGEvent(keyboardEventSource: source, virtualKey: 0x09, keyDown: false)
|
||||||
let cmdUp = CGEvent(keyboardEventSource: source, virtualKey: 0x37, keyDown: false)
|
let cmdUp = CGEvent(keyboardEventSource: source, virtualKey: 0x37, keyDown: false)
|
||||||
|
|
||||||
cmdDown?.flags = .maskCommand
|
cmdDown?.flags = .maskCommand
|
||||||
vDown?.flags = .maskCommand
|
vDown?.flags = .maskCommand
|
||||||
vUp?.flags = .maskCommand
|
vUp?.flags = .maskCommand
|
||||||
|
cmdUp?.flags = .maskCommand // Fix: cmdUp also needs .maskCommand flag
|
||||||
|
|
||||||
cmdDown?.post(tap: .cghidEventTap)
|
cmdDown?.post(tap: .cghidEventTap)
|
||||||
vDown?.post(tap: .cghidEventTap)
|
vDown?.post(tap: .cghidEventTap)
|
||||||
vUp?.post(tap: .cghidEventTap)
|
vUp?.post(tap: .cghidEventTap)
|
||||||
cmdUp?.post(tap: .cghidEventTap)
|
cmdUp?.post(tap: .cghidEventTap)
|
||||||
|
logger.notice("📋 pasteUsingCommandV: Posted Cmd+V events")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Simulate pressing the Return / Enter key
|
// Simulate pressing the Return / Enter key
|
||||||
@ -97,4 +120,32 @@ class CursorPaster {
|
|||||||
enterDown?.post(tap: .cghidEventTap)
|
enterDown?.post(tap: .cghidEventTap)
|
||||||
enterUp?.post(tap: .cghidEventTap)
|
enterUp?.post(tap: .cghidEventTap)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Removes `count` characters by posting synthetic Backspace key-down/key-up pairs.
///
/// The call is a no-op (apart from logging) when the process is not trusted for
/// accessibility or when `count` is not positive. After every fifth keystroke, except
/// the final one, the calling thread sleeps briefly so the receiving application can
/// keep up with the input.
///
/// - Parameter count: Number of Backspace presses to simulate.
static func deleteCharacters(count: Int) {
    logger.notice("📋 deleteCharacters called with count=\(count)")

    if !AXIsProcessTrusted() {
        logger.error("❌ deleteCharacters: AXIsProcessTrusted() returned false!")
        return
    }
    if count <= 0 {
        return
    }

    let eventSource = CGEventSource(stateID: .hidSystemState)
    let deleteKey: CGKeyCode = 0x33 // Virtual key code for Backspace

    for keystroke in 1...count {
        let keyDown = CGEvent(keyboardEventSource: eventSource, virtualKey: deleteKey, keyDown: true)
        let keyUp = CGEvent(keyboardEventSource: eventSource, virtualKey: deleteKey, keyDown: false)
        keyDown?.post(tap: .cghidEventTap)
        keyUp?.post(tap: .cghidEventTap)

        // Breathe for 1.5 ms after each batch of five, but not after the last keystroke.
        let finishedBatch = keystroke % 5 == 0
        if finishedBatch && keystroke < count {
            usleep(1500)
        }
    }

    logger.notice("📋 deleteCharacters: Deleted \(count) characters")
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,36 +2,9 @@
|
|||||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||||
<plist version="1.0">
|
<plist version="1.0">
|
||||||
<dict>
|
<dict>
|
||||||
<key>SUEnableInstallerLauncherService</key>
|
|
||||||
<true/>
|
|
||||||
<key>SUFeedURL</key>
|
|
||||||
<string>https://beingpax.github.io/VoiceInk/appcast.xml</string>
|
|
||||||
<key>SUPublicEDKey</key>
|
|
||||||
<string>rLRdZIjK3gHKfqNlAF9nT7FbjwSvwkJ8BVn0v2mD1Mo=</string>
|
|
||||||
<key>LSUIElement</key>
|
|
||||||
<false/>
|
|
||||||
<key>SUEnableAutomaticChecks</key>
|
|
||||||
<true/>
|
|
||||||
<key>NSMicrophoneUsageDescription</key>
|
|
||||||
<string>VoiceInk needs access to your microphone to record audio for transcription.</string>
|
|
||||||
<key>NSAppleEventsUsageDescription</key>
|
|
||||||
<string>VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations.</string>
|
|
||||||
<key>NSScreenCaptureUsageDescription</key>
|
|
||||||
<string>VoiceInk needs screen recording access to understand context from your screen for improved transcription accuracy.</string>
|
|
||||||
<key>CFBundleDocumentTypes</key>
|
<key>CFBundleDocumentTypes</key>
|
||||||
<array>
|
<array>
|
||||||
<dict>
|
<dict>
|
||||||
<key>CFBundleTypeName</key>
|
|
||||||
<string>Audio/Video File</string>
|
|
||||||
<key>CFBundleTypeRole</key>
|
|
||||||
<string>Viewer</string>
|
|
||||||
<key>LSHandlerRank</key>
|
|
||||||
<string>Alternate</string>
|
|
||||||
<key>LSItemContentTypes</key>
|
|
||||||
<array>
|
|
||||||
<string>public.audio</string>
|
|
||||||
<string>public.movie</string>
|
|
||||||
</array>
|
|
||||||
<key>CFBundleTypeExtensions</key>
|
<key>CFBundleTypeExtensions</key>
|
||||||
<array>
|
<array>
|
||||||
<string>wav</string>
|
<string>wav</string>
|
||||||
@ -44,7 +17,28 @@
|
|||||||
<string>flac</string>
|
<string>flac</string>
|
||||||
<string>caf</string>
|
<string>caf</string>
|
||||||
</array>
|
</array>
|
||||||
|
<key>CFBundleTypeName</key>
|
||||||
|
<string>Audio/Video File</string>
|
||||||
|
<key>CFBundleTypeRole</key>
|
||||||
|
<string>Viewer</string>
|
||||||
|
<key>LSHandlerRank</key>
|
||||||
|
<string>Alternate</string>
|
||||||
|
<key>LSItemContentTypes</key>
|
||||||
|
<array>
|
||||||
|
<string>public.audio</string>
|
||||||
|
<string>public.movie</string>
|
||||||
|
</array>
|
||||||
</dict>
|
</dict>
|
||||||
</array>
|
</array>
|
||||||
|
<key>NSScreenCaptureUsageDescription</key>
|
||||||
|
<string>VoiceInk needs screen recording access to understand context from your screen for improved transcription accuracy.</string>
|
||||||
|
<key>SUEnableAutomaticChecks</key>
|
||||||
|
<true/>
|
||||||
|
<key>SUEnableInstallerLauncherService</key>
|
||||||
|
<true/>
|
||||||
|
<key>SUFeedURL</key>
|
||||||
|
<string>https://beingpax.github.io/VoiceInk/appcast.xml</string>
|
||||||
|
<key>SUPublicEDKey</key>
|
||||||
|
<string>rLRdZIjK3gHKfqNlAF9nT7FbjwSvwkJ8BVn0v2mD1Mo=</string>
|
||||||
</dict>
|
</dict>
|
||||||
</plist>
|
</plist>
|
||||||
|
|||||||
@ -19,9 +19,11 @@ class LicenseViewModel: ObservableObject {
|
|||||||
private let polarService = PolarService()
|
private let polarService = PolarService()
|
||||||
private let userDefaults = UserDefaults.standard
|
private let userDefaults = UserDefaults.standard
|
||||||
private let licenseManager = LicenseManager.shared
|
private let licenseManager = LicenseManager.shared
|
||||||
|
private var isInitializing = true
|
||||||
|
|
||||||
init() {
|
init() {
|
||||||
loadLicenseState()
|
loadLicenseState()
|
||||||
|
isInitializing = false
|
||||||
}
|
}
|
||||||
|
|
||||||
func startTrial() {
|
func startTrial() {
|
||||||
@ -29,7 +31,10 @@ class LicenseViewModel: ObservableObject {
|
|||||||
if licenseManager.trialStartDate == nil {
|
if licenseManager.trialStartDate == nil {
|
||||||
licenseManager.trialStartDate = Date()
|
licenseManager.trialStartDate = Date()
|
||||||
licenseState = .trial(daysRemaining: trialPeriodDays)
|
licenseState = .trial(daysRemaining: trialPeriodDays)
|
||||||
NotificationCenter.default.post(name: .licenseStatusChanged, object: nil)
|
// Don't post notification during initialization to prevent recursive loop
|
||||||
|
if !isInitializing {
|
||||||
|
NotificationCenter.default.post(name: .licenseStatusChanged, object: nil)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -40,7 +40,7 @@ class PlaybackController: ObservableObject {
|
|||||||
|
|
||||||
private func setupMediaControllerCallbacks() {
|
private func setupMediaControllerCallbacks() {
|
||||||
mediaController.onTrackInfoReceived = { [weak self] trackInfo in
|
mediaController.onTrackInfoReceived = { [weak self] trackInfo in
|
||||||
self?.isMediaPlaying = trackInfo.payload.isPlaying ?? false
|
self?.isMediaPlaying = trackInfo?.payload.isPlaying ?? false
|
||||||
self?.lastKnownTrackInfo = trackInfo
|
self?.lastKnownTrackInfo = trackInfo
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -18,6 +18,9 @@ class Recorder: NSObject, ObservableObject {
|
|||||||
private var audioMeterUpdateTask: Task<Void, Never>?
|
private var audioMeterUpdateTask: Task<Void, Never>?
|
||||||
private var audioRestorationTask: Task<Void, Never>?
|
private var audioRestorationTask: Task<Void, Never>?
|
||||||
private var hasDetectedAudioInCurrentSession = false
|
private var hasDetectedAudioInCurrentSession = false
|
||||||
|
|
||||||
|
/// Stored streaming callback - applied when CoreAudioRecorder is created
|
||||||
|
private var pendingStreamingCallback: ((_ samples: UnsafePointer<Float32>, _ frameCount: UInt32, _ sampleRate: Double, _ channelCount: UInt32) -> Void)?
|
||||||
|
|
||||||
enum RecorderError: Error {
|
enum RecorderError: Error {
|
||||||
case couldNotStartRecording
|
case couldNotStartRecording
|
||||||
@ -127,6 +130,12 @@ class Recorder: NSObject, ObservableObject {
|
|||||||
let coreAudioRecorder = CoreAudioRecorder()
|
let coreAudioRecorder = CoreAudioRecorder()
|
||||||
recorder = coreAudioRecorder
|
recorder = coreAudioRecorder
|
||||||
|
|
||||||
|
// Apply any pending streaming callback that was set before recording started
|
||||||
|
if let callback = pendingStreamingCallback {
|
||||||
|
coreAudioRecorder.streamingAudioCallback = callback
|
||||||
|
logger.notice("🎙️ Applied pending streaming callback to recorder")
|
||||||
|
}
|
||||||
|
|
||||||
try coreAudioRecorder.startRecording(toOutputFile: url, deviceID: deviceID)
|
try coreAudioRecorder.startRecording(toOutputFile: url, deviceID: deviceID)
|
||||||
|
|
||||||
audioRestorationTask?.cancel()
|
audioRestorationTask?.cancel()
|
||||||
@ -179,6 +188,7 @@ class Recorder: NSObject, ObservableObject {
|
|||||||
func stopRecording() {
|
func stopRecording() {
|
||||||
audioLevelCheckTask?.cancel()
|
audioLevelCheckTask?.cancel()
|
||||||
audioMeterUpdateTask?.cancel()
|
audioMeterUpdateTask?.cancel()
|
||||||
|
recorder?.streamingAudioCallback = nil // Clear streaming callback
|
||||||
recorder?.stopRecording()
|
recorder?.stopRecording()
|
||||||
recorder = nil
|
recorder = nil
|
||||||
audioMeter = AudioMeter(averagePower: 0, peakPower: 0)
|
audioMeter = AudioMeter(averagePower: 0, peakPower: 0)
|
||||||
@ -190,6 +200,15 @@ class Recorder: NSObject, ObservableObject {
|
|||||||
deviceManager.isRecordingActive = false
|
deviceManager.isRecordingActive = false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Registers (or clears, when `nil`) the handler that receives live audio samples
/// for streaming transcription.
///
/// The handler is forwarded to the current recorder if one exists, and is also kept
/// so it can be attached to the `CoreAudioRecorder` that is created when recording starts.
/// It runs on the audio thread, so it must not block.
///
/// - Parameter callback: Receives the sample pointer, frame count, sample rate and
///   channel count of each captured chunk; pass `nil` to remove the handler.
func setStreamingAudioCallback(_ callback: ((_ samples: UnsafePointer<Float32>, _ frameCount: UInt32, _ sampleRate: Double, _ channelCount: UInt32) -> Void)?) {
    // A recorder that is already running picks the handler up right away.
    recorder?.streamingAudioCallback = callback
    // Remember it for the recorder that will be created on the next start.
    pendingStreamingCallback = callback
}
|
||||||
|
|
||||||
private func handleRecordingError(_ error: Error) async {
|
private func handleRecordingError(_ error: Error) async {
|
||||||
logger.error("❌ Recording error occurred: \(error.localizedDescription)")
|
logger.error("❌ Recording error occurred: \(error.localizedDescription)")
|
||||||
|
|
||||||
|
|||||||
@ -21,6 +21,17 @@ class ParakeetTranscriptionService: TranscriptionService {
|
|||||||
private var activeVersion: AsrModelVersion?
|
private var activeVersion: AsrModelVersion?
|
||||||
private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink.parakeet", category: "ParakeetTranscriptionService")
|
private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink.parakeet", category: "ParakeetTranscriptionService")
|
||||||
|
|
||||||
|
/// Creates the service. No work is done here other than writing a log entry
/// that identifies this build of the service.
init() {
    logger.notice("🆕 ParakeetTranscriptionService initialized (v4 - raw audio, no preprocessing)")
}
|
||||||
|
|
||||||
|
// MARK: - Streaming Properties (using StreamingEouAsrManager for low-latency 160ms chunks)
|
||||||
|
private var streamingEouManager: StreamingEouAsrManager?
|
||||||
|
private var streamingTask: Task<Void, Never>?
|
||||||
|
private var streamingContinuation: AsyncStream<String>.Continuation?
|
||||||
|
private var streamAudioCallCount = 0
|
||||||
|
private var lastPartialTranscript: String = ""
|
||||||
|
|
||||||
private func version(for model: any TranscriptionModel) -> AsrModelVersion {
|
private func version(for model: any TranscriptionModel) -> AsrModelVersion {
|
||||||
model.name.lowercased().contains("v2") ? .v2 : .v3
|
model.name.lowercased().contains("v2") ? .v2 : .v3
|
||||||
}
|
}
|
||||||
@ -121,4 +132,190 @@ class ParakeetTranscriptionService: TranscriptionService {
|
|||||||
vadManager = nil
|
vadManager = nil
|
||||||
activeVersion = nil
|
activeVersion = nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MARK: - Streaming Transcription (Low-Latency EOU Mode)
|
||||||
|
|
||||||
|
/// Returns the directory expected to contain the Parakeet EOU 160 ms streaming models:
/// `<Application Support>/FluidAudio/Models/parakeet-eou-streaming/160ms`.
///
/// The directory is only computed, not created. `URL.applicationSupportDirectory`
/// (macOS 13+) is non-optional, so no force-unwrap of a directory lookup is needed.
///
/// - Returns: The URL of the 160 ms EOU models directory.
private func getEouModelsDirectory() -> URL {
    URL.applicationSupportDirectory
        .appendingPathComponent("FluidAudio", isDirectory: true)
        .appendingPathComponent("Models/parakeet-eou-streaming/160ms", isDirectory: true)
}
|
||||||
|
|
||||||
|
/// Makes sure the EOU streaming models are available locally.
///
/// The presence of `streaming_encoder.mlmodelc` inside the models directory is used as
/// the marker that the models are already there; if it is missing, the
/// `.parakeetEou160` repository is downloaded.
///
/// - Returns: The models directory (same value as `getEouModelsDirectory()`).
/// - Throws: Whatever `DownloadUtils.downloadRepo` throws.
private func ensureEouModelsDownloaded() async throws -> URL {
    let eouDirectory = getEouModelsDirectory()
    let encoderBundle = eouDirectory.appendingPathComponent("streaming_encoder.mlmodelc")

    guard !FileManager.default.fileExists(atPath: encoderBundle.path) else {
        return eouDirectory
    }

    logger.notice("🎙️ Downloading Parakeet EOU 160ms models for streaming preview...")
    // The download target is two levels above the models directory.
    let downloadRoot = eouDirectory
        .deletingLastPathComponent()
        .deletingLastPathComponent()
    try await DownloadUtils.downloadRepo(.parakeetEou160, to: downloadRoot)
    logger.notice("🎙️ EOU 160ms models downloaded successfully")

    return eouDirectory
}
|
||||||
|
|
||||||
|
/// Starts a low-latency streaming transcription session backed by `StreamingEouAsrManager`.
///
/// The manager is configured with 160 ms chunks and a 1280 ms end-of-utterance debounce.
/// In hybrid mode the streamed text is only a preview; the batch pass provides the
/// accurate result.
///
/// The new manager is published to `streamingEouManager` only after its models have
/// loaded and its partial-result callback is registered. This way a failed load does not
/// leave a half-initialised manager installed, and `streamAudio` cannot feed audio to a
/// manager that is not ready yet.
///
/// - Parameter model: The Parakeet model selected by the caller (not read by this method).
/// - Returns: An `AsyncStream` that yields each new, non-empty partial transcript.
/// - Throws: Errors from downloading the EOU models or from loading them.
func startStreaming(model: ParakeetModel) async throws -> AsyncStream<String> {
    logger.notice("🎙️ Starting low-latency EOU streaming transcription")

    // Download EOU models if needed
    let modelsDir = try await ensureEouModelsDownloaded()

    // 160ms chunks give the lowest-latency preview; an EOU debounce of 1280ms means
    // end-of-utterance is detected after ~1.3s of silence.
    let manager = StreamingEouAsrManager(chunkSize: .ms160, eouDebounceMs: 1280)

    // Load before publishing: if this throws, no streaming state has been touched.
    try await manager.loadModels(modelDir: modelsDir)

    logger.notice("🎙️ EOU streaming preview started with 160ms chunks (batch will provide accuracy)")

    let (stream, continuation) = AsyncStream<String>.makeStream()

    // Register the partial callback BEFORE the manager can receive audio or the stream
    // is handed to the caller, so no partial result is lost.
    await manager.setPartialCallback { [weak self] partialText in
        guard let self else { return }
        let trimmed = partialText.trimmingCharacters(in: .whitespaces)
        if !trimmed.isEmpty && trimmed != self.lastPartialTranscript {
            self.lastPartialTranscript = trimmed
            self.logger.notice("🎙️ Partial update: '\(trimmed.prefix(50))...'")
            continuation.yield(trimmed)
        }
    }

    // Install the fully prepared session. A continuation left over from an earlier
    // session is finished so its consumer does not wait forever.
    streamingContinuation?.finish()
    streamAudioCallCount = 0
    lastPartialTranscript = ""
    streamingContinuation = continuation
    streamingEouManager = manager

    // Note: there is deliberately no onTermination handler calling cancelStreaming();
    // that nullified the manager before finishStreaming() could call manager.finish().
    // Cleanup is handled by finishStreaming()'s defer block instead.

    logger.notice("🎙️ Callback registered, streaming ready")
    return stream
}
|
||||||
|
|
||||||
|
/// Feeds one chunk of raw audio to the streaming EOU transcription engine.
///
/// Called from the audio thread. When no streaming session is active the call returns
/// before any buffer is allocated, keeping work on the audio thread to a minimum.
/// Otherwise the samples are copied into a mono `AVAudioPCMBuffer` at the original
/// sample rate and handed to the manager from a detached task, because
/// `StreamingEouAsrManager.process` is an actor method and must not block this thread.
/// No gain control or noise reduction is applied; resampling to 16 kHz is left to the SDK.
///
/// - Parameters:
///   - samples: Pointer to the captured Float32 samples (interleaved when multi-channel).
///   - frameCount: Number of frames available at `samples`.
///   - sampleRate: Sample rate of the captured audio, in Hz.
///   - channels: Number of channels in `samples`.
func streamAudio(samples: UnsafePointer<Float32>, frameCount: UInt32, sampleRate: Double, channels: UInt32) {
    streamAudioCallCount += 1

    // Cheap check first: without an active session there is nothing to feed,
    // so skip the buffer allocation and sample copy entirely.
    guard streamingEouManager != nil else {
        return
    }

    // The samples must be copied now; the pointer is only valid during this call.
    guard let audioBuffer = createOriginalFormatBuffer(samples: samples, frameCount: frameCount, sampleRate: sampleRate, channels: channels) else {
        if streamAudioCallCount <= 5 {
            logger.warning("Failed to create audio buffer at chunk #\(self.streamAudioCallCount)")
        }
        return
    }

    // NOTE(review): independent detached tasks are not guaranteed to run in submission
    // order, so chunks may reach the manager out of order under load — confirm whether
    // the SDK tolerates this or whether a serial feed is needed.
    Task.detached { [weak self, audioBuffer] in
        do {
            _ = try await self?.streamingEouManager?.process(audioBuffer: audioBuffer)
        } catch {
            self?.logger.warning("EOU process error: \(error.localizedDescription)")
        }
    }
}
|
||||||
|
|
||||||
|
/// Builds a mono, non-interleaved Float32 `AVAudioPCMBuffer` from the input samples.
///
/// Mono input is copied unchanged; multi-channel input is down-mixed by averaging the
/// channels of each frame. No gain control or noise reduction is applied.
///
/// - Parameters:
///   - samples: Pointer to at least `frameCount * channels` Float32 samples. Multi-channel
///     data is read as interleaved (L0 R0 L1 R1 ...).
///   - frameCount: Number of frames to copy.
///   - sampleRate: Sample rate assigned to the buffer's format, in Hz.
///   - channels: Number of channels in `samples`.
/// - Returns: The filled buffer, or `nil` when `channels` is zero or the format/buffer
///   could not be created.
private func createOriginalFormatBuffer(samples: UnsafePointer<Float32>, frameCount: UInt32, sampleRate: Double, channels: UInt32) -> AVAudioPCMBuffer? {
    // Zero channels would make the mix weight 1/0 = inf and fill the buffer with NaN.
    guard channels > 0 else {
        return nil
    }

    // Mono non-interleaved is the simplest format for ASR.
    guard let format = AVAudioFormat(
        commonFormat: .pcmFormatFloat32,
        sampleRate: sampleRate,
        channels: 1,
        interleaved: false
    ) else {
        return nil
    }

    guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
        return nil
    }
    buffer.frameLength = frameCount

    guard let monoData = buffer.floatChannelData?[0] else {
        return nil
    }

    let channelCount = Int(channels)
    let frames = Int(frameCount)

    if channelCount == 1 {
        // Already mono: one bulk copy, samples untouched.
        monoData.update(from: samples, count: frames)
    } else {
        // Down-mix: plain average of all channels in each frame.
        let channelWeight = 1.0 / Float(channelCount)
        for frame in 0..<frames {
            let base = frame * channelCount
            var sum: Float = 0
            for channel in 0..<channelCount {
                sum += samples[base + channel]
            }
            monoData[frame] = sum * channelWeight
        }
    }

    return buffer
}
|
||||||
|
|
||||||
|
/// Ends the current streaming session and returns its final transcript.
///
/// All streaming state (task, continuation, manager, last partial text) is torn down on
/// every exit path, including when `finish()` throws.
///
/// - Returns: The text returned by the manager's `finish()`, or an empty string when no
///   session is active.
/// - Throws: Whatever the manager's `finish()` throws.
func finishStreaming() async throws -> String {
    // Registered first so it also runs for the "no active session" return below.
    defer {
        streamingTask?.cancel()
        streamingTask = nil
        streamingContinuation?.finish()
        streamingContinuation = nil
        streamingEouManager = nil
        lastPartialTranscript = ""
    }

    if let activeManager = streamingEouManager {
        let transcript = try await activeManager.finish()
        logger.notice("🎙️ EOU streaming finished with \(transcript.count) characters")
        return transcript
    }

    return ""
}
|
||||||
|
|
||||||
|
/// Aborts the streaming session and discards any result.
///
/// The update task and continuation are always shut down. The manager is reset
/// and the partial transcript cleared only when a manager is currently active.
func cancelStreaming() async {
    streamingTask?.cancel()
    streamingTask = nil
    streamingContinuation?.finish()
    streamingContinuation = nil

    guard let activeManager = streamingEouManager else {
        return
    }

    await activeManager.reset()
    streamingEouManager = nil
    lastPartialTranscript = ""
    logger.notice("🎙️ Cancelled EOU streaming transcription")
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -28,6 +28,11 @@ class WhisperState: NSObject, ObservableObject {
|
|||||||
@Published var miniRecorderError: String?
|
@Published var miniRecorderError: String?
|
||||||
@Published var shouldCancelRecording = false
|
@Published var shouldCancelRecording = false
|
||||||
|
|
||||||
|
// MARK: - Streaming Transcription Properties
|
||||||
|
private var streamingUpdateTask: Task<Void, Never>?
|
||||||
|
private var lastStreamedText: String = ""
|
||||||
|
private var isStreamingActive: Bool = false
|
||||||
|
|
||||||
|
|
||||||
@Published var recorderType: String = UserDefaults.standard.string(forKey: "RecorderType") ?? "mini" {
|
@Published var recorderType: String = UserDefaults.standard.string(forKey: "RecorderType") ?? "mini" {
|
||||||
didSet {
|
didSet {
|
||||||
@ -100,7 +105,12 @@ class WhisperState: NSObject, ObservableObject {
|
|||||||
// For model progress tracking
|
// For model progress tracking
|
||||||
@Published var downloadProgress: [String: Double] = [:]
|
@Published var downloadProgress: [String: Double] = [:]
|
||||||
@Published var parakeetDownloadStates: [String: Bool] = [:]
|
@Published var parakeetDownloadStates: [String: Bool] = [:]
|
||||||
|
|
||||||
|
/// Whether the currently selected transcription model supports streaming.
/// True only when a model is selected and its provider is `.parakeet`.
var isStreamingSupported: Bool {
    guard let selectedModel = currentTranscriptionModel else {
        return false
    }
    return selectedModel.provider == .parakeet
}
|
||||||
|
|
||||||
init(modelContext: ModelContext, enhancementService: AIEnhancementService? = nil) {
|
init(modelContext: ModelContext, enhancementService: AIEnhancementService? = nil) {
|
||||||
self.modelContext = modelContext
|
self.modelContext = modelContext
|
||||||
let appSupportDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
|
let appSupportDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
|
||||||
@ -141,28 +151,41 @@ class WhisperState: NSObject, ObservableObject {
|
|||||||
func toggleRecord(powerModeId: UUID? = nil) async {
|
func toggleRecord(powerModeId: UUID? = nil) async {
|
||||||
if recordingState == .recording {
|
if recordingState == .recording {
|
||||||
await recorder.stopRecording()
|
await recorder.stopRecording()
|
||||||
if let recordedFile {
|
|
||||||
if !shouldCancelRecording {
|
|
||||||
let audioAsset = AVURLAsset(url: recordedFile)
|
|
||||||
let duration = (try? CMTimeGetSeconds(await audioAsset.load(.duration))) ?? 0.0
|
|
||||||
|
|
||||||
let transcription = Transcription(
|
// Handle cancellation - clean up streaming if active
|
||||||
text: "",
|
if shouldCancelRecording {
|
||||||
duration: duration,
|
if isStreamingActive {
|
||||||
audioFileURL: recordedFile.absoluteString,
|
await cancelStreamingTranscription()
|
||||||
transcriptionStatus: .pending
|
|
||||||
)
|
|
||||||
modelContext.insert(transcription)
|
|
||||||
try? modelContext.save()
|
|
||||||
NotificationCenter.default.post(name: .transcriptionCreated, object: transcription)
|
|
||||||
|
|
||||||
await transcribeAudio(on: transcription)
|
|
||||||
} else {
|
|
||||||
await MainActor.run {
|
|
||||||
recordingState = .idle
|
|
||||||
}
|
|
||||||
await cleanupModelResources()
|
|
||||||
}
|
}
|
||||||
|
await MainActor.run {
|
||||||
|
recordingState = .idle
|
||||||
|
}
|
||||||
|
await cleanupModelResources()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle streaming transcription completion
|
||||||
|
if isStreamingActive {
|
||||||
|
await handleStreamingCompletion()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Non-streaming (batch) transcription
|
||||||
|
if let recordedFile {
|
||||||
|
let audioAsset = AVURLAsset(url: recordedFile)
|
||||||
|
let duration = (try? CMTimeGetSeconds(await audioAsset.load(.duration))) ?? 0.0
|
||||||
|
|
||||||
|
let transcription = Transcription(
|
||||||
|
text: "",
|
||||||
|
duration: duration,
|
||||||
|
audioFileURL: recordedFile.absoluteString,
|
||||||
|
transcriptionStatus: .pending
|
||||||
|
)
|
||||||
|
modelContext.insert(transcription)
|
||||||
|
try? modelContext.save()
|
||||||
|
NotificationCenter.default.post(name: .transcriptionCreated, object: transcription)
|
||||||
|
|
||||||
|
await transcribeAudio(on: transcription)
|
||||||
} else {
|
} else {
|
||||||
logger.error("❌ No recorded file found after stopping recording")
|
logger.error("❌ No recorded file found after stopping recording")
|
||||||
await MainActor.run {
|
await MainActor.run {
|
||||||
@ -189,7 +212,16 @@ class WhisperState: NSObject, ObservableObject {
|
|||||||
let permanentURL = self.recordingsDirectory.appendingPathComponent(fileName)
|
let permanentURL = self.recordingsDirectory.appendingPathComponent(fileName)
|
||||||
self.recordedFile = permanentURL
|
self.recordedFile = permanentURL
|
||||||
|
|
||||||
|
// IMPORTANT: Set up streaming BEFORE starting recording to avoid losing early audio
|
||||||
|
// Check if we're using a Parakeet model and set up streaming first
|
||||||
|
let isParakeetModel = self.currentTranscriptionModel is ParakeetModel
|
||||||
|
if isParakeetModel {
|
||||||
|
self.logger.notice("🎙️ Detected Parakeet model, setting up streaming BEFORE recording...")
|
||||||
|
await self.startStreamingTranscription()
|
||||||
|
}
|
||||||
|
|
||||||
try await self.recorder.startRecording(toOutputFile: permanentURL)
|
try await self.recorder.startRecording(toOutputFile: permanentURL)
|
||||||
|
self.logger.notice("🎙️ Recording started\(isParakeetModel ? " (streaming already active)" : "")")
|
||||||
|
|
||||||
await MainActor.run {
|
await MainActor.run {
|
||||||
self.recordingState = .recording
|
self.recordingState = .recording
|
||||||
@ -202,9 +234,19 @@ class WhisperState: NSObject, ObservableObject {
|
|||||||
|
|
||||||
// Load model and capture context in background without blocking
|
// Load model and capture context in background without blocking
|
||||||
Task.detached { [weak self] in
|
Task.detached { [weak self] in
|
||||||
guard let self = self else { return }
|
guard let self = self else {
|
||||||
|
print("⚠️ Self was deallocated in Task.detached!")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Debug: Check what model type we have
|
||||||
|
let modelType = await type(of: self.currentTranscriptionModel)
|
||||||
|
let modelName = await self.currentTranscriptionModel?.displayName ?? "nil"
|
||||||
|
print("🔍 DEBUG: Model type = \(modelType), name = \(modelName)")
|
||||||
|
print("🔍 DEBUG: Is ParakeetModel? \(await self.currentTranscriptionModel is ParakeetModel)")
|
||||||
|
|
||||||
// Only load model if it's a local model and not already loaded
|
// Only load model if it's a local model and not already loaded
|
||||||
|
// Note: Parakeet streaming is now set up BEFORE recording starts (above)
|
||||||
if let model = await self.currentTranscriptionModel, model.provider == .local {
|
if let model = await self.currentTranscriptionModel, model.provider == .local {
|
||||||
if let localWhisperModel = await self.availableModels.first(where: { $0.name == model.name }),
|
if let localWhisperModel = await self.availableModels.first(where: { $0.name == model.name }),
|
||||||
await self.whisperContext == nil {
|
await self.whisperContext == nil {
|
||||||
@ -214,8 +256,10 @@ class WhisperState: NSObject, ObservableObject {
|
|||||||
await self.logger.error("❌ Model loading failed: \(error.localizedDescription)")
|
await self.logger.error("❌ Model loading failed: \(error.localizedDescription)")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if let parakeetModel = await self.currentTranscriptionModel as? ParakeetModel {
|
} else if !(await self.currentTranscriptionModel is ParakeetModel) {
|
||||||
try? await self.serviceRegistry.parakeetTranscriptionService.loadModel(for: parakeetModel)
|
// Non-Parakeet, non-local models - just log
|
||||||
|
let modelDesc = await self.currentTranscriptionModel?.displayName ?? "nil"
|
||||||
|
await self.logger.notice("🎙️ Model is not local or Parakeet: \(modelDesc)")
|
||||||
}
|
}
|
||||||
|
|
||||||
if let enhancementService = await self.enhancementService {
|
if let enhancementService = await self.enhancementService {
|
||||||
@ -244,7 +288,320 @@ class WhisperState: NSObject, ObservableObject {
|
|||||||
private func requestRecordPermission(response: @escaping (Bool) -> Void) {
|
private func requestRecordPermission(response: @escaping (Bool) -> Void) {
|
||||||
response(true)
|
response(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MARK: - Streaming Transcription Methods
|
||||||
|
|
||||||
|
/// Starts streaming transcription for Parakeet models.
///
/// Does nothing when the current model is not a `ParakeetModel`. Otherwise it
/// installs the recorder's audio callback, starts the service's streaming
/// session, enables paste streaming mode, and spawns the task that forwards
/// each transcript to `handleStreamingUpdate(_:)`.
///
/// If the session fails to start, the audio callback is removed again and
/// `isStreamingActive` stays `false`, so recording falls back to the
/// non-streaming path.
private func startStreamingTranscription() async {
    guard let parakeetModel = currentTranscriptionModel as? ParakeetModel else { return }

    // Capture direct reference to the service to avoid @MainActor isolation issues in audio callback
    let parakeetService = serviceRegistry.parakeetTranscriptionService

    // Set up audio callback BEFORE starting streaming to avoid losing early audio
    // Note: callback runs on audio thread, so we capture parakeetService directly
    // Audio will be silently dropped until manager is created (streamAudio has a guard)
    logger.notice("🎙️ Setting up streaming audio callback")
    recorder.setStreamingAudioCallback { samples, frameCount, sampleRate, channels in
        parakeetService.streamAudio(
            samples: samples,
            frameCount: frameCount,
            sampleRate: sampleRate,
            channels: channels
        )
    }

    do {
        let transcriptStream = try await parakeetService.startStreaming(model: parakeetModel)

        isStreamingActive = true
        lastStreamedText = ""

        // Enable streaming mode in CursorPaster to skip clipboard save/restore
        // This prevents race conditions during rapid paste operations
        CursorPaster.setStreamingMode(true)

        // Start task to handle streaming updates
        logger.notice("🎙️ Starting streaming update task...")
        streamingUpdateTask = Task {
            self.logger.notice("🎙️ Streaming update task running, waiting for transcripts...")
            for await text in transcriptStream {
                self.logger.notice("🎙️ Got transcript from stream: '\(text.prefix(30))...'")
                await self.handleStreamingUpdate(text)
            }
            self.logger.notice("🎙️ Streaming update task ended")
        }

        logger.notice("🎙️ Started streaming transcription - all setup complete")
    } catch {
        logger.error("❌ Failed to start streaming transcription: \(error.localizedDescription)")
        // The finish/cancel paths are guarded by `isStreamingActive`, so they will never
        // remove the callback for a session that failed to start. Remove it here,
        // otherwise the recorder keeps forwarding audio to the streaming service.
        recorder.setStreamingAudioCallback(nil)
        isStreamingActive = false
    }
}
|
||||||
|
|
||||||
|
/// Handles an incoming streaming transcript by updating the preview text in the active app.
///
/// When `newText` extends the previously streamed text, only the new suffix is
/// pasted, which avoids flicker. Otherwise the old preview is deleted and the
/// full new text is pasted after a short wait that lets the deletions land.
///
/// `lastStreamedText` is kept equal to what is currently on screen at every
/// suspension point. It is cleared as soon as the old preview is deleted and set
/// again only once the replacement is pasted. A finish/cancel or a further
/// update arriving during the wait therefore never deletes the same characters twice.
///
/// - Parameter newText: The full transcript reported by the stream so far.
private func handleStreamingUpdate(_ newText: String) async {
    guard isStreamingActive else { return }

    // Returns nil when the update was handled as an append (or was a no-op);
    // otherwise the number of preview characters that were just deleted.
    let deletedCount: Int? = await MainActor.run {
        let oldText = self.lastStreamedText

        // Optimization: If new text starts with old text, just append the delta
        // This is the common case during continuous speech and avoids flicker
        if !oldText.isEmpty && newText.hasPrefix(oldText) {
            let deltaText = String(newText.dropFirst(oldText.count))
            if !deltaText.isEmpty {
                self.lastStreamedText = newText
                CursorPaster.pasteAtCursor(deltaText)
                self.logger.notice("🎙️ Appended delta: '\(deltaText.prefix(30))...'")
            }
            return nil
        }

        // Full replacement needed (model corrected itself or first update)
        let charsToDelete = oldText.count
        if charsToDelete > 0 {
            CursorPaster.deleteCharacters(count: charsToDelete)
            // The preview is gone from the screen now; record that immediately so
            // nobody deletes it a second time while we wait below.
            self.lastStreamedText = ""
        }
        return charsToDelete
    }

    guard let deletedCount else { return }

    // Wait for deletions to complete before pasting: ~2ms per char, min 20ms.
    // Awaiting here (instead of scheduling a delayed closure) keeps updates
    // serialized, because the stream loop does not deliver the next transcript
    // until this call returns.
    let deleteWaitTime = max(0.02, Double(deletedCount) * 0.002)
    try? await Task.sleep(nanoseconds: UInt64(deleteWaitTime * 1_000_000_000))

    // A cancelled update task means streaming is being finished or cancelled;
    // skip the paste so the screen and `lastStreamedText` both stay empty.
    guard !Task.isCancelled else { return }

    await MainActor.run {
        guard self.isStreamingActive else { return }

        self.lastStreamedText = newText
        CursorPaster.pasteAtCursor(newText)
        self.logger.notice("🎙️ Full replacement: '\(newText.prefix(30))...'")
    }
}
|
||||||
|
|
||||||
|
/// Ends the streaming preview session and reports the text it produced.
///
/// Stops the update task, detaches the recorder's audio callback, asks the
/// Parakeet service for its final text, deletes the on-screen preview, and
/// switches paste streaming mode off.
///
/// - Returns: `nil` when streaming is not active. Otherwise the service's final
///   text, or the last streamed preview text when the service returns an empty
///   string or throws.
private func finishStreamingTranscription() async -> String? {
    if !isStreamingActive {
        return nil
    }

    // Stop consuming transcript updates and stop feeding audio to the service.
    streamingUpdateTask?.cancel()
    streamingUpdateTask = nil
    recorder.setStreamingAudioCallback(nil)

    // Resolve the final text, preferring the service's answer over the preview.
    let resolvedText: String
    do {
        let serviceText = try await serviceRegistry.parakeetTranscriptionService.finishStreaming()
        if serviceText.isEmpty && !lastStreamedText.isEmpty {
            logger.warning("⚠️ EOU returned empty, using lastStreamedText fallback (\(self.lastStreamedText.count) chars)")
            resolvedText = lastStreamedText
        } else {
            resolvedText = serviceText
        }
    } catch {
        logger.error("❌ Failed to finish streaming: \(error.localizedDescription)")
        resolvedText = lastStreamedText
    }

    // Remove the streamed preview from the target app; the caller pastes the
    // batch result in its place.
    await MainActor.run {
        let previewLength = self.lastStreamedText.count
        if previewLength > 0 {
            CursorPaster.deleteCharacters(count: previewLength)
        }
    }

    isStreamingActive = false
    lastStreamedText = ""

    // Streaming mode off: clipboard operations can resume normally.
    CursorPaster.setStreamingMode(false)

    logger.notice("🎙️ Finished streaming transcription: \(resolvedText.count) characters")
    return resolvedText
}
|
||||||
|
|
||||||
|
/// Aborts streaming transcription and removes any preview text that was pasted.
///
/// Does nothing when streaming is not active. Otherwise it stops the update
/// task, detaches the audio callback, cancels the service session, deletes the
/// streamed preview, and switches paste streaming mode off.
private func cancelStreamingTranscription() async {
    if !isStreamingActive {
        return
    }

    streamingUpdateTask?.cancel()
    streamingUpdateTask = nil
    recorder.setStreamingAudioCallback(nil)

    await serviceRegistry.parakeetTranscriptionService.cancelStreaming()

    // Erase whatever preview text is still sitting in the target app.
    await MainActor.run {
        let previewLength = self.lastStreamedText.count
        if previewLength > 0 {
            CursorPaster.deleteCharacters(count: previewLength)
        }
    }

    isStreamingActive = false
    lastStreamedText = ""

    // Streaming mode off: clipboard operations can resume normally.
    CursorPaster.setStreamingMode(false)

    logger.notice("🎙️ Cancelled streaming transcription")
}
|
||||||
|
|
||||||
|
/// Handles completion of streaming transcription using the HYBRID approach:
/// 1. Streaming provided a real-time preview (low accuracy, fast).
/// 2. A BATCH transcription of the recorded file now produces the final text.
///
/// Steps: tear down streaming (which deletes the preview text), play the stop
/// sound, create and persist a pending `Transcription` record, run the batch
/// transcription, apply the output filter / formatting / word replacements,
/// optionally run AI enhancement, persist the result, then paste the final text
/// and dismiss the mini recorder.
///
/// When no recorded file exists the streaming session is cancelled and the
/// state returns to idle. When batch transcription fails the record is marked
/// failed and nothing is pasted.
private func handleStreamingCompletion() async {
    guard let recordedFile = recordedFile else {
        // Nothing to batch-transcribe, but the streaming session must still be torn
        // down: otherwise `isStreamingActive`, the recorder's audio callback and the
        // paste streaming mode would all stay switched on after this call.
        await cancelStreamingTranscription()
        await MainActor.run {
            recordingState = .idle
        }
        return
    }

    // Step 1: Clean up streaming and delete the preview text
    // We discard the streaming result and use batch transcription for accuracy
    _ = await finishStreamingTranscription()

    // Play stop sound
    Task {
        let isSystemMuteEnabled = UserDefaults.standard.bool(forKey: "isSystemMuteEnabled")
        if isSystemMuteEnabled {
            try? await Task.sleep(nanoseconds: 200_000_000)
        }
        await MainActor.run {
            SoundManager.shared.playStopSound()
        }
    }

    // Step 2: Switch to transcribing state for batch processing
    await MainActor.run {
        recordingState = .transcribing
    }

    logger.notice("🎙️ HYBRID: Streaming preview done, now running accurate batch transcription...")

    // Get audio duration (falls back to 0 when it cannot be loaded)
    let audioAsset = AVURLAsset(url: recordedFile)
    let duration = (try? CMTimeGetSeconds(await audioAsset.load(.duration))) ?? 0.0

    // Create transcription record
    let transcription = Transcription(
        text: "",
        duration: duration,
        audioFileURL: recordedFile.absoluteString,
        transcriptionStatus: .pending
    )
    modelContext.insert(transcription)
    try? modelContext.save()
    NotificationCenter.default.post(name: .transcriptionCreated, object: transcription)

    // Step 3: Run BATCH transcription for accurate result
    var text: String
    // Name of the model that actually produced the batch result; this can differ
    // from the selected model when Whisper turbo is substituted for Parakeet.
    let batchModelName: String?
    do {
        guard let model = currentTranscriptionModel else {
            throw WhisperStateError.transcriptionFailed
        }

        let (batchModel, usedWhisper) = hybridBatchModel(for: model)
        batchModelName = batchModel.displayName

        text = try await serviceRegistry.transcribe(audioURL: recordedFile, model: batchModel)
        logger.notice("🎙️ HYBRID: Batch transcription complete\(usedWhisper ? " (Whisper)" : ""): \(text.prefix(50))...")
    } catch {
        logger.error("❌ Batch transcription failed: \(error.localizedDescription)")
        transcription.text = "Transcription Failed: \(error.localizedDescription)"
        transcription.transcriptionStatus = TranscriptionStatus.failed.rawValue
        try? modelContext.save()
        await MainActor.run {
            recordingState = .idle
        }
        await dismissMiniRecorder()
        return
    }

    // Step 4: Apply post-processing pipeline
    text = TranscriptionOutputFilter.filter(text)

    // Formatting is on unless the user explicitly disabled it.
    let shouldFormatText = UserDefaults.standard.object(forKey: "EnableTextFormatting") as? Bool ?? true
    if shouldFormatText {
        text = WhisperTextFormatter.format(text)
    }

    text = WordReplacementService.shared.applyReplacements(to: text, using: modelContext)

    // Update transcription record
    transcription.text = text
    transcription.transcriptionModelName = batchModelName

    // AI Enhancement (if enabled); a failure here is logged and the plain text is used.
    var enhancedText: String?
    if let enhancementService = enhancementService,
       enhancementService.isEnhancementEnabled,
       enhancementService.isConfigured {
        await MainActor.run {
            recordingState = .enhancing
        }

        do {
            let (enhanced, enhancementDuration, promptName) = try await enhancementService.enhance(text)
            enhancedText = enhanced
            transcription.enhancedText = enhanced
            transcription.enhancementDuration = enhancementDuration
            transcription.promptName = promptName
        } catch {
            logger.error("❌ Enhancement failed: \(error.localizedDescription)")
        }
    }

    // Mark transcription as complete
    transcription.transcriptionStatus = TranscriptionStatus.completed.rawValue
    try? modelContext.save()

    NotificationCenter.default.post(name: .transcriptionCompleted, object: transcription)

    // Step 5: Paste the accurate final text
    let finalText = enhancedText ?? text
    await MainActor.run {
        DispatchQueue.main.asyncAfter(deadline: .now() + 0.05) {
            CursorPaster.pasteAtCursor(finalText + " ")

            // Auto-send if Power Mode enabled
            let powerMode = PowerModeManager.shared
            if let activeConfig = powerMode.currentActiveConfiguration,
               activeConfig.isAutoSendEnabled {
                CursorPaster.pressEnter()
            }
        }
    }

    await MainActor.run {
        recordingState = .idle
    }
    await dismissMiniRecorder()
}

/// Chooses the model for the batch pass of hybrid transcription.
///
/// When the selected model is a `ParakeetModel` and a local model whose name
/// contains "large-v3-turbo" is both listed and downloaded, that model is used
/// instead; in every other case the selected model is returned unchanged.
///
/// - Parameter selected: The currently selected transcription model.
/// - Returns: The model to transcribe with, and whether Whisper turbo was substituted.
private func hybridBatchModel(for selected: any TranscriptionModel) -> (model: any TranscriptionModel, usedWhisper: Bool) {
    guard selected is ParakeetModel,
          let turboModel = allAvailableModels.first(where: {
              $0.provider == .local && $0.name.contains("large-v3-turbo")
          }),
          // Listed is not enough: the model must actually be downloaded.
          availableModels.contains(where: { $0.name == turboModel.name })
    else {
        return (selected, false)
    }

    logger.notice("🎙️ HYBRID: Using Whisper turbo for accuracy: \(turboModel.name)")
    return (turboModel, true)
}
|
||||||
|
|
||||||
private func transcribeAudio(on transcription: Transcription) async {
|
private func transcribeAudio(on transcription: Transcription) async {
|
||||||
guard let urlString = transcription.audioFileURL, let url = URL(string: urlString) else {
|
guard let urlString = transcription.audioFileURL, let url = URL(string: urlString) else {
|
||||||
logger.error("❌ Invalid audio file URL in transcription object.")
|
logger.error("❌ Invalid audio file URL in transcription object.")
|
||||||
|
|||||||
0
default.profraw
Normal file
0
default.profraw
Normal file
Loading…
x
Reference in New Issue
Block a user