diff --git a/VoiceInk.xcodeproj/project.pbxproj b/VoiceInk.xcodeproj/project.pbxproj index f18f690..63a704e 100644 --- a/VoiceInk.xcodeproj/project.pbxproj +++ b/VoiceInk.xcodeproj/project.pbxproj @@ -459,7 +459,7 @@ "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; - CURRENT_PROJECT_VERSION = 152; + CURRENT_PROJECT_VERSION = 153; DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\""; DEVELOPMENT_TEAM = V6J6A3VWY2; ENABLE_HARDENED_RUNTIME = YES; @@ -474,7 +474,7 @@ "@executable_path/../Frameworks", ); MACOSX_DEPLOYMENT_TARGET = 14.0; - MARKETING_VERSION = 1.52; + MARKETING_VERSION = 1.53; PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)"; @@ -493,7 +493,7 @@ "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; - CURRENT_PROJECT_VERSION = 152; + CURRENT_PROJECT_VERSION = 153; DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\""; DEVELOPMENT_TEAM = V6J6A3VWY2; ENABLE_HARDENED_RUNTIME = YES; @@ -508,7 +508,7 @@ "@executable_path/../Frameworks", ); MACOSX_DEPLOYMENT_TARGET = 14.0; - MARKETING_VERSION = 1.52; + MARKETING_VERSION = 1.53; PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_ACTIVE_COMPILATION_CONDITIONS = "ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)"; diff --git a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index 886cfcd..2c29478 100644 --- a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -7,7 +7,7 @@ "location" : "https://github.com/FluidInference/FluidAudio", "state" : { "branch" : "main", - "revision" : 
"abf7d9ef3f53a693e3721069071971eff84c002f" + "revision" : "052cbb27cf073a9407251d74ef3459ea258e41b3" } }, { diff --git a/VoiceInk/AppDelegate.swift b/VoiceInk/AppDelegate.swift index bf27f48..20cd81e 100644 --- a/VoiceInk/AppDelegate.swift +++ b/VoiceInk/AppDelegate.swift @@ -1,5 +1,6 @@ import Cocoa import SwiftUI +import UniformTypeIdentifiers class AppDelegate: NSObject, NSApplicationDelegate { func applicationDidFinishLaunching(_ notification: Notification) { @@ -49,4 +50,28 @@ class AppDelegate: NSObject, NSApplicationDelegate { defaults.removeObject(forKey: "defaultPowerModeConfigV2") defaults.removeObject(forKey: "isPowerModeEnabled") } + + // Stash URL when app cold-starts to avoid spawning a new window/tab + var pendingOpenFileURL: URL? + + func application(_ application: NSApplication, open urls: [URL]) { + guard let url = urls.first(where: { SupportedMedia.isSupported(url: $0) }) else { + return + } + + NSApp.activate(ignoringOtherApps: true) + + if NSApp.windows.isEmpty { + // Cold start: do NOT create a window here to avoid extra window/tab. + // Defer to SwiftUIโ€™s WindowGroup-created ContentView and let it process this later. 
+ pendingOpenFileURL = url + } else { + // Running: focus current window and route in-place to Transcribe Audio + NSApp.windows.first?.makeKeyAndOrderFront(nil) + NotificationCenter.default.post(name: .navigateToDestination, object: nil, userInfo: ["destination": "Transcribe Audio"]) + DispatchQueue.main.async { + NotificationCenter.default.post(name: .openFileForTranscription, object: nil, userInfo: ["url": url]) + } + } + } } diff --git a/VoiceInk/HotkeyManager.swift b/VoiceInk/HotkeyManager.swift index 2eba0de..46787fd 100644 --- a/VoiceInk/HotkeyManager.swift +++ b/VoiceInk/HotkeyManager.swift @@ -154,13 +154,6 @@ class HotkeyManager: ObservableObject { } } - KeyboardShortcuts.onKeyUp(for: .pasteLastEnhancement) { [weak self] in - guard let self = self else { return } - Task { @MainActor in - LastTranscriptionService.pasteLastEnhancement(from: self.whisperState.modelContext) - } - } - KeyboardShortcuts.onKeyUp(for: .retryLastTranscription) { [weak self] in guard let self = self else { return } Task { @MainActor in @@ -442,4 +435,3 @@ class HotkeyManager: ObservableObject { } } } - diff --git a/VoiceInk/Info.plist b/VoiceInk/Info.plist index c29b98f..7833f1f 100644 --- a/VoiceInk/Info.plist +++ b/VoiceInk/Info.plist @@ -18,5 +18,33 @@ VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations. NSScreenCaptureUsageDescription VoiceInk needs screen recording access to understand context from your screen for improved transcription accuracy. 
+ CFBundleDocumentTypes + + + CFBundleTypeName + Audio/Video File + CFBundleTypeRole + Viewer + LSHandlerRank + Alternate + LSItemContentTypes + + public.audio + public.movie + + CFBundleTypeExtensions + + wav + mp3 + m4a + aiff + mp4 + mov + aac + flac + caf + + + diff --git a/VoiceInk/Notifications/AppNotifications.swift b/VoiceInk/Notifications/AppNotifications.swift index 0c1ae30..c23e055 100644 --- a/VoiceInk/Notifications/AppNotifications.swift +++ b/VoiceInk/Notifications/AppNotifications.swift @@ -14,4 +14,5 @@ extension Notification.Name { static let powerModeConfigurationApplied = Notification.Name("powerModeConfigurationApplied") static let transcriptionCreated = Notification.Name("transcriptionCreated") static let enhancementToggleChanged = Notification.Name("enhancementToggleChanged") + static let openFileForTranscription = Notification.Name("openFileForTranscription") } diff --git a/VoiceInk/PowerMode/PowerModeConfigView.swift b/VoiceInk/PowerMode/PowerModeConfigView.swift index a44566d..60fcb12 100644 --- a/VoiceInk/PowerMode/PowerModeConfigView.swift +++ b/VoiceInk/PowerMode/PowerModeConfigView.swift @@ -182,18 +182,16 @@ struct ConfigurationView: View { } // Default Power Mode Toggle - if !powerModeManager.hasDefaultConfiguration() || isCurrentConfigDefault { - HStack { - Toggle("Set as default power mode", isOn: $isDefault) - .font(.system(size: 14)) - - InfoTip( - title: "Default Power Mode", - message: "Default power mode is used when no specific app or website matches are found" - ) - - Spacer() - } + HStack { + Toggle("Set as default power mode", isOn: $isDefault) + .font(.system(size: 14)) + + InfoTip( + title: "Default Power Mode", + message: "Default power mode is used when no specific app or website matches are found" + ) + + Spacer() } } .padding(.horizontal, 20) diff --git a/VoiceInk/PowerMode/PowerModeViewComponents.swift b/VoiceInk/PowerMode/PowerModeViewComponents.swift index 98222dc..3559734 100644 --- 
a/VoiceInk/PowerMode/PowerModeViewComponents.swift +++ b/VoiceInk/PowerMode/PowerModeViewComponents.swift @@ -203,7 +203,7 @@ struct ConfigurationRow: View { .padding(.vertical, 12) .padding(.horizontal, 14) - if selectedModel != nil || selectedLanguage != nil || config.isAIEnhancementEnabled { + if selectedModel != nil || selectedLanguage != nil || config.isAIEnhancementEnabled || config.isAutoSendEnabled { Divider() .padding(.horizontal, 16) @@ -259,6 +259,22 @@ struct ConfigurationRow: View { ) } + if config.isAutoSendEnabled { + HStack(spacing: 4) { + Image(systemName: "keyboard") + .font(.system(size: 10)) + Text("Auto Send") + .font(.caption) + } + .padding(.horizontal, 8) + .padding(.vertical, 4) + .background(Capsule() + .fill(Color(NSColor.controlBackgroundColor))) + .overlay( + Capsule() + .stroke(Color(NSColor.separatorColor), lineWidth: 0.5) + ) + } if config.isAIEnhancementEnabled { if config.useScreenCapture { HStack(spacing: 4) { @@ -289,7 +305,7 @@ struct ConfigurationRow: View { .fill(Color.accentColor.opacity(0.1))) .foregroundColor(.accentColor) } - + Spacer() } .padding(.vertical, 10) @@ -376,4 +392,4 @@ struct AppGridItem: View { } .buttonStyle(.plain) } -} +} diff --git a/VoiceInk/Recorder.swift b/VoiceInk/Recorder.swift index 6cc1d01..b683f7b 100644 --- a/VoiceInk/Recorder.swift +++ b/VoiceInk/Recorder.swift @@ -76,15 +76,7 @@ class Recorder: NSObject, ObservableObject, AVAudioRecorderDelegate { UserDefaults.standard.set(String(currentDeviceID), forKey: "lastUsedMicrophoneDeviceID") hasDetectedAudioInCurrentSession = false - - // Coordinate media control and system audio sequentially for better reliability - await playbackController.pauseMedia() - - // Small delay to allow media command to process before muting system audio - try? 
await Task.sleep(nanoseconds: 100_000_000) // 100ms - - _ = await mediaController.muteSystemAudio() - + let deviceID = deviceManager.getCurrentDevice() if deviceID != 0 { do { @@ -114,6 +106,12 @@ class Recorder: NSObject, ObservableObject, AVAudioRecorderDelegate { throw RecorderError.couldNotStartRecording } + Task { [weak self] in + guard let self = self else { return } + await self.playbackController.pauseMedia() + _ = await self.mediaController.muteSystemAudio() + } + audioLevelCheckTask?.cancel() audioMeterUpdateTask?.cancel() diff --git a/VoiceInk/Services/AIEnhancementService.swift b/VoiceInk/Services/AIEnhancementService.swift index 7ae13d8..a9d4c05 100644 --- a/VoiceInk/Services/AIEnhancementService.swift +++ b/VoiceInk/Services/AIEnhancementService.swift @@ -261,6 +261,8 @@ class AIEnhancementService: ObservableObject { let filteredText = AIEnhancementOutputFilter.filter(enhancedText.trimmingCharacters(in: .whitespacesAndNewlines)) return filteredText + } else if httpResponse.statusCode == 429 { + throw EnhancementError.rateLimitExceeded } else if (500...599).contains(httpResponse.statusCode) { throw EnhancementError.serverError } else { @@ -316,6 +318,8 @@ class AIEnhancementService: ObservableObject { let filteredText = AIEnhancementOutputFilter.filter(enhancedText.trimmingCharacters(in: .whitespacesAndNewlines)) return filteredText + } else if httpResponse.statusCode == 429 { + throw EnhancementError.rateLimitExceeded } else if (500...599).contains(httpResponse.statusCode) { throw EnhancementError.serverError } else { @@ -342,7 +346,7 @@ class AIEnhancementService: ObservableObject { return try await makeRequest(text: text, mode: mode) } catch let error as EnhancementError { switch error { - case .networkError, .serverError: + case .networkError, .serverError, .rateLimitExceeded: retries += 1 if retries < maxRetries { logger.warning("Request failed, retrying in \(currentDelay)s... 
(Attempt \(retries)/\(maxRetries))") @@ -458,6 +462,7 @@ enum EnhancementError: Error { case enhancementFailed case networkError case serverError + case rateLimitExceeded case customError(String) } @@ -474,6 +479,8 @@ extension EnhancementError: LocalizedError { return "Network connection failed. Check your internet." case .serverError: return "The AI provider's server encountered an error. Please try again later." + case .rateLimitExceeded: + return "Rate limit exceeded. Please try again later." case .customError(let message): return message } diff --git a/VoiceInk/Services/AIService.swift b/VoiceInk/Services/AIService.swift index b6b6799..da94353 100644 --- a/VoiceInk/Services/AIService.swift +++ b/VoiceInk/Services/AIService.swift @@ -82,7 +82,7 @@ enum AIProvider: String, CaseIterable { case .groq: return [ "llama-3.3-70b-versatile", - "moonshotai/kimi-k2-instruct", + "moonshotai/kimi-k2-instruct-0905", "qwen/qwen3-32b", "meta-llama/llama-4-maverick-17b-128e-instruct", "openai/gpt-oss-120b" diff --git a/VoiceInk/Services/DictionaryContextService.swift b/VoiceInk/Services/DictionaryContextService.swift index 6e51b9a..9086112 100644 --- a/VoiceInk/Services/DictionaryContextService.swift +++ b/VoiceInk/Services/DictionaryContextService.swift @@ -6,7 +6,7 @@ class DictionaryContextService { private init() {} - private let predefinedWords = "VoiceInk, chatGPT, GPT-4o, GPT-5-mini, Kimi-K2, GLM V4.5, Claude, Claude 4 sonnet, Claude opus, ultrathink, Vibe-coding, groq, cerebras, gpt-oss-120B, Wispr flow, deepseek, gemini-2.5, Veo 3, elevenlabs, Kyutai" + private let predefinedWords = "VoiceInk, chatGPT, GPT-4o, GPT-5-mini, Kimi-K2, GLM V4.5, Claude, Claude 4 sonnet, Claude opus, ultrathink, Vibe-coding, groq, cerebras, gpt-oss-120B, deepseek, gemini-2.5, Veo 3, elevenlabs, Kyutai" func getDictionaryContext() -> String { var allWords: [String] = [] diff --git a/VoiceInk/Services/ImportExportService.swift b/VoiceInk/Services/ImportExportService.swift index 
2b08313..1578ffc 100644 --- a/VoiceInk/Services/ImportExportService.swift +++ b/VoiceInk/Services/ImportExportService.swift @@ -7,6 +7,7 @@ import LaunchAtLogin struct GeneralSettings: Codable { let toggleMiniRecorderShortcut: KeyboardShortcuts.Shortcut? let toggleMiniRecorderShortcut2: KeyboardShortcuts.Shortcut? + let retryLastTranscriptionShortcut: KeyboardShortcuts.Shortcut? let selectedHotkey1RawValue: String? let selectedHotkey2RawValue: String? let launchAtLoginEnabled: Bool? @@ -86,6 +87,7 @@ class ImportExportService { let generalSettingsToExport = GeneralSettings( toggleMiniRecorderShortcut: KeyboardShortcuts.getShortcut(for: .toggleMiniRecorder), toggleMiniRecorderShortcut2: KeyboardShortcuts.getShortcut(for: .toggleMiniRecorder2), + retryLastTranscriptionShortcut: KeyboardShortcuts.getShortcut(for: .retryLastTranscription), selectedHotkey1RawValue: hotkeyManager.selectedHotkey1.rawValue, selectedHotkey2RawValue: hotkeyManager.selectedHotkey2.rawValue, launchAtLoginEnabled: LaunchAtLogin.isEnabled, @@ -218,6 +220,9 @@ class ImportExportService { if let shortcut2 = general.toggleMiniRecorderShortcut2 { KeyboardShortcuts.setShortcut(shortcut2, for: .toggleMiniRecorder2) } + if let retryShortcut = general.retryLastTranscriptionShortcut { + KeyboardShortcuts.setShortcut(retryShortcut, for: .retryLastTranscription) + } if let hotkeyRaw = general.selectedHotkey1RawValue, let hotkey = HotkeyManager.HotkeyOption(rawValue: hotkeyRaw) { hotkeyManager.selectedHotkey1 = hotkey diff --git a/VoiceInk/Services/ParakeetTranscriptionService.swift b/VoiceInk/Services/ParakeetTranscriptionService.swift index f3e4a09..cd0d315 100644 --- a/VoiceInk/Services/ParakeetTranscriptionService.swift +++ b/VoiceInk/Services/ParakeetTranscriptionService.swift @@ -71,13 +71,60 @@ class ParakeetTranscriptionService: TranscriptionService { let audioSamples = try readAudioSamples(from: audioURL) - // Validate audio data before transcription - guard audioSamples.count >= 16000 else { - 
logger.notice("🦜 Audio too short for transcription: \(audioSamples.count) samples") + // Validate audio data before VAD + guard !audioSamples.isEmpty else { + logger.notice("🦜 Audio is empty, skipping transcription.") + throw ASRError.invalidAudioData + } + + // Use VAD to get speech segments + var speechAudio: [Float] = [] + let isVADEnabled = UserDefaults.standard.object(forKey: "IsVADEnabled") as? Bool ?? true + + if isVADEnabled { + if let modelPath = await VADModelManager.shared.getModelPath() { + if let vad = VoiceActivityDetector(modelPath: modelPath) { + let speechSegments = vad.process(audioSamples: audioSamples) + logger.notice("🦜 VAD detected \(speechSegments.count) speech segments.") + + let sampleRate = 16000 // Assuming 16kHz sample rate + for segment in speechSegments { + let startSample = Int(segment.start * Double(sampleRate)) + var endSample = Int(segment.end * Double(sampleRate)) + + // Cap endSample to the audio buffer size + if endSample > audioSamples.count { + endSample = audioSamples.count + } + + if startSample < endSample { + speechAudio.append(contentsOf: audioSamples[startSample..= 16000 else { + logger.notice("🦜 Audio too short for transcription after VAD: \(speechAudio.count) samples") throw ASRError.invalidAudioData } - let result = try await asrManager.transcribe(audioSamples) + let result = try await asrManager.transcribe(speechAudio) + print(result.text) // Reset decoder state and cleanup after transcription to avoid blocking the transcription start Task { @@ -91,10 +138,16 @@ class ParakeetTranscriptionService: TranscriptionService { logger.notice("🦜 Warning: Empty transcription result for \(audioSamples.count) samples - possible vocabulary issue") } + var text = result.text + if UserDefaults.standard.object(forKey: "IsTextFormattingEnabled") as? Bool ?? 
true { - return WhisperTextFormatter.format(result.text) + text = WhisperTextFormatter.format(text) } - return result.text + + // Apply hallucination and filler word filtering + text = WhisperHallucinationFilter.filter(text) + + return text } private func readAudioSamples(from url: URL) throws -> [Float] { diff --git a/VoiceInk/Services/SupportedMedia.swift b/VoiceInk/Services/SupportedMedia.swift new file mode 100644 index 0000000..c66c452 --- /dev/null +++ b/VoiceInk/Services/SupportedMedia.swift @@ -0,0 +1,28 @@ +import Foundation +import UniformTypeIdentifiers + +struct SupportedMedia { + static let extensions: Set = [ + "wav", "mp3", "m4a", "aiff", "mp4", "mov", "aac", "flac", "caf" + ] + + static let contentTypes: [UTType] = [ + .audio, .movie + ] + + static func isSupported(url: URL) -> Bool { + let fileExtension = url.pathExtension.lowercased() + if !fileExtension.isEmpty, extensions.contains(fileExtension) { + return true + } + + if let resourceValues = try? url.resourceValues(forKeys: [.contentTypeKey]), + let contentType = resourceValues.contentType { + return contentTypes.contains(where: { contentType.conforms(to: $0) }) + } + + return false + } +} + + diff --git a/VoiceInk/Services/VoiceActivityDetector.swift b/VoiceInk/Services/VoiceActivityDetector.swift new file mode 100644 index 0000000..67cd0f3 --- /dev/null +++ b/VoiceInk/Services/VoiceActivityDetector.swift @@ -0,0 +1,88 @@ +import Foundation +import AVFoundation +import os.log +#if canImport(whisper) +import whisper +#else +#error("Unable to import whisper module. 
Please check your project configuration.") +#endif + +// MARK: - C API Bridge + +// Opaque pointers for the C contexts +fileprivate typealias WhisperVADContext = OpaquePointer +fileprivate typealias WhisperVADSegments = OpaquePointer + + +// MARK: - VoiceActivityDetector Class + +class VoiceActivityDetector { + private var vadContext: WhisperVADContext + private let logger = Logger(subsystem: "com.voiceink.app", category: "VoiceActivityDetector") + + init?(modelPath: String) { + var contextParams = whisper_vad_default_context_params() + contextParams.n_threads = max(1, min(8, Int32(ProcessInfo.processInfo.processorCount) - 2)) + + let contextOpt: WhisperVADContext? = modelPath.withCString { cPath in + whisper_vad_init_from_file_with_params(cPath, contextParams) + } + + guard let context = contextOpt else { + logger.error("Failed to initialize VAD context.") + return nil + } + self.vadContext = context + logger.notice("VAD context initialized successfully.") + } + + deinit { + whisper_vad_free(vadContext) + logger.notice("VAD context freed.") + } + + /// Processes audio samples to detect speech segments and returns an array of (start: TimeInterval, end: TimeInterval) tuples. + func process(audioSamples: [Float]) -> [(start: TimeInterval, end: TimeInterval)] { + // 1. Detect speech and get probabilities internally in the context + let success = audioSamples.withUnsafeBufferPointer { buffer in + whisper_vad_detect_speech(vadContext, buffer.baseAddress!, Int32(audioSamples.count)) + } + + guard success else { + logger.error("Failed to detect speech probabilities.") + return [] + } + + // 2. 
Get segments from probabilities + var vadParams = whisper_vad_default_params() + vadParams.threshold = 0.45 + vadParams.min_speech_duration_ms = 150 + vadParams.min_silence_duration_ms = 750 + vadParams.max_speech_duration_s = Float.greatestFiniteMagnitude // Use the largest representable Float value for no max duration + vadParams.speech_pad_ms = 100 + vadParams.samples_overlap = 0.1 // Add samples_overlap parameter + + guard let segments = whisper_vad_segments_from_probs(vadContext, vadParams) else { + logger.error("Failed to get VAD segments from probabilities.") + return [] + } + defer { + // Ensure segments are freed + whisper_vad_free_segments(segments) + } + + let nSegments = whisper_vad_segments_n_segments(segments) + logger.notice("Detected \(nSegments) speech segments.") + + var speechSegments: [(start: TimeInterval, end: TimeInterval)] = [] + for i in 0.. replacement HStack(spacing: 12) { VStack(alignment: .leading, spacing: 4) { Text("Original:") @@ -280,6 +286,34 @@ struct AddReplacementSheet: View { .font(.callout) } } + .frame(maxWidth: .infinity, alignment: .leading) + .padding(12) + .background(Color(.textBackgroundColor)) + .cornerRadius(8) + + // Comma-separated originals -> single replacement + HStack(spacing: 12) { + VStack(alignment: .leading, spacing: 4) { + Text("Original:") + .font(.caption) + .foregroundColor(.secondary) + Text("Voicing, Voice ink, Voiceing") + .font(.callout) + } + + Image(systemName: "arrow.right") + .font(.caption) + .foregroundColor(.secondary) + + VStack(alignment: .leading, spacing: 4) { + Text("Replacement:") + .font(.caption) + .foregroundColor(.secondary) + Text("VoiceInk") + .font(.callout) + } + } + .frame(maxWidth: .infinity, alignment: .leading) .padding(12) .background(Color(.textBackgroundColor)) .cornerRadius(8) @@ -290,14 +324,19 @@ struct AddReplacementSheet: View { .padding(.vertical) } } - .frame(width: 460, height: 480) + .frame(width: 460, height: 520) } private func addReplacement() { let original = 
originalWord let replacement = replacementWord - guard !original.isEmpty && !replacement.isEmpty else { return } + // Validate that at least one non-empty token exists + let tokens = original + .split(separator: ",") + .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) } + .filter { !$0.isEmpty } + guard !tokens.isEmpty && !replacement.isEmpty else { return } manager.addReplacement(original: original, replacement: replacement) dismiss() diff --git a/VoiceInk/Views/EnhancementSettingsView.swift b/VoiceInk/Views/EnhancementSettingsView.swift index 714d677..4026b5e 100644 --- a/VoiceInk/Views/EnhancementSettingsView.swift +++ b/VoiceInk/Views/EnhancementSettingsView.swift @@ -1,4 +1,5 @@ import SwiftUI +import UniformTypeIdentifiers struct EnhancementSettingsView: View { @EnvironmentObject private var enhancementService: AIEnhancementService @@ -79,25 +80,22 @@ struct EnhancementSettingsView: View { Text("Enhancement Prompt") .font(.headline) - // Prompts Section - VStack(alignment: .leading, spacing: 12) { - PromptSelectionGrid( - prompts: enhancementService.allPrompts, - selectedPromptId: enhancementService.selectedPromptId, - onPromptSelected: { prompt in - enhancementService.setActivePrompt(prompt) - }, - onEditPrompt: { prompt in - selectedPromptForEdit = prompt - }, - onDeletePrompt: { prompt in - enhancementService.deletePrompt(prompt) - }, - onAddNewPrompt: { - isEditingPrompt = true - } - ) - } + // Reorderable prompts grid with drag-and-drop + ReorderablePromptGrid( + selectedPromptId: enhancementService.selectedPromptId, + onPromptSelected: { prompt in + enhancementService.setActivePrompt(prompt) + }, + onEditPrompt: { prompt in + selectedPromptForEdit = prompt + }, + onDeletePrompt: { prompt in + enhancementService.deletePrompt(prompt) + }, + onAddNewPrompt: { + isEditingPrompt = true + } + ) } .padding() .background(CardBackground(isSelected: false)) @@ -115,3 +113,151 @@ struct EnhancementSettingsView: View { } } } + +// MARK: - Drag & Drop 
Reorderable Grid +private struct ReorderablePromptGrid: View { + @EnvironmentObject private var enhancementService: AIEnhancementService + + let selectedPromptId: UUID? + let onPromptSelected: (CustomPrompt) -> Void + let onEditPrompt: ((CustomPrompt) -> Void)? + let onDeletePrompt: ((CustomPrompt) -> Void)? + let onAddNewPrompt: (() -> Void)? + + @State private var draggingItem: CustomPrompt? + + var body: some View { + VStack(alignment: .leading, spacing: 12) { + if enhancementService.customPrompts.isEmpty { + Text("No prompts available") + .foregroundColor(.secondary) + .font(.caption) + } else { + let columns = [ + GridItem(.adaptive(minimum: 80, maximum: 100), spacing: 36) + ] + + LazyVGrid(columns: columns, spacing: 16) { + ForEach(enhancementService.customPrompts) { prompt in + prompt.promptIcon( + isSelected: selectedPromptId == prompt.id, + onTap: { + withAnimation(.spring(response: 0.3, dampingFraction: 0.7)) { + onPromptSelected(prompt) + } + }, + onEdit: onEditPrompt, + onDelete: onDeletePrompt + ) + .opacity(draggingItem?.id == prompt.id ? 0.3 : 1.0) + .scaleEffect(draggingItem?.id == prompt.id ? 1.05 : 1.0) + .overlay( + RoundedRectangle(cornerRadius: 14) + .stroke( + draggingItem != nil && draggingItem?.id != prompt.id + ? 
Color.accentColor.opacity(0.25) + : Color.clear, + lineWidth: 1 + ) + ) + .animation(.easeInOut(duration: 0.15), value: draggingItem?.id == prompt.id) + .onDrag { + draggingItem = prompt + return NSItemProvider(object: prompt.id.uuidString as NSString) + } + .onDrop( + of: [UTType.text], + delegate: PromptDropDelegate( + item: prompt, + prompts: $enhancementService.customPrompts, + draggingItem: $draggingItem + ) + ) + } + + if let onAddNewPrompt = onAddNewPrompt { + CustomPrompt.addNewButton { + onAddNewPrompt() + } + .help("Add new prompt") + .onDrop( + of: [UTType.text], + delegate: PromptEndDropDelegate( + prompts: $enhancementService.customPrompts, + draggingItem: $draggingItem + ) + ) + } + } + .padding(.vertical, 12) + .padding(.horizontal, 16) + + HStack { + Image(systemName: "info.circle") + .font(.caption) + .foregroundColor(.secondary) + + Text("Double-click to edit • Right-click for more options") + .font(.caption) + .foregroundColor(.secondary) + } + .padding(.top, 8) + .padding(.horizontal, 16) + } + } + } +} + +// MARK: - Drop Delegates +private struct PromptDropDelegate: DropDelegate { + let item: CustomPrompt + @Binding var prompts: [CustomPrompt] + @Binding var draggingItem: CustomPrompt? + + func dropEntered(info: DropInfo) { + guard let draggingItem = draggingItem, draggingItem != item else { return } + guard let fromIndex = prompts.firstIndex(of: draggingItem), + let toIndex = prompts.firstIndex(of: item) else { return } + + // Move item as you hover for immediate visual update + if prompts[toIndex].id != draggingItem.id { + withAnimation(.easeInOut(duration: 0.12)) { + let from = fromIndex + let to = toIndex + prompts.move(fromOffsets: IndexSet(integer: from), toOffset: to > from ? to + 1 : to) + } + } + } + + func dropUpdated(info: DropInfo) -> DropProposal? 
{ + DropProposal(operation: .move) + } + + func performDrop(info: DropInfo) -> Bool { + draggingItem = nil + return true + } +} + +private struct PromptEndDropDelegate: DropDelegate { + @Binding var prompts: [CustomPrompt] + @Binding var draggingItem: CustomPrompt? + + func validateDrop(info: DropInfo) -> Bool { true } + func dropUpdated(info: DropInfo) -> DropProposal? { DropProposal(operation: .move) } + + func performDrop(info: DropInfo) -> Bool { + guard let draggingItem = draggingItem, + let currentIndex = prompts.firstIndex(of: draggingItem) else { + self.draggingItem = nil + return false + } + + // Move to end if dropped on the trailing "Add New" tile + withAnimation(.easeInOut(duration: 0.12)) { + prompts.move(fromOffsets: IndexSet(integer: currentIndex), toOffset: prompts.endIndex) + } + self.draggingItem = nil + return true + } +} diff --git a/VoiceInk/Views/ModelSettingsView.swift b/VoiceInk/Views/ModelSettingsView.swift index 0af2337..5a1a06d 100644 --- a/VoiceInk/Views/ModelSettingsView.swift +++ b/VoiceInk/Views/ModelSettingsView.swift @@ -99,7 +99,7 @@ struct ModelSettingsView: View { InfoTip( title: "Voice Activity Detection", - message: "Detects speech segments and filters out silence to reduce hallucinations in local Whisper models." + message: "Detect speech segments and filter out silence to improve accuracy of local models." 
) } diff --git a/VoiceInk/Views/Settings/ExperimentalFeaturesSection.swift b/VoiceInk/Views/Settings/ExperimentalFeaturesSection.swift index 900af55..b0479b9 100644 --- a/VoiceInk/Views/Settings/ExperimentalFeaturesSection.swift +++ b/VoiceInk/Views/Settings/ExperimentalFeaturesSection.swift @@ -37,7 +37,7 @@ struct ExperimentalFeaturesSection: View { if isExperimentalFeaturesEnabled { Toggle(isOn: $playbackController.isPauseMediaEnabled) { - Text("Pause Media on Playback") + Text("Pause Media during recording") } .toggleStyle(.switch) .help("Automatically pause active media playback during recordings and resume afterward.") diff --git a/VoiceInk/Views/Settings/SettingsView.swift b/VoiceInk/Views/Settings/SettingsView.swift index 211218c..54c219f 100644 --- a/VoiceInk/Views/Settings/SettingsView.swift +++ b/VoiceInk/Views/Settings/SettingsView.swift @@ -130,6 +130,8 @@ struct SettingsView: View { Divider() + + // Custom Cancel Shortcut VStack(alignment: .leading, spacing: 12) { HStack(spacing: 8) { diff --git a/VoiceInk/VoiceInk.swift b/VoiceInk/VoiceInk.swift index 549a500..af22cfd 100644 --- a/VoiceInk/VoiceInk.swift +++ b/VoiceInk/VoiceInk.swift @@ -114,6 +114,15 @@ struct VoiceInkApp: App { if !UserDefaults.standard.bool(forKey: "IsTranscriptionCleanupEnabled") { audioCleanupManager.startAutomaticCleanup(modelContext: container.mainContext) } + + // Process any pending open-file request now that the main ContentView is ready. 
+ if let pendingURL = appDelegate.pendingOpenFileURL { + NotificationCenter.default.post(name: .navigateToDestination, object: nil, userInfo: ["destination": "Transcribe Audio"]) + DispatchQueue.main.asyncAfter(deadline: .now() + 0.3) { + NotificationCenter.default.post(name: .openFileForTranscription, object: nil, userInfo: ["url": pendingURL]) + } + appDelegate.pendingOpenFileURL = nil + } } .background(WindowAccessor { window in WindowManager.shared.configureWindow(window) diff --git a/VoiceInk/Whisper/WhisperHallucinationFilter.swift b/VoiceInk/Whisper/WhisperHallucinationFilter.swift index f95d59d..bea80b3 100644 --- a/VoiceInk/Whisper/WhisperHallucinationFilter.swift +++ b/VoiceInk/Whisper/WhisperHallucinationFilter.swift @@ -4,49 +4,48 @@ import os struct WhisperHallucinationFilter { private static let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "WhisperHallucinationFilter") - // Pattern-based approach for detecting hallucinations - focusing on format indicators private static let hallucinationPatterns = [ - // Text in various types of brackets - the most reliable hallucination indicators - #"\[.*?\]"#, // [Text in square brackets] - #"\(.*?\)"#, // (Text in parentheses) - #"\{.*?\}"#, // {Text in curly braces} - #"<.*?>"#, // - - // Text with special formatting - #"\*.*?\*"#, // *Text with asterisks* - #"_.*?_"#, // _Text with underscores_ - - // Time indicators often added by Whisper - #"(?i)\d{1,2}:\d{2}(:\d{2})?\s*-\s*\d{1,2}:\d{2}(:\d{2})?"# // 00:00 - 00:00 format + #"\[.*?\]"#, // Square brackets + #"\(.*?\)"#, // Parentheses + #"\{.*?\}"# // Curly braces + ] + + private static let fillerWords = [ + "uh", "um", "uhm", "umm", "uhh", "uhhh", "er", "ah", "eh", + "hmm", "hm", "h", "m", "mmm", "mm", "mh", "ha", "ehh" ] - - /// Removes hallucinations from transcription text using pattern matching - /// - Parameter text: Original transcription text from Whisper - /// - Returns: Filtered text with hallucinations removed static func 
filter(_ text: String) -> String { - logger.notice("🧹 Applying pattern-based hallucination filter to transcription") - + logger.notice("🧹 Filtering hallucinations and filler words") var filteredText = text - - // Remove pattern-based hallucinations + + // Remove bracketed hallucinations for pattern in hallucinationPatterns { if let regex = try? NSRegularExpression(pattern: pattern) { let range = NSRange(filteredText.startIndex..., in: filteredText) filteredText = regex.stringByReplacingMatches(in: filteredText, options: [], range: range, withTemplate: "") } } - - // Clean up extra whitespace and newlines that might be left after removing hallucinations + + // Remove filler words + for fillerWord in fillerWords { + let pattern = "\\b\(NSRegularExpression.escapedPattern(for: fillerWord))\\b[,.]?" + if let regex = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) { + let range = NSRange(filteredText.startIndex..., in: filteredText) + filteredText = regex.stringByReplacingMatches(in: filteredText, options: [], range: range, withTemplate: "") + } + } + + // Clean whitespace filteredText = filteredText.replacingOccurrences(of: #"\s{2,}"#, with: " ", options: .regularExpression) filteredText = filteredText.trimmingCharacters(in: .whitespacesAndNewlines) - - // Add logging to track effectiveness + + // Log results if filteredText != text { - logger.notice("✅ Removed hallucinations using pattern matching") + logger.notice("✅ Removed hallucinations and filler words") } else { - logger.notice("✅ No hallucinations detected with pattern matching") + logger.notice("✅ No hallucinations or filler words found") } - + return filteredText } } \ No newline at end of file