diff --git a/VoiceInk.xcodeproj/project.pbxproj b/VoiceInk.xcodeproj/project.pbxproj
index f18f690..63a704e 100644
--- a/VoiceInk.xcodeproj/project.pbxproj
+++ b/VoiceInk.xcodeproj/project.pbxproj
@@ -459,7 +459,7 @@
"CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development";
CODE_SIGN_STYLE = Automatic;
COMBINE_HIDPI_IMAGES = YES;
- CURRENT_PROJECT_VERSION = 152;
+ CURRENT_PROJECT_VERSION = 153;
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
DEVELOPMENT_TEAM = V6J6A3VWY2;
ENABLE_HARDENED_RUNTIME = YES;
@@ -474,7 +474,7 @@
"@executable_path/../Frameworks",
);
MACOSX_DEPLOYMENT_TARGET = 14.0;
- MARKETING_VERSION = 1.52;
+ MARKETING_VERSION = 1.53;
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";
@@ -493,7 +493,7 @@
"CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development";
CODE_SIGN_STYLE = Automatic;
COMBINE_HIDPI_IMAGES = YES;
- CURRENT_PROJECT_VERSION = 152;
+ CURRENT_PROJECT_VERSION = 153;
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
DEVELOPMENT_TEAM = V6J6A3VWY2;
ENABLE_HARDENED_RUNTIME = YES;
@@ -508,7 +508,7 @@
"@executable_path/../Frameworks",
);
MACOSX_DEPLOYMENT_TARGET = 14.0;
- MARKETING_VERSION = 1.52;
+ MARKETING_VERSION = 1.53;
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";
diff --git a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
index 886cfcd..2c29478 100644
--- a/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
+++ b/VoiceInk.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
@@ -7,7 +7,7 @@
"location" : "https://github.com/FluidInference/FluidAudio",
"state" : {
"branch" : "main",
- "revision" : "abf7d9ef3f53a693e3721069071971eff84c002f"
+ "revision" : "052cbb27cf073a9407251d74ef3459ea258e41b3"
}
},
{
diff --git a/VoiceInk/AppDelegate.swift b/VoiceInk/AppDelegate.swift
index bf27f48..20cd81e 100644
--- a/VoiceInk/AppDelegate.swift
+++ b/VoiceInk/AppDelegate.swift
@@ -1,5 +1,6 @@
import Cocoa
import SwiftUI
+import UniformTypeIdentifiers
class AppDelegate: NSObject, NSApplicationDelegate {
func applicationDidFinishLaunching(_ notification: Notification) {
@@ -49,4 +50,28 @@ class AppDelegate: NSObject, NSApplicationDelegate {
defaults.removeObject(forKey: "defaultPowerModeConfigV2")
defaults.removeObject(forKey: "isPowerModeEnabled")
}
+
+ // Stash URL when app cold-starts to avoid spawning a new window/tab
+ var pendingOpenFileURL: URL?
+
+ func application(_ application: NSApplication, open urls: [URL]) {
+ guard let url = urls.first(where: { SupportedMedia.isSupported(url: $0) }) else {
+ return
+ }
+
+ NSApp.activate(ignoringOtherApps: true)
+
+ if NSApp.windows.isEmpty {
+ // Cold start: do NOT create a window here to avoid extra window/tab.
+ // Defer to SwiftUI's WindowGroup-created ContentView and let it process this later.
+ pendingOpenFileURL = url
+ } else {
+ // Running: focus current window and route in-place to Transcribe Audio
+ NSApp.windows.first?.makeKeyAndOrderFront(nil)
+ NotificationCenter.default.post(name: .navigateToDestination, object: nil, userInfo: ["destination": "Transcribe Audio"])
+ DispatchQueue.main.async {
+ NotificationCenter.default.post(name: .openFileForTranscription, object: nil, userInfo: ["url": url])
+ }
+ }
+ }
}
diff --git a/VoiceInk/HotkeyManager.swift b/VoiceInk/HotkeyManager.swift
index 2eba0de..46787fd 100644
--- a/VoiceInk/HotkeyManager.swift
+++ b/VoiceInk/HotkeyManager.swift
@@ -154,13 +154,6 @@ class HotkeyManager: ObservableObject {
}
}
- KeyboardShortcuts.onKeyUp(for: .pasteLastEnhancement) { [weak self] in
- guard let self = self else { return }
- Task { @MainActor in
- LastTranscriptionService.pasteLastEnhancement(from: self.whisperState.modelContext)
- }
- }
-
KeyboardShortcuts.onKeyUp(for: .retryLastTranscription) { [weak self] in
guard let self = self else { return }
Task { @MainActor in
@@ -442,4 +435,3 @@ class HotkeyManager: ObservableObject {
}
}
}
-
diff --git a/VoiceInk/Info.plist b/VoiceInk/Info.plist
index c29b98f..7833f1f 100644
--- a/VoiceInk/Info.plist
+++ b/VoiceInk/Info.plist
@@ -18,5 +18,33 @@
<string>VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations.</string>
<key>NSScreenCaptureUsageDescription</key>
<string>VoiceInk needs screen recording access to understand context from your screen for improved transcription accuracy.</string>
+ <key>CFBundleDocumentTypes</key>
+ <array>
+ <dict>
+ <key>CFBundleTypeName</key>
+ <string>Audio/Video File</string>
+ <key>CFBundleTypeRole</key>
+ <string>Viewer</string>
+ <key>LSHandlerRank</key>
+ <string>Alternate</string>
+ <key>LSItemContentTypes</key>
+ <array>
+ <string>public.audio</string>
+ <string>public.movie</string>
+ </array>
+ <key>CFBundleTypeExtensions</key>
+ <array>
+ <string>wav</string>
+ <string>mp3</string>
+ <string>m4a</string>
+ <string>aiff</string>
+ <string>mp4</string>
+ <string>mov</string>
+ <string>aac</string>
+ <string>flac</string>
+ <string>caf</string>
+ </array>
+ </dict>
+ </array>
diff --git a/VoiceInk/Notifications/AppNotifications.swift b/VoiceInk/Notifications/AppNotifications.swift
index 0c1ae30..c23e055 100644
--- a/VoiceInk/Notifications/AppNotifications.swift
+++ b/VoiceInk/Notifications/AppNotifications.swift
@@ -14,4 +14,5 @@ extension Notification.Name {
static let powerModeConfigurationApplied = Notification.Name("powerModeConfigurationApplied")
static let transcriptionCreated = Notification.Name("transcriptionCreated")
static let enhancementToggleChanged = Notification.Name("enhancementToggleChanged")
+ static let openFileForTranscription = Notification.Name("openFileForTranscription")
}
diff --git a/VoiceInk/PowerMode/PowerModeConfigView.swift b/VoiceInk/PowerMode/PowerModeConfigView.swift
index a44566d..60fcb12 100644
--- a/VoiceInk/PowerMode/PowerModeConfigView.swift
+++ b/VoiceInk/PowerMode/PowerModeConfigView.swift
@@ -182,18 +182,16 @@ struct ConfigurationView: View {
}
// Default Power Mode Toggle
- if !powerModeManager.hasDefaultConfiguration() || isCurrentConfigDefault {
- HStack {
- Toggle("Set as default power mode", isOn: $isDefault)
- .font(.system(size: 14))
-
- InfoTip(
- title: "Default Power Mode",
- message: "Default power mode is used when no specific app or website matches are found"
- )
-
- Spacer()
- }
+ HStack {
+ Toggle("Set as default power mode", isOn: $isDefault)
+ .font(.system(size: 14))
+
+ InfoTip(
+ title: "Default Power Mode",
+ message: "Default power mode is used when no specific app or website matches are found"
+ )
+
+ Spacer()
}
}
.padding(.horizontal, 20)
diff --git a/VoiceInk/PowerMode/PowerModeViewComponents.swift b/VoiceInk/PowerMode/PowerModeViewComponents.swift
index 98222dc..3559734 100644
--- a/VoiceInk/PowerMode/PowerModeViewComponents.swift
+++ b/VoiceInk/PowerMode/PowerModeViewComponents.swift
@@ -203,7 +203,7 @@ struct ConfigurationRow: View {
.padding(.vertical, 12)
.padding(.horizontal, 14)
- if selectedModel != nil || selectedLanguage != nil || config.isAIEnhancementEnabled {
+ if selectedModel != nil || selectedLanguage != nil || config.isAIEnhancementEnabled || config.isAutoSendEnabled {
Divider()
.padding(.horizontal, 16)
@@ -259,6 +259,22 @@ struct ConfigurationRow: View {
)
}
+ if config.isAutoSendEnabled {
+ HStack(spacing: 4) {
+ Image(systemName: "keyboard")
+ .font(.system(size: 10))
+ Text("Auto Send")
+ .font(.caption)
+ }
+ .padding(.horizontal, 8)
+ .padding(.vertical, 4)
+ .background(Capsule()
+ .fill(Color(NSColor.controlBackgroundColor)))
+ .overlay(
+ Capsule()
+ .stroke(Color(NSColor.separatorColor), lineWidth: 0.5)
+ )
+ }
if config.isAIEnhancementEnabled {
if config.useScreenCapture {
HStack(spacing: 4) {
@@ -289,7 +305,7 @@ struct ConfigurationRow: View {
.fill(Color.accentColor.opacity(0.1)))
.foregroundColor(.accentColor)
}
-
+
Spacer()
}
.padding(.vertical, 10)
@@ -376,4 +392,4 @@ struct AppGridItem: View {
}
.buttonStyle(.plain)
}
-}
+}
diff --git a/VoiceInk/Recorder.swift b/VoiceInk/Recorder.swift
index 6cc1d01..b683f7b 100644
--- a/VoiceInk/Recorder.swift
+++ b/VoiceInk/Recorder.swift
@@ -76,15 +76,7 @@ class Recorder: NSObject, ObservableObject, AVAudioRecorderDelegate {
UserDefaults.standard.set(String(currentDeviceID), forKey: "lastUsedMicrophoneDeviceID")
hasDetectedAudioInCurrentSession = false
-
- // Coordinate media control and system audio sequentially for better reliability
- await playbackController.pauseMedia()
-
- // Small delay to allow media command to process before muting system audio
- try? await Task.sleep(nanoseconds: 100_000_000) // 100ms
-
- _ = await mediaController.muteSystemAudio()
-
+
let deviceID = deviceManager.getCurrentDevice()
if deviceID != 0 {
do {
@@ -114,6 +106,12 @@ class Recorder: NSObject, ObservableObject, AVAudioRecorderDelegate {
throw RecorderError.couldNotStartRecording
}
+ Task { [weak self] in
+ guard let self = self else { return }
+ await self.playbackController.pauseMedia()
+ _ = await self.mediaController.muteSystemAudio()
+ }
+
audioLevelCheckTask?.cancel()
audioMeterUpdateTask?.cancel()
diff --git a/VoiceInk/Services/AIEnhancementService.swift b/VoiceInk/Services/AIEnhancementService.swift
index 7ae13d8..a9d4c05 100644
--- a/VoiceInk/Services/AIEnhancementService.swift
+++ b/VoiceInk/Services/AIEnhancementService.swift
@@ -261,6 +261,8 @@ class AIEnhancementService: ObservableObject {
let filteredText = AIEnhancementOutputFilter.filter(enhancedText.trimmingCharacters(in: .whitespacesAndNewlines))
return filteredText
+ } else if httpResponse.statusCode == 429 {
+ throw EnhancementError.rateLimitExceeded
} else if (500...599).contains(httpResponse.statusCode) {
throw EnhancementError.serverError
} else {
@@ -316,6 +318,8 @@ class AIEnhancementService: ObservableObject {
let filteredText = AIEnhancementOutputFilter.filter(enhancedText.trimmingCharacters(in: .whitespacesAndNewlines))
return filteredText
+ } else if httpResponse.statusCode == 429 {
+ throw EnhancementError.rateLimitExceeded
} else if (500...599).contains(httpResponse.statusCode) {
throw EnhancementError.serverError
} else {
@@ -342,7 +346,7 @@ class AIEnhancementService: ObservableObject {
return try await makeRequest(text: text, mode: mode)
} catch let error as EnhancementError {
switch error {
- case .networkError, .serverError:
+ case .networkError, .serverError, .rateLimitExceeded:
retries += 1
if retries < maxRetries {
logger.warning("Request failed, retrying in \(currentDelay)s... (Attempt \(retries)/\(maxRetries))")
@@ -458,6 +462,7 @@ enum EnhancementError: Error {
case enhancementFailed
case networkError
case serverError
+ case rateLimitExceeded
case customError(String)
}
@@ -474,6 +479,8 @@ extension EnhancementError: LocalizedError {
return "Network connection failed. Check your internet."
case .serverError:
return "The AI provider's server encountered an error. Please try again later."
+ case .rateLimitExceeded:
+ return "Rate limit exceeded. Please try again later."
case .customError(let message):
return message
}
diff --git a/VoiceInk/Services/AIService.swift b/VoiceInk/Services/AIService.swift
index b6b6799..da94353 100644
--- a/VoiceInk/Services/AIService.swift
+++ b/VoiceInk/Services/AIService.swift
@@ -82,7 +82,7 @@ enum AIProvider: String, CaseIterable {
case .groq:
return [
"llama-3.3-70b-versatile",
- "moonshotai/kimi-k2-instruct",
+ "moonshotai/kimi-k2-instruct-0905",
"qwen/qwen3-32b",
"meta-llama/llama-4-maverick-17b-128e-instruct",
"openai/gpt-oss-120b"
diff --git a/VoiceInk/Services/DictionaryContextService.swift b/VoiceInk/Services/DictionaryContextService.swift
index 6e51b9a..9086112 100644
--- a/VoiceInk/Services/DictionaryContextService.swift
+++ b/VoiceInk/Services/DictionaryContextService.swift
@@ -6,7 +6,7 @@ class DictionaryContextService {
private init() {}
- private let predefinedWords = "VoiceInk, chatGPT, GPT-4o, GPT-5-mini, Kimi-K2, GLM V4.5, Claude, Claude 4 sonnet, Claude opus, ultrathink, Vibe-coding, groq, cerebras, gpt-oss-120B, Wispr flow, deepseek, gemini-2.5, Veo 3, elevenlabs, Kyutai"
+ private let predefinedWords = "VoiceInk, chatGPT, GPT-4o, GPT-5-mini, Kimi-K2, GLM V4.5, Claude, Claude 4 sonnet, Claude opus, ultrathink, Vibe-coding, groq, cerebras, gpt-oss-120B, deepseek, gemini-2.5, Veo 3, elevenlabs, Kyutai"
func getDictionaryContext() -> String {
var allWords: [String] = []
diff --git a/VoiceInk/Services/ImportExportService.swift b/VoiceInk/Services/ImportExportService.swift
index 2b08313..1578ffc 100644
--- a/VoiceInk/Services/ImportExportService.swift
+++ b/VoiceInk/Services/ImportExportService.swift
@@ -7,6 +7,7 @@ import LaunchAtLogin
struct GeneralSettings: Codable {
let toggleMiniRecorderShortcut: KeyboardShortcuts.Shortcut?
let toggleMiniRecorderShortcut2: KeyboardShortcuts.Shortcut?
+ let retryLastTranscriptionShortcut: KeyboardShortcuts.Shortcut?
let selectedHotkey1RawValue: String?
let selectedHotkey2RawValue: String?
let launchAtLoginEnabled: Bool?
@@ -86,6 +87,7 @@ class ImportExportService {
let generalSettingsToExport = GeneralSettings(
toggleMiniRecorderShortcut: KeyboardShortcuts.getShortcut(for: .toggleMiniRecorder),
toggleMiniRecorderShortcut2: KeyboardShortcuts.getShortcut(for: .toggleMiniRecorder2),
+ retryLastTranscriptionShortcut: KeyboardShortcuts.getShortcut(for: .retryLastTranscription),
selectedHotkey1RawValue: hotkeyManager.selectedHotkey1.rawValue,
selectedHotkey2RawValue: hotkeyManager.selectedHotkey2.rawValue,
launchAtLoginEnabled: LaunchAtLogin.isEnabled,
@@ -218,6 +220,9 @@ class ImportExportService {
if let shortcut2 = general.toggleMiniRecorderShortcut2 {
KeyboardShortcuts.setShortcut(shortcut2, for: .toggleMiniRecorder2)
}
+ if let retryShortcut = general.retryLastTranscriptionShortcut {
+ KeyboardShortcuts.setShortcut(retryShortcut, for: .retryLastTranscription)
+ }
if let hotkeyRaw = general.selectedHotkey1RawValue,
let hotkey = HotkeyManager.HotkeyOption(rawValue: hotkeyRaw) {
hotkeyManager.selectedHotkey1 = hotkey
diff --git a/VoiceInk/Services/ParakeetTranscriptionService.swift b/VoiceInk/Services/ParakeetTranscriptionService.swift
index f3e4a09..cd0d315 100644
--- a/VoiceInk/Services/ParakeetTranscriptionService.swift
+++ b/VoiceInk/Services/ParakeetTranscriptionService.swift
@@ -71,13 +71,60 @@ class ParakeetTranscriptionService: TranscriptionService {
let audioSamples = try readAudioSamples(from: audioURL)
- // Validate audio data before transcription
- guard audioSamples.count >= 16000 else {
- logger.notice("🦜 Audio too short for transcription: \(audioSamples.count) samples")
+ // Validate audio data before VAD
+ guard !audioSamples.isEmpty else {
+ logger.notice("🦜 Audio is empty, skipping transcription.")
+ throw ASRError.invalidAudioData
+ }
+
+ // Use VAD to get speech segments
+ var speechAudio: [Float] = []
+ let isVADEnabled = UserDefaults.standard.object(forKey: "IsVADEnabled") as? Bool ?? true
+
+ if isVADEnabled {
+ if let modelPath = await VADModelManager.shared.getModelPath() {
+ if let vad = VoiceActivityDetector(modelPath: modelPath) {
+ let speechSegments = vad.process(audioSamples: audioSamples)
+ logger.notice("🦜 VAD detected \(speechSegments.count) speech segments.")
+
+ let sampleRate = 16000 // Assuming 16kHz sample rate
+ for segment in speechSegments {
+ let startSample = Int(segment.start * Double(sampleRate))
+ var endSample = Int(segment.end * Double(sampleRate))
+
+ // Cap endSample to the audio buffer size
+ if endSample > audioSamples.count {
+ endSample = audioSamples.count
+ }
+
+ if startSample < endSample {
+ speechAudio.append(contentsOf: audioSamples[startSample..<endSample])
+ }
+ }
+ } else {
+ speechAudio = audioSamples
+ }
+ } else {
+ speechAudio = audioSamples
+ }
+ } else {
+ speechAudio = audioSamples
+ }
+
+ guard speechAudio.count >= 16000 else {
+ logger.notice("🦜 Audio too short for transcription after VAD: \(speechAudio.count) samples")
throw ASRError.invalidAudioData
}
- let result = try await asrManager.transcribe(audioSamples)
+ let result = try await asrManager.transcribe(speechAudio)
+ print(result.text)
// Reset decoder state and cleanup after transcription to avoid blocking the transcription start
Task {
@@ -91,10 +138,16 @@ class ParakeetTranscriptionService: TranscriptionService {
logger.notice("🦜 Warning: Empty transcription result for \(audioSamples.count) samples - possible vocabulary issue")
}
+ var text = result.text
+
if UserDefaults.standard.object(forKey: "IsTextFormattingEnabled") as? Bool ?? true {
- return WhisperTextFormatter.format(result.text)
+ text = WhisperTextFormatter.format(text)
}
- return result.text
+
+ // Apply hallucination and filler word filtering
+ text = WhisperHallucinationFilter.filter(text)
+
+ return text
}
private func readAudioSamples(from url: URL) throws -> [Float] {
diff --git a/VoiceInk/Services/SupportedMedia.swift b/VoiceInk/Services/SupportedMedia.swift
new file mode 100644
index 0000000..c66c452
--- /dev/null
+++ b/VoiceInk/Services/SupportedMedia.swift
@@ -0,0 +1,28 @@
+import Foundation
+import UniformTypeIdentifiers
+
+struct SupportedMedia {
+ static let extensions: Set = [
+ "wav", "mp3", "m4a", "aiff", "mp4", "mov", "aac", "flac", "caf"
+ ]
+
+ static let contentTypes: [UTType] = [
+ .audio, .movie
+ ]
+
+ static func isSupported(url: URL) -> Bool {
+ let fileExtension = url.pathExtension.lowercased()
+ if !fileExtension.isEmpty, extensions.contains(fileExtension) {
+ return true
+ }
+
+ if let resourceValues = try? url.resourceValues(forKeys: [.contentTypeKey]),
+ let contentType = resourceValues.contentType {
+ return contentTypes.contains(where: { contentType.conforms(to: $0) })
+ }
+
+ return false
+ }
+}
+
+
diff --git a/VoiceInk/Services/VoiceActivityDetector.swift b/VoiceInk/Services/VoiceActivityDetector.swift
new file mode 100644
index 0000000..67cd0f3
--- /dev/null
+++ b/VoiceInk/Services/VoiceActivityDetector.swift
@@ -0,0 +1,88 @@
+import Foundation
+import AVFoundation
+import os.log
+#if canImport(whisper)
+import whisper
+#else
+#error("Unable to import whisper module. Please check your project configuration.")
+#endif
+
+// MARK: - C API Bridge
+
+// Opaque pointers for the C contexts
+fileprivate typealias WhisperVADContext = OpaquePointer
+fileprivate typealias WhisperVADSegments = OpaquePointer
+
+
+// MARK: - VoiceActivityDetector Class
+
+class VoiceActivityDetector {
+ private var vadContext: WhisperVADContext
+ private let logger = Logger(subsystem: "com.voiceink.app", category: "VoiceActivityDetector")
+
+ init?(modelPath: String) {
+ var contextParams = whisper_vad_default_context_params()
+ contextParams.n_threads = max(1, min(8, Int32(ProcessInfo.processInfo.processorCount) - 2))
+
+ let contextOpt: WhisperVADContext? = modelPath.withCString { cPath in
+ whisper_vad_init_from_file_with_params(cPath, contextParams)
+ }
+
+ guard let context = contextOpt else {
+ logger.error("Failed to initialize VAD context.")
+ return nil
+ }
+ self.vadContext = context
+ logger.notice("VAD context initialized successfully.")
+ }
+
+ deinit {
+ whisper_vad_free(vadContext)
+ logger.notice("VAD context freed.")
+ }
+
+ /// Processes audio samples to detect speech segments and returns an array of (start: TimeInterval, end: TimeInterval) tuples.
+ func process(audioSamples: [Float]) -> [(start: TimeInterval, end: TimeInterval)] {
+ // 1. Detect speech and get probabilities internally in the context
+ let success = audioSamples.withUnsafeBufferPointer { buffer in
+ whisper_vad_detect_speech(vadContext, buffer.baseAddress!, Int32(audioSamples.count))
+ }
+
+ guard success else {
+ logger.error("Failed to detect speech probabilities.")
+ return []
+ }
+
+ // 2. Get segments from probabilities
+ var vadParams = whisper_vad_default_params()
+ vadParams.threshold = 0.45
+ vadParams.min_speech_duration_ms = 150
+ vadParams.min_silence_duration_ms = 750
+ vadParams.max_speech_duration_s = Float.greatestFiniteMagnitude // Use the largest representable Float value for no max duration
+ vadParams.speech_pad_ms = 100
+ vadParams.samples_overlap = 0.1 // Add samples_overlap parameter
+
+ guard let segments = whisper_vad_segments_from_probs(vadContext, vadParams) else {
+ logger.error("Failed to get VAD segments from probabilities.")
+ return []
+ }
+ defer {
+ // Ensure segments are freed
+ whisper_vad_free_segments(segments)
+ }
+
+ let nSegments = whisper_vad_segments_n_segments(segments)
+ logger.notice("Detected \(nSegments) speech segments.")
+
+ var speechSegments: [(start: TimeInterval, end: TimeInterval)] = []
+ for i in 0..<nSegments {
+ let start = TimeInterval(whisper_vad_segments_get_segment_t0(segments, i)) / 100.0
+ let end = TimeInterval(whisper_vad_segments_get_segment_t1(segments, i)) / 100.0
+ speechSegments.append((start: start, end: end))
+ }
+
+ return speechSegments
+ }
+}
HStack(spacing: 12) {
VStack(alignment: .leading, spacing: 4) {
Text("Original:")
@@ -280,6 +286,34 @@ struct AddReplacementSheet: View {
.font(.callout)
}
}
+ .frame(maxWidth: .infinity, alignment: .leading)
+ .padding(12)
+ .background(Color(.textBackgroundColor))
+ .cornerRadius(8)
+
+ // Comma-separated originals -> single replacement
+ HStack(spacing: 12) {
+ VStack(alignment: .leading, spacing: 4) {
+ Text("Original:")
+ .font(.caption)
+ .foregroundColor(.secondary)
+ Text("Voicing, Voice ink, Voiceing")
+ .font(.callout)
+ }
+
+ Image(systemName: "arrow.right")
+ .font(.caption)
+ .foregroundColor(.secondary)
+
+ VStack(alignment: .leading, spacing: 4) {
+ Text("Replacement:")
+ .font(.caption)
+ .foregroundColor(.secondary)
+ Text("VoiceInk")
+ .font(.callout)
+ }
+ }
+ .frame(maxWidth: .infinity, alignment: .leading)
.padding(12)
.background(Color(.textBackgroundColor))
.cornerRadius(8)
@@ -290,14 +324,19 @@ struct AddReplacementSheet: View {
.padding(.vertical)
}
}
- .frame(width: 460, height: 480)
+ .frame(width: 460, height: 520)
}
private func addReplacement() {
let original = originalWord
let replacement = replacementWord
- guard !original.isEmpty && !replacement.isEmpty else { return }
+ // Validate that at least one non-empty token exists
+ let tokens = original
+ .split(separator: ",")
+ .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
+ .filter { !$0.isEmpty }
+ guard !tokens.isEmpty && !replacement.isEmpty else { return }
manager.addReplacement(original: original, replacement: replacement)
dismiss()
diff --git a/VoiceInk/Views/EnhancementSettingsView.swift b/VoiceInk/Views/EnhancementSettingsView.swift
index 714d677..4026b5e 100644
--- a/VoiceInk/Views/EnhancementSettingsView.swift
+++ b/VoiceInk/Views/EnhancementSettingsView.swift
@@ -1,4 +1,5 @@
import SwiftUI
+import UniformTypeIdentifiers
struct EnhancementSettingsView: View {
@EnvironmentObject private var enhancementService: AIEnhancementService
@@ -79,25 +80,22 @@ struct EnhancementSettingsView: View {
Text("Enhancement Prompt")
.font(.headline)
- // Prompts Section
- VStack(alignment: .leading, spacing: 12) {
- PromptSelectionGrid(
- prompts: enhancementService.allPrompts,
- selectedPromptId: enhancementService.selectedPromptId,
- onPromptSelected: { prompt in
- enhancementService.setActivePrompt(prompt)
- },
- onEditPrompt: { prompt in
- selectedPromptForEdit = prompt
- },
- onDeletePrompt: { prompt in
- enhancementService.deletePrompt(prompt)
- },
- onAddNewPrompt: {
- isEditingPrompt = true
- }
- )
- }
+ // Reorderable prompts grid with drag-and-drop
+ ReorderablePromptGrid(
+ selectedPromptId: enhancementService.selectedPromptId,
+ onPromptSelected: { prompt in
+ enhancementService.setActivePrompt(prompt)
+ },
+ onEditPrompt: { prompt in
+ selectedPromptForEdit = prompt
+ },
+ onDeletePrompt: { prompt in
+ enhancementService.deletePrompt(prompt)
+ },
+ onAddNewPrompt: {
+ isEditingPrompt = true
+ }
+ )
}
.padding()
.background(CardBackground(isSelected: false))
@@ -115,3 +113,151 @@ struct EnhancementSettingsView: View {
}
}
}
+
+// MARK: - Drag & Drop Reorderable Grid
+private struct ReorderablePromptGrid: View {
+ @EnvironmentObject private var enhancementService: AIEnhancementService
+
+ let selectedPromptId: UUID?
+ let onPromptSelected: (CustomPrompt) -> Void
+ let onEditPrompt: ((CustomPrompt) -> Void)?
+ let onDeletePrompt: ((CustomPrompt) -> Void)?
+ let onAddNewPrompt: (() -> Void)?
+
+ @State private var draggingItem: CustomPrompt?
+
+ var body: some View {
+ VStack(alignment: .leading, spacing: 12) {
+ if enhancementService.customPrompts.isEmpty {
+ Text("No prompts available")
+ .foregroundColor(.secondary)
+ .font(.caption)
+ } else {
+ let columns = [
+ GridItem(.adaptive(minimum: 80, maximum: 100), spacing: 36)
+ ]
+
+ LazyVGrid(columns: columns, spacing: 16) {
+ ForEach(enhancementService.customPrompts) { prompt in
+ prompt.promptIcon(
+ isSelected: selectedPromptId == prompt.id,
+ onTap: {
+ withAnimation(.spring(response: 0.3, dampingFraction: 0.7)) {
+ onPromptSelected(prompt)
+ }
+ },
+ onEdit: onEditPrompt,
+ onDelete: onDeletePrompt
+ )
+ .opacity(draggingItem?.id == prompt.id ? 0.3 : 1.0)
+ .scaleEffect(draggingItem?.id == prompt.id ? 1.05 : 1.0)
+ .overlay(
+ RoundedRectangle(cornerRadius: 14)
+ .stroke(
+ draggingItem != nil && draggingItem?.id != prompt.id
+ ? Color.accentColor.opacity(0.25)
+ : Color.clear,
+ lineWidth: 1
+ )
+ )
+ .animation(.easeInOut(duration: 0.15), value: draggingItem?.id == prompt.id)
+ .onDrag {
+ draggingItem = prompt
+ return NSItemProvider(object: prompt.id.uuidString as NSString)
+ }
+ .onDrop(
+ of: [UTType.text],
+ delegate: PromptDropDelegate(
+ item: prompt,
+ prompts: $enhancementService.customPrompts,
+ draggingItem: $draggingItem
+ )
+ )
+ }
+
+ if let onAddNewPrompt = onAddNewPrompt {
+ CustomPrompt.addNewButton {
+ onAddNewPrompt()
+ }
+ .help("Add new prompt")
+ .onDrop(
+ of: [UTType.text],
+ delegate: PromptEndDropDelegate(
+ prompts: $enhancementService.customPrompts,
+ draggingItem: $draggingItem
+ )
+ )
+ }
+ }
+ .padding(.vertical, 12)
+ .padding(.horizontal, 16)
+
+ HStack {
+ Image(systemName: "info.circle")
+ .font(.caption)
+ .foregroundColor(.secondary)
+
+ Text("Double-click to edit โข Right-click for more options")
+ .font(.caption)
+ .foregroundColor(.secondary)
+ }
+ .padding(.top, 8)
+ .padding(.horizontal, 16)
+ }
+ }
+ }
+}
+
+// MARK: - Drop Delegates
+private struct PromptDropDelegate: DropDelegate {
+ let item: CustomPrompt
+ @Binding var prompts: [CustomPrompt]
+ @Binding var draggingItem: CustomPrompt?
+
+ func dropEntered(info: DropInfo) {
+ guard let draggingItem = draggingItem, draggingItem != item else { return }
+ guard let fromIndex = prompts.firstIndex(of: draggingItem),
+ let toIndex = prompts.firstIndex(of: item) else { return }
+
+ // Move item as you hover for immediate visual update
+ if prompts[toIndex].id != draggingItem.id {
+ withAnimation(.easeInOut(duration: 0.12)) {
+ let from = fromIndex
+ let to = toIndex
+ prompts.move(fromOffsets: IndexSet(integer: from), toOffset: to > from ? to + 1 : to)
+ }
+ }
+ }
+
+ func dropUpdated(info: DropInfo) -> DropProposal? {
+ DropProposal(operation: .move)
+ }
+
+ func performDrop(info: DropInfo) -> Bool {
+ draggingItem = nil
+ return true
+ }
+}
+
+private struct PromptEndDropDelegate: DropDelegate {
+ @Binding var prompts: [CustomPrompt]
+ @Binding var draggingItem: CustomPrompt?
+
+ func validateDrop(info: DropInfo) -> Bool { true }
+ func dropUpdated(info: DropInfo) -> DropProposal? { DropProposal(operation: .move) }
+
+ func performDrop(info: DropInfo) -> Bool {
+ guard let draggingItem = draggingItem,
+ let currentIndex = prompts.firstIndex(of: draggingItem) else {
+ self.draggingItem = nil
+ return false
+ }
+
+ // Move to end if dropped on the trailing "Add New" tile
+ withAnimation(.easeInOut(duration: 0.12)) {
+ prompts.move(fromOffsets: IndexSet(integer: currentIndex), toOffset: prompts.endIndex)
+ }
+ self.draggingItem = nil
+ return true
+ }
+}
diff --git a/VoiceInk/Views/ModelSettingsView.swift b/VoiceInk/Views/ModelSettingsView.swift
index 0af2337..5a1a06d 100644
--- a/VoiceInk/Views/ModelSettingsView.swift
+++ b/VoiceInk/Views/ModelSettingsView.swift
@@ -99,7 +99,7 @@ struct ModelSettingsView: View {
InfoTip(
title: "Voice Activity Detection",
- message: "Detects speech segments and filters out silence to reduce hallucinations in local Whisper models."
+ message: "Detect speech segments and filter out silence to improve accuracy of local models."
)
}
diff --git a/VoiceInk/Views/Settings/ExperimentalFeaturesSection.swift b/VoiceInk/Views/Settings/ExperimentalFeaturesSection.swift
index 900af55..b0479b9 100644
--- a/VoiceInk/Views/Settings/ExperimentalFeaturesSection.swift
+++ b/VoiceInk/Views/Settings/ExperimentalFeaturesSection.swift
@@ -37,7 +37,7 @@ struct ExperimentalFeaturesSection: View {
if isExperimentalFeaturesEnabled {
Toggle(isOn: $playbackController.isPauseMediaEnabled) {
- Text("Pause Media on Playback")
+ Text("Pause Media during recording")
}
.toggleStyle(.switch)
.help("Automatically pause active media playback during recordings and resume afterward.")
diff --git a/VoiceInk/Views/Settings/SettingsView.swift b/VoiceInk/Views/Settings/SettingsView.swift
index 211218c..54c219f 100644
--- a/VoiceInk/Views/Settings/SettingsView.swift
+++ b/VoiceInk/Views/Settings/SettingsView.swift
@@ -130,6 +130,8 @@ struct SettingsView: View {
Divider()
+
+
// Custom Cancel Shortcut
VStack(alignment: .leading, spacing: 12) {
HStack(spacing: 8) {
diff --git a/VoiceInk/VoiceInk.swift b/VoiceInk/VoiceInk.swift
index 549a500..af22cfd 100644
--- a/VoiceInk/VoiceInk.swift
+++ b/VoiceInk/VoiceInk.swift
@@ -114,6 +114,15 @@ struct VoiceInkApp: App {
if !UserDefaults.standard.bool(forKey: "IsTranscriptionCleanupEnabled") {
audioCleanupManager.startAutomaticCleanup(modelContext: container.mainContext)
}
+
+ // Process any pending open-file request now that the main ContentView is ready.
+ if let pendingURL = appDelegate.pendingOpenFileURL {
+ NotificationCenter.default.post(name: .navigateToDestination, object: nil, userInfo: ["destination": "Transcribe Audio"])
+ DispatchQueue.main.asyncAfter(deadline: .now() + 0.3) {
+ NotificationCenter.default.post(name: .openFileForTranscription, object: nil, userInfo: ["url": pendingURL])
+ }
+ appDelegate.pendingOpenFileURL = nil
+ }
}
.background(WindowAccessor { window in
WindowManager.shared.configureWindow(window)
diff --git a/VoiceInk/Whisper/WhisperHallucinationFilter.swift b/VoiceInk/Whisper/WhisperHallucinationFilter.swift
index f95d59d..bea80b3 100644
--- a/VoiceInk/Whisper/WhisperHallucinationFilter.swift
+++ b/VoiceInk/Whisper/WhisperHallucinationFilter.swift
@@ -4,49 +4,48 @@ import os
struct WhisperHallucinationFilter {
private static let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "WhisperHallucinationFilter")
- // Pattern-based approach for detecting hallucinations - focusing on format indicators
private static let hallucinationPatterns = [
- // Text in various types of brackets - the most reliable hallucination indicators
- #"\[.*?\]"#, // [Text in square brackets]
- #"\(.*?\)"#, // (Text in parentheses)
- #"\{.*?\}"#, // {Text in curly braces}
- #"<.*?>"#, //
-
- // Text with special formatting
- #"\*.*?\*"#, // *Text with asterisks*
- #"_.*?_"#, // _Text with underscores_
-
- // Time indicators often added by Whisper
- #"(?i)\d{1,2}:\d{2}(:\d{2})?\s*-\s*\d{1,2}:\d{2}(:\d{2})?"# // 00:00 - 00:00 format
+ #"\[.*?\]"#, // Square brackets
+ #"\(.*?\)"#, // Parentheses
+ #"\{.*?\}"# // Curly braces
+ ]
+
+ private static let fillerWords = [
+ "uh", "um", "uhm", "umm", "uhh", "uhhh", "er", "ah", "eh",
+ "hmm", "hm", "h", "m", "mmm", "mm", "mh", "ha", "ehh"
]
-
- /// Removes hallucinations from transcription text using pattern matching
- /// - Parameter text: Original transcription text from Whisper
- /// - Returns: Filtered text with hallucinations removed
static func filter(_ text: String) -> String {
- logger.notice("🧹 Applying pattern-based hallucination filter to transcription")
-
+ logger.notice("🧹 Filtering hallucinations and filler words")
var filteredText = text
-
- // Remove pattern-based hallucinations
+
+ // Remove bracketed hallucinations
for pattern in hallucinationPatterns {
if let regex = try? NSRegularExpression(pattern: pattern) {
let range = NSRange(filteredText.startIndex..., in: filteredText)
filteredText = regex.stringByReplacingMatches(in: filteredText, options: [], range: range, withTemplate: "")
}
}
-
- // Clean up extra whitespace and newlines that might be left after removing hallucinations
+
+ // Remove filler words
+ for fillerWord in fillerWords {
+ let pattern = "\\b\(NSRegularExpression.escapedPattern(for: fillerWord))\\b[,.]?"
+ if let regex = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) {
+ let range = NSRange(filteredText.startIndex..., in: filteredText)
+ filteredText = regex.stringByReplacingMatches(in: filteredText, options: [], range: range, withTemplate: "")
+ }
+ }
+
+ // Clean whitespace
filteredText = filteredText.replacingOccurrences(of: #"\s{2,}"#, with: " ", options: .regularExpression)
filteredText = filteredText.trimmingCharacters(in: .whitespacesAndNewlines)
-
- // Add logging to track effectiveness
+
+ // Log results
if filteredText != text {
- logger.notice("✅ Removed hallucinations using pattern matching")
+ logger.notice("✅ Removed hallucinations and filler words")
} else {
- logger.notice("✅ No hallucinations detected with pattern matching")
+ logger.notice("✅ No hallucinations or filler words found")
}
-
+
return filteredText
}
}
\ No newline at end of file