Merge branch 'main' into feat/paste-last-enhancement

This commit is contained in:
Alexey Haidamaka 2025-09-12 21:49:17 +02:00 committed by GitHub
commit 8293bc27b7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
27 changed files with 594 additions and 155 deletions

View File

@ -459,7 +459,7 @@
"CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development";
CODE_SIGN_STYLE = Automatic;
COMBINE_HIDPI_IMAGES = YES;
CURRENT_PROJECT_VERSION = 152;
CURRENT_PROJECT_VERSION = 153;
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
DEVELOPMENT_TEAM = V6J6A3VWY2;
ENABLE_HARDENED_RUNTIME = YES;
@ -474,7 +474,7 @@
"@executable_path/../Frameworks",
);
MACOSX_DEPLOYMENT_TARGET = 14.0;
MARKETING_VERSION = 1.52;
MARKETING_VERSION = 1.53;
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";
@ -493,7 +493,7 @@
"CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development";
CODE_SIGN_STYLE = Automatic;
COMBINE_HIDPI_IMAGES = YES;
CURRENT_PROJECT_VERSION = 152;
CURRENT_PROJECT_VERSION = 153;
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
DEVELOPMENT_TEAM = V6J6A3VWY2;
ENABLE_HARDENED_RUNTIME = YES;
@ -508,7 +508,7 @@
"@executable_path/../Frameworks",
);
MACOSX_DEPLOYMENT_TARGET = 14.0;
MARKETING_VERSION = 1.52;
MARKETING_VERSION = 1.53;
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";

View File

@ -7,7 +7,7 @@
"location" : "https://github.com/FluidInference/FluidAudio",
"state" : {
"branch" : "main",
"revision" : "abf7d9ef3f53a693e3721069071971eff84c002f"
"revision" : "052cbb27cf073a9407251d74ef3459ea258e41b3"
}
},
{

View File

@ -1,5 +1,6 @@
import Cocoa
import SwiftUI
import UniformTypeIdentifiers
class AppDelegate: NSObject, NSApplicationDelegate {
func applicationDidFinishLaunching(_ notification: Notification) {
@ -49,4 +50,28 @@ class AppDelegate: NSObject, NSApplicationDelegate {
defaults.removeObject(forKey: "defaultPowerModeConfigV2")
defaults.removeObject(forKey: "isPowerModeEnabled")
}
    // Stash URL when app cold-starts to avoid spawning a new window/tab.
    // Consumed later by the SwiftUI layer once the WindowGroup's ContentView exists
    // (assumes ContentView reads this on appear — TODO confirm against ContentView).
    var pendingOpenFileURL: URL?

    /// Handles files opened from Finder / "Open With".
    /// Only the FIRST URL that `SupportedMedia.isSupported(url:)` accepts is used;
    /// any additional URLs in the same open request are ignored.
    func application(_ application: NSApplication, open urls: [URL]) {
        guard let url = urls.first(where: { SupportedMedia.isSupported(url: $0) }) else {
            return
        }
        NSApp.activate(ignoringOtherApps: true)
        if NSApp.windows.isEmpty {
            // Cold start: do NOT create a window here to avoid extra window/tab.
            // Defer to SwiftUI's WindowGroup-created ContentView and let it process this later.
            pendingOpenFileURL = url
        } else {
            // Running: focus current window and route in-place to Transcribe Audio.
            NSApp.windows.first?.makeKeyAndOrderFront(nil)
            NotificationCenter.default.post(name: .navigateToDestination, object: nil, userInfo: ["destination": "Transcribe Audio"])
            // Post the file-open on the next runloop turn so the navigation above is
            // delivered first and the Transcribe Audio view is mounted to receive it.
            DispatchQueue.main.async {
                NotificationCenter.default.post(name: .openFileForTranscription, object: nil, userInfo: ["url": url])
            }
        }
    }
}

View File

@ -154,13 +154,6 @@ class HotkeyManager: ObservableObject {
}
}
KeyboardShortcuts.onKeyUp(for: .pasteLastEnhancement) { [weak self] in
guard let self = self else { return }
Task { @MainActor in
LastTranscriptionService.pasteLastEnhancement(from: self.whisperState.modelContext)
}
}
KeyboardShortcuts.onKeyUp(for: .retryLastTranscription) { [weak self] in
guard let self = self else { return }
Task { @MainActor in
@ -442,4 +435,3 @@ class HotkeyManager: ObservableObject {
}
}
}

View File

@ -18,5 +18,33 @@
<string>VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations.</string>
<key>NSScreenCaptureUsageDescription</key>
<string>VoiceInk needs screen recording access to understand context from your screen for improved transcription accuracy.</string>
<key>CFBundleDocumentTypes</key>
<array>
<dict>
<key>CFBundleTypeName</key>
<string>Audio/Video File</string>
<key>CFBundleTypeRole</key>
<string>Viewer</string>
<key>LSHandlerRank</key>
<string>Alternate</string>
<key>LSItemContentTypes</key>
<array>
<string>public.audio</string>
<string>public.movie</string>
</array>
<key>CFBundleTypeExtensions</key>
<array>
<string>wav</string>
<string>mp3</string>
<string>m4a</string>
<string>aiff</string>
<string>mp4</string>
<string>mov</string>
<string>aac</string>
<string>flac</string>
<string>caf</string>
</array>
</dict>
</array>
</dict>
</plist>

View File

@ -14,4 +14,5 @@ extension Notification.Name {
static let powerModeConfigurationApplied = Notification.Name("powerModeConfigurationApplied")
static let transcriptionCreated = Notification.Name("transcriptionCreated")
static let enhancementToggleChanged = Notification.Name("enhancementToggleChanged")
static let openFileForTranscription = Notification.Name("openFileForTranscription")
}

View File

@ -182,18 +182,16 @@ struct ConfigurationView: View {
}
// Default Power Mode Toggle
if !powerModeManager.hasDefaultConfiguration() || isCurrentConfigDefault {
HStack {
Toggle("Set as default power mode", isOn: $isDefault)
.font(.system(size: 14))
InfoTip(
title: "Default Power Mode",
message: "Default power mode is used when no specific app or website matches are found"
)
Spacer()
}
HStack {
Toggle("Set as default power mode", isOn: $isDefault)
.font(.system(size: 14))
InfoTip(
title: "Default Power Mode",
message: "Default power mode is used when no specific app or website matches are found"
)
Spacer()
}
}
.padding(.horizontal, 20)

View File

@ -203,7 +203,7 @@ struct ConfigurationRow: View {
.padding(.vertical, 12)
.padding(.horizontal, 14)
if selectedModel != nil || selectedLanguage != nil || config.isAIEnhancementEnabled {
if selectedModel != nil || selectedLanguage != nil || config.isAIEnhancementEnabled || config.isAutoSendEnabled {
Divider()
.padding(.horizontal, 16)
@ -259,6 +259,22 @@ struct ConfigurationRow: View {
)
}
if config.isAutoSendEnabled {
HStack(spacing: 4) {
Image(systemName: "keyboard")
.font(.system(size: 10))
Text("Auto Send")
.font(.caption)
}
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(Capsule()
.fill(Color(NSColor.controlBackgroundColor)))
.overlay(
Capsule()
.stroke(Color(NSColor.separatorColor), lineWidth: 0.5)
)
}
if config.isAIEnhancementEnabled {
if config.useScreenCapture {
HStack(spacing: 4) {
@ -289,7 +305,7 @@ struct ConfigurationRow: View {
.fill(Color.accentColor.opacity(0.1)))
.foregroundColor(.accentColor)
}
Spacer()
}
.padding(.vertical, 10)
@ -376,4 +392,4 @@ struct AppGridItem: View {
}
.buttonStyle(.plain)
}
}
}

View File

@ -76,15 +76,7 @@ class Recorder: NSObject, ObservableObject, AVAudioRecorderDelegate {
UserDefaults.standard.set(String(currentDeviceID), forKey: "lastUsedMicrophoneDeviceID")
hasDetectedAudioInCurrentSession = false
// Coordinate media control and system audio sequentially for better reliability
await playbackController.pauseMedia()
// Small delay to allow media command to process before muting system audio
try? await Task.sleep(nanoseconds: 100_000_000) // 100ms
_ = await mediaController.muteSystemAudio()
let deviceID = deviceManager.getCurrentDevice()
if deviceID != 0 {
do {
@ -114,6 +106,12 @@ class Recorder: NSObject, ObservableObject, AVAudioRecorderDelegate {
throw RecorderError.couldNotStartRecording
}
Task { [weak self] in
guard let self = self else { return }
await self.playbackController.pauseMedia()
_ = await self.mediaController.muteSystemAudio()
}
audioLevelCheckTask?.cancel()
audioMeterUpdateTask?.cancel()

View File

@ -261,6 +261,8 @@ class AIEnhancementService: ObservableObject {
let filteredText = AIEnhancementOutputFilter.filter(enhancedText.trimmingCharacters(in: .whitespacesAndNewlines))
return filteredText
} else if httpResponse.statusCode == 429 {
throw EnhancementError.rateLimitExceeded
} else if (500...599).contains(httpResponse.statusCode) {
throw EnhancementError.serverError
} else {
@ -316,6 +318,8 @@ class AIEnhancementService: ObservableObject {
let filteredText = AIEnhancementOutputFilter.filter(enhancedText.trimmingCharacters(in: .whitespacesAndNewlines))
return filteredText
} else if httpResponse.statusCode == 429 {
throw EnhancementError.rateLimitExceeded
} else if (500...599).contains(httpResponse.statusCode) {
throw EnhancementError.serverError
} else {
@ -342,7 +346,7 @@ class AIEnhancementService: ObservableObject {
return try await makeRequest(text: text, mode: mode)
} catch let error as EnhancementError {
switch error {
case .networkError, .serverError:
case .networkError, .serverError, .rateLimitExceeded:
retries += 1
if retries < maxRetries {
logger.warning("Request failed, retrying in \(currentDelay)s... (Attempt \(retries)/\(maxRetries))")
@ -458,6 +462,7 @@ enum EnhancementError: Error {
case enhancementFailed
case networkError
case serverError
case rateLimitExceeded
case customError(String)
}
@ -474,6 +479,8 @@ extension EnhancementError: LocalizedError {
return "Network connection failed. Check your internet."
case .serverError:
return "The AI provider's server encountered an error. Please try again later."
case .rateLimitExceeded:
return "Rate limit exceeded. Please try again later."
case .customError(let message):
return message
}

View File

@ -82,7 +82,7 @@ enum AIProvider: String, CaseIterable {
case .groq:
return [
"llama-3.3-70b-versatile",
"moonshotai/kimi-k2-instruct",
"moonshotai/kimi-k2-instruct-0905",
"qwen/qwen3-32b",
"meta-llama/llama-4-maverick-17b-128e-instruct",
"openai/gpt-oss-120b"

View File

@ -6,7 +6,7 @@ class DictionaryContextService {
private init() {}
private let predefinedWords = "VoiceInk, chatGPT, GPT-4o, GPT-5-mini, Kimi-K2, GLM V4.5, Claude, Claude 4 sonnet, Claude opus, ultrathink, Vibe-coding, groq, cerebras, gpt-oss-120B, Wispr flow, deepseek, gemini-2.5, Veo 3, elevenlabs, Kyutai"
private let predefinedWords = "VoiceInk, chatGPT, GPT-4o, GPT-5-mini, Kimi-K2, GLM V4.5, Claude, Claude 4 sonnet, Claude opus, ultrathink, Vibe-coding, groq, cerebras, gpt-oss-120B, deepseek, gemini-2.5, Veo 3, elevenlabs, Kyutai"
func getDictionaryContext() -> String {
var allWords: [String] = []

View File

@ -7,6 +7,7 @@ import LaunchAtLogin
struct GeneralSettings: Codable {
let toggleMiniRecorderShortcut: KeyboardShortcuts.Shortcut?
let toggleMiniRecorderShortcut2: KeyboardShortcuts.Shortcut?
let retryLastTranscriptionShortcut: KeyboardShortcuts.Shortcut?
let selectedHotkey1RawValue: String?
let selectedHotkey2RawValue: String?
let launchAtLoginEnabled: Bool?
@ -86,6 +87,7 @@ class ImportExportService {
let generalSettingsToExport = GeneralSettings(
toggleMiniRecorderShortcut: KeyboardShortcuts.getShortcut(for: .toggleMiniRecorder),
toggleMiniRecorderShortcut2: KeyboardShortcuts.getShortcut(for: .toggleMiniRecorder2),
retryLastTranscriptionShortcut: KeyboardShortcuts.getShortcut(for: .retryLastTranscription),
selectedHotkey1RawValue: hotkeyManager.selectedHotkey1.rawValue,
selectedHotkey2RawValue: hotkeyManager.selectedHotkey2.rawValue,
launchAtLoginEnabled: LaunchAtLogin.isEnabled,
@ -218,6 +220,9 @@ class ImportExportService {
if let shortcut2 = general.toggleMiniRecorderShortcut2 {
KeyboardShortcuts.setShortcut(shortcut2, for: .toggleMiniRecorder2)
}
if let retryShortcut = general.retryLastTranscriptionShortcut {
KeyboardShortcuts.setShortcut(retryShortcut, for: .retryLastTranscription)
}
if let hotkeyRaw = general.selectedHotkey1RawValue,
let hotkey = HotkeyManager.HotkeyOption(rawValue: hotkeyRaw) {
hotkeyManager.selectedHotkey1 = hotkey

View File

@ -71,13 +71,60 @@ class ParakeetTranscriptionService: TranscriptionService {
let audioSamples = try readAudioSamples(from: audioURL)
// Validate audio data before transcription
guard audioSamples.count >= 16000 else {
logger.notice("🦜 Audio too short for transcription: \(audioSamples.count) samples")
// Validate audio data before VAD
guard !audioSamples.isEmpty else {
logger.notice("🦜 Audio is empty, skipping transcription.")
throw ASRError.invalidAudioData
}
// Use VAD to get speech segments
var speechAudio: [Float] = []
let isVADEnabled = UserDefaults.standard.object(forKey: "IsVADEnabled") as? Bool ?? true
if isVADEnabled {
if let modelPath = await VADModelManager.shared.getModelPath() {
if let vad = VoiceActivityDetector(modelPath: modelPath) {
let speechSegments = vad.process(audioSamples: audioSamples)
logger.notice("🦜 VAD detected \(speechSegments.count) speech segments.")
let sampleRate = 16000 // Assuming 16kHz sample rate
for segment in speechSegments {
let startSample = Int(segment.start * Double(sampleRate))
var endSample = Int(segment.end * Double(sampleRate))
// Cap endSample to the audio buffer size
if endSample > audioSamples.count {
endSample = audioSamples.count
}
if startSample < endSample {
speechAudio.append(contentsOf: audioSamples[startSample..<endSample])
} else {
logger.warning("🦜 Invalid sample range for segment: start=\(startSample), end=\(endSample). Skipping.")
}
}
logger.notice("🦜 Extracted \(speechAudio.count) samples from VAD segments.")
} else {
logger.warning("🦜 VAD could not be initialized. Transcribing original audio.")
speechAudio = audioSamples
}
} else {
logger.warning("🦜 VAD model path not found. Transcribing original audio.")
speechAudio = audioSamples
}
} else {
logger.notice("🦜 VAD is disabled by user setting. Transcribing original audio.")
speechAudio = audioSamples
}
// Validate audio data after VAD
guard speechAudio.count >= 16000 else {
logger.notice("🦜 Audio too short for transcription after VAD: \(speechAudio.count) samples")
throw ASRError.invalidAudioData
}
let result = try await asrManager.transcribe(audioSamples)
let result = try await asrManager.transcribe(speechAudio)
print(result.text)
// Reset decoder state and cleanup after transcription to avoid blocking the transcription start
Task {
@ -91,10 +138,16 @@ class ParakeetTranscriptionService: TranscriptionService {
logger.notice("🦜 Warning: Empty transcription result for \(audioSamples.count) samples - possible vocabulary issue")
}
var text = result.text
if UserDefaults.standard.object(forKey: "IsTextFormattingEnabled") as? Bool ?? true {
return WhisperTextFormatter.format(result.text)
text = WhisperTextFormatter.format(text)
}
return result.text
// Apply hallucination and filler word filtering
text = WhisperHallucinationFilter.filter(text)
return text
}
private func readAudioSamples(from url: URL) throws -> [Float] {

View File

@ -0,0 +1,28 @@
import Foundation
import UniformTypeIdentifiers
/// Single source of truth for the audio/video file types VoiceInk can transcribe.
struct SupportedMedia {
    /// Lowercased file extensions accepted for transcription.
    static let extensions: Set<String> = [
        "wav", "mp3", "m4a", "aiff", "mp4", "mov", "aac", "flac", "caf"
    ]

    /// Uniform-type fallbacks used when the extension alone is inconclusive.
    static let contentTypes: [UTType] = [
        .audio, .movie
    ]

    /// Returns `true` when `url` points at a supported media file.
    /// The (lowercased) extension is checked first; if it doesn't match,
    /// the file's declared content type is consulted as a fallback.
    static func isSupported(url: URL) -> Bool {
        let ext = url.pathExtension.lowercased()
        if extensions.contains(ext), !ext.isEmpty {
            return true
        }
        guard
            let values = try? url.resourceValues(forKeys: [.contentTypeKey]),
            let declaredType = values.contentType
        else {
            return false
        }
        return contentTypes.contains { declaredType.conforms(to: $0) }
    }
}

View File

@ -0,0 +1,88 @@
import Foundation
import AVFoundation
import os.log
#if canImport(whisper)
import whisper
#else
#error("Unable to import whisper module. Please check your project configuration.")
#endif
// MARK: - C API Bridge
// Opaque pointers for the C contexts
fileprivate typealias WhisperVADContext = OpaquePointer
fileprivate typealias WhisperVADSegments = OpaquePointer
// MARK: - VoiceActivityDetector Class
/// Thin Swift wrapper around whisper.cpp's voice-activity-detection (VAD) C API.
/// Owns one `whisper_vad_context` for its lifetime (freed in `deinit`) and turns
/// detected speech into `(start, end)` intervals measured in seconds.
class VoiceActivityDetector {
    // Opaque C context created in `init?`; released exactly once in `deinit`.
    private var vadContext: WhisperVADContext
    private let logger = Logger(subsystem: "com.voiceink.app", category: "VoiceActivityDetector")

    /// Loads the VAD model at `modelPath`; returns `nil` if whisper.cpp cannot
    /// create a context (bad path, unreadable model, etc.).
    init?(modelPath: String) {
        var contextParams = whisper_vad_default_context_params()
        // Leave two cores for the rest of the app, but clamp to the 1...8 range.
        contextParams.n_threads = max(1, min(8, Int32(ProcessInfo.processInfo.processorCount) - 2))
        let contextOpt: WhisperVADContext? = modelPath.withCString { cPath in
            whisper_vad_init_from_file_with_params(cPath, contextParams)
        }
        guard let context = contextOpt else {
            logger.error("Failed to initialize VAD context.")
            return nil
        }
        self.vadContext = context
        logger.notice("VAD context initialized successfully.")
    }

    deinit {
        whisper_vad_free(vadContext)
        logger.notice("VAD context freed.")
    }

    /// Processes audio samples to detect speech segments and returns an array of
    /// (start: TimeInterval, end: TimeInterval) tuples, in seconds.
    /// Returns an empty array on any C-API failure (logged, never thrown).
    /// NOTE(review): samples are assumed to be 16 kHz mono floats — confirm at call sites.
    func process(audioSamples: [Float]) -> [(start: TimeInterval, end: TimeInterval)] {
        // 1. Detect speech and get probabilities internally in the context
        let success = audioSamples.withUnsafeBufferPointer { buffer in
            whisper_vad_detect_speech(vadContext, buffer.baseAddress!, Int32(audioSamples.count))
        }
        guard success else {
            logger.error("Failed to detect speech probabilities.")
            return []
        }
        // 2. Get segments from probabilities
        var vadParams = whisper_vad_default_params()
        vadParams.threshold = 0.45
        vadParams.min_speech_duration_ms = 150
        vadParams.min_silence_duration_ms = 750
        vadParams.max_speech_duration_s = Float.greatestFiniteMagnitude // Use the largest representable Float value for no max duration
        vadParams.speech_pad_ms = 100
        vadParams.samples_overlap = 0.1 // Add samples_overlap parameter
        guard let segments = whisper_vad_segments_from_probs(vadContext, vadParams) else {
            logger.error("Failed to get VAD segments from probabilities.")
            return []
        }
        defer {
            // Ensure segments are freed
            whisper_vad_free_segments(segments)
        }
        let nSegments = whisper_vad_segments_n_segments(segments)
        logger.notice("Detected \(nSegments) speech segments.")
        var speechSegments: [(start: TimeInterval, end: TimeInterval)] = []
        for i in 0..<nSegments {
            // The C API reports segment times scaled by 100 (apparently centiseconds,
            // matching whisper.cpp token timestamps — TODO confirm); divide to get seconds.
            let startTimeSec = whisper_vad_segments_get_segment_t0(segments, i) / 100.0
            let endTimeSec = whisper_vad_segments_get_segment_t1(segments, i) / 100.0
            speechSegments.append((start: TimeInterval(startTimeSec), end: TimeInterval(endTimeSec)))
        }
        logger.notice("Returning \(speechSegments.count) speech segments.")
        return speechSegments
    }
}

View File

@ -14,22 +14,31 @@ class WordReplacementService {
var modifiedText = text
// Apply replacements (case-insensitive)
for (original, replacement) in replacements {
let isPhrase = original.contains(" ") || original.trimmingCharacters(in: .whitespacesAndNewlines) != original
for (originalGroup, replacement) in replacements {
// Split comma-separated originals at apply time only
let variants = originalGroup
.split(separator: ",")
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
.filter { !$0.isEmpty }
if isPhrase || !usesWordBoundaries(for: original) {
modifiedText = modifiedText.replacingOccurrences(of: original, with: replacement, options: .caseInsensitive)
} else {
// Use word boundaries for spaced languages
let pattern = "\\b\(NSRegularExpression.escapedPattern(for: original))\\b"
if let regex = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) {
let range = NSRange(modifiedText.startIndex..., in: modifiedText)
modifiedText = regex.stringByReplacingMatches(
in: modifiedText,
options: [],
range: range,
withTemplate: replacement
)
for original in variants {
let usesBoundaries = usesWordBoundaries(for: original)
if usesBoundaries {
// Word-boundary regex for full original string
let pattern = "\\b\(NSRegularExpression.escapedPattern(for: original))\\b"
if let regex = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) {
let range = NSRange(modifiedText.startIndex..., in: modifiedText)
modifiedText = regex.stringByReplacingMatches(
in: modifiedText,
options: [],
range: range,
withTemplate: replacement
)
}
} else {
// Fallback substring replace for non-spaced scripts
modifiedText = modifiedText.replacingOccurrences(of: original, with: replacement, options: .caseInsensitive)
}
}
}

View File

@ -112,6 +112,12 @@ struct AudioTranscribeView: View {
Text(errorMessage)
}
}
.onReceive(NotificationCenter.default.publisher(for: .openFileForTranscription)) { notification in
if let url = notification.userInfo?["url"] as? URL {
// Do not auto-start; only select file for manual transcription
validateAndSetAudioFile(url)
}
}
}
private var dropZoneView: some View {
@ -347,29 +353,8 @@ struct AudioTranscribeView: View {
}
}
// Validate file type by extension
let supportedExtensions = ["wav", "mp3", "m4a", "aiff", "mp4", "mov", "aac", "flac", "caf"]
let fileExtension = url.pathExtension.lowercased()
// Check file extension first
if !fileExtension.isEmpty && supportedExtensions.contains(fileExtension) {
print("File type validated by extension: \(fileExtension)")
} else {
print("Unsupported file extension: \(fileExtension)")
// Try to validate by UTType as well
if let resourceValues = try? url.resourceValues(forKeys: [.contentTypeKey]),
let contentType = resourceValues.contentType {
if contentType.conforms(to: .audio) || contentType.conforms(to: .movie) {
print("File type validated by UTType: \(contentType.identifier)")
} else {
print("File does not conform to audio or movie type: \(contentType.identifier)")
return
}
} else {
print("Could not validate file type")
return
}
}
// Validate file type
guard SupportedMedia.isSupported(url: url) else { return }
print("File validated successfully: \(url.lastPathComponent)")
selectedAudioURL = url
@ -381,4 +366,4 @@ struct AudioTranscribeView: View {
let seconds = Int(duration) % 60
return String(format: "%d:%02d", minutes, seconds)
}
}
}

View File

@ -165,6 +165,7 @@ struct ContentView: View {
let appVersion = Bundle.main.infoDictionary?["CFBundleShortVersionString"] as? String ?? "1.0.0"
@StateObject private var licenseViewModel = LicenseViewModel()
private var isSetupComplete: Bool {
hasLoadedData &&
whisperState.currentTranscriptionModel != nil &&
@ -192,6 +193,7 @@ struct ContentView: View {
.onAppear {
hasLoadedData = true
}
// inside ContentView body:
.onReceive(NotificationCenter.default.publisher(for: .navigateToDestination)) { notification in
print("ContentView: Received navigation notification")
if let destination = notification.userInfo?["destination"] as? String {
@ -215,6 +217,10 @@ struct ContentView: View {
case "Enhancement":
print("ContentView: Navigating to Enhancement")
selectedView = .enhancement
case "Transcribe Audio":
// Ensure we switch to the Transcribe Audio view in-place
print("ContentView: Navigating to Transcribe Audio")
selectedView = .transcribeAudio
default:
print("ContentView: No matching destination found for: \(destination)")
break
@ -259,3 +265,5 @@ struct ContentView: View {
}
}
}

View File

@ -1,8 +1,5 @@
import SwiftUI
/// A reusable sheet for editing an existing word replacement entry.
/// Mirrors the UI of `AddReplacementSheet` for consistency while pre-populating
/// the fields with the existing values.
// Edit existing word replacement entry
struct EditReplacementSheet: View {
@ObservedObject var manager: WordReplacementManager
let originalKey: String
@ -26,7 +23,7 @@ struct EditReplacementSheet: View {
Divider()
formContent
}
.frame(width: 460, height: 480)
.frame(width: 460, height: 560)
}
// MARK: Subviews
@ -65,7 +62,7 @@ struct EditReplacementSheet: View {
}
private var descriptionSection: some View {
Text("Update the word or phrase that should be automatically replaced during AI enhancement.")
Text("Update the word or phrase that should be automatically replaced.")
.font(.subheadline)
.foregroundColor(.secondary)
.frame(maxWidth: .infinity, alignment: .leading)
@ -84,8 +81,9 @@ struct EditReplacementSheet: View {
.font(.caption)
.foregroundColor(.secondary)
}
TextField("Enter word or phrase to replace", text: $originalWord)
TextField("Enter word or phrase to replace (use commas for multiple)", text: $originalWord)
.textFieldStyle(.roundedBorder)
}
.padding(.horizontal)
@ -117,7 +115,12 @@ struct EditReplacementSheet: View {
private func saveChanges() {
let newOriginal = originalWord.trimmingCharacters(in: .whitespacesAndNewlines)
let newReplacement = replacementWord.trimmingCharacters(in: .whitespacesAndNewlines)
guard !newOriginal.isEmpty, !newReplacement.isEmpty else { return }
// Ensure at least one non-empty token
let tokens = newOriginal
.split(separator: ",")
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
.filter { !$0.isEmpty }
guard !tokens.isEmpty, !newReplacement.isEmpty else { return }
manager.updateReplacement(oldOriginal: originalKey, newOriginal: newOriginal, newReplacement: newReplacement)
dismiss()

View File

@ -23,7 +23,10 @@ class WordReplacementManager: ObservableObject {
}
func addReplacement(original: String, replacement: String) {
replacements[original] = replacement
// Preserve comma-separated originals as a single entry
let trimmed = original.trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return }
replacements[trimmed] = replacement
}
func removeReplacement(original: String) {
@ -31,12 +34,11 @@ class WordReplacementManager: ObservableObject {
}
func updateReplacement(oldOriginal: String, newOriginal: String, newReplacement: String) {
// Remove the old key if the original text has changed
if oldOriginal != newOriginal {
replacements.removeValue(forKey: oldOriginal)
}
// Update (or insert) the new key/value pair
replacements[newOriginal] = newReplacement
// Replace old key with the new comma-preserved key
replacements.removeValue(forKey: oldOriginal)
let trimmed = newOriginal.trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return }
replacements[trimmed] = newReplacement
}
}
@ -142,7 +144,7 @@ struct EmptyStateView: View {
Text("No Replacements")
.font(.headline)
Text("Add word replacements to automatically replace text during AI enhancement.")
Text("Add word replacements to automatically replace text.")
.font(.subheadline)
.foregroundColor(.secondary)
.multilineTextAlignment(.center)
@ -200,7 +202,7 @@ struct AddReplacementSheet: View {
ScrollView {
VStack(spacing: 20) {
// Description
Text("Define a word or phrase to be automatically replaced during AI enhancement.")
Text("Define a word or phrase to be automatically replaced.")
.font(.subheadline)
.foregroundColor(.secondary)
.frame(maxWidth: .infinity, alignment: .leading)
@ -221,9 +223,12 @@ struct AddReplacementSheet: View {
.foregroundColor(.secondary)
}
TextField("Enter word or phrase to replace", text: $originalWord)
TextField("Enter word or phrase to replace (use commas for multiple)", text: $originalWord)
.textFieldStyle(.roundedBorder)
.font(.body)
Text("Separate multiple originals with commas, e.g. Voicing, Voice ink, Voiceing")
.font(.caption)
.foregroundColor(.secondary)
}
.padding(.horizontal)
@ -255,10 +260,11 @@ struct AddReplacementSheet: View {
// Example Section
VStack(alignment: .leading, spacing: 8) {
Text("Example")
Text("Examples")
.font(.subheadline)
.foregroundColor(.secondary)
// Single original -> replacement
HStack(spacing: 12) {
VStack(alignment: .leading, spacing: 4) {
Text("Original:")
@ -280,6 +286,34 @@ struct AddReplacementSheet: View {
.font(.callout)
}
}
.frame(maxWidth: .infinity, alignment: .leading)
.padding(12)
.background(Color(.textBackgroundColor))
.cornerRadius(8)
// Comma-separated originals -> single replacement
HStack(spacing: 12) {
VStack(alignment: .leading, spacing: 4) {
Text("Original:")
.font(.caption)
.foregroundColor(.secondary)
Text("Voicing, Voice ink, Voiceing")
.font(.callout)
}
Image(systemName: "arrow.right")
.font(.caption)
.foregroundColor(.secondary)
VStack(alignment: .leading, spacing: 4) {
Text("Replacement:")
.font(.caption)
.foregroundColor(.secondary)
Text("VoiceInk")
.font(.callout)
}
}
.frame(maxWidth: .infinity, alignment: .leading)
.padding(12)
.background(Color(.textBackgroundColor))
.cornerRadius(8)
@ -290,14 +324,19 @@ struct AddReplacementSheet: View {
.padding(.vertical)
}
}
.frame(width: 460, height: 480)
.frame(width: 460, height: 520)
}
private func addReplacement() {
let original = originalWord
let replacement = replacementWord
guard !original.isEmpty && !replacement.isEmpty else { return }
// Validate that at least one non-empty token exists
let tokens = original
.split(separator: ",")
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
.filter { !$0.isEmpty }
guard !tokens.isEmpty && !replacement.isEmpty else { return }
manager.addReplacement(original: original, replacement: replacement)
dismiss()

View File

@ -1,4 +1,5 @@
import SwiftUI
import UniformTypeIdentifiers
struct EnhancementSettingsView: View {
@EnvironmentObject private var enhancementService: AIEnhancementService
@ -79,25 +80,22 @@ struct EnhancementSettingsView: View {
Text("Enhancement Prompt")
.font(.headline)
// Prompts Section
VStack(alignment: .leading, spacing: 12) {
PromptSelectionGrid(
prompts: enhancementService.allPrompts,
selectedPromptId: enhancementService.selectedPromptId,
onPromptSelected: { prompt in
enhancementService.setActivePrompt(prompt)
},
onEditPrompt: { prompt in
selectedPromptForEdit = prompt
},
onDeletePrompt: { prompt in
enhancementService.deletePrompt(prompt)
},
onAddNewPrompt: {
isEditingPrompt = true
}
)
}
// Reorderable prompts grid with drag-and-drop
ReorderablePromptGrid(
selectedPromptId: enhancementService.selectedPromptId,
onPromptSelected: { prompt in
enhancementService.setActivePrompt(prompt)
},
onEditPrompt: { prompt in
selectedPromptForEdit = prompt
},
onDeletePrompt: { prompt in
enhancementService.deletePrompt(prompt)
},
onAddNewPrompt: {
isEditingPrompt = true
}
)
}
.padding()
.background(CardBackground(isSelected: false))
@ -115,3 +113,151 @@ struct EnhancementSettingsView: View {
}
}
}
// MARK: - Drag & Drop Reorderable Grid
/// Adaptive grid of prompt tiles supporting drag-and-drop reordering.
/// Reads (and mutates, via the drop delegates) `enhancementService.customPrompts`;
/// selection/edit/delete/add are forwarded to the injected callbacks.
private struct ReorderablePromptGrid: View {
    @EnvironmentObject private var enhancementService: AIEnhancementService
    let selectedPromptId: UUID?
    let onPromptSelected: (CustomPrompt) -> Void
    let onEditPrompt: ((CustomPrompt) -> Void)?
    let onDeletePrompt: ((CustomPrompt) -> Void)?
    let onAddNewPrompt: (() -> Void)?
    // The prompt currently being dragged; nil when no drag is in flight.
    @State private var draggingItem: CustomPrompt?

    var body: some View {
        VStack(alignment: .leading, spacing: 12) {
            if enhancementService.customPrompts.isEmpty {
                Text("No prompts available")
                    .foregroundColor(.secondary)
                    .font(.caption)
            } else {
                let columns = [
                    GridItem(.adaptive(minimum: 80, maximum: 100), spacing: 36)
                ]
                LazyVGrid(columns: columns, spacing: 16) {
                    ForEach(enhancementService.customPrompts) { prompt in
                        prompt.promptIcon(
                            isSelected: selectedPromptId == prompt.id,
                            onTap: {
                                withAnimation(.spring(response: 0.3, dampingFraction: 0.7)) {
                                    onPromptSelected(prompt)
                                }
                            },
                            onEdit: onEditPrompt,
                            onDelete: onDeletePrompt
                        )
                        // Ghost the tile being dragged; slightly enlarge it for feedback.
                        .opacity(draggingItem?.id == prompt.id ? 0.3 : 1.0)
                        .scaleEffect(draggingItem?.id == prompt.id ? 1.05 : 1.0)
                        // While a drag is active, outline every OTHER tile as a drop target.
                        .overlay(
                            RoundedRectangle(cornerRadius: 14)
                                .stroke(
                                    draggingItem != nil && draggingItem?.id != prompt.id
                                        ? Color.accentColor.opacity(0.25)
                                        : Color.clear,
                                    lineWidth: 1
                                )
                        )
                        .animation(.easeInOut(duration: 0.15), value: draggingItem?.id == prompt.id)
                        .onDrag {
                            // Record the drag source; the provider payload is only the id string.
                            draggingItem = prompt
                            return NSItemProvider(object: prompt.id.uuidString as NSString)
                        }
                        .onDrop(
                            of: [UTType.text],
                            delegate: PromptDropDelegate(
                                item: prompt,
                                prompts: $enhancementService.customPrompts,
                                draggingItem: $draggingItem
                            )
                        )
                    }
                    if let onAddNewPrompt = onAddNewPrompt {
                        CustomPrompt.addNewButton {
                            onAddNewPrompt()
                        }
                        .help("Add new prompt")
                        // Dropping on the trailing "Add New" tile moves the prompt to the end.
                        .onDrop(
                            of: [UTType.text],
                            delegate: PromptEndDropDelegate(
                                prompts: $enhancementService.customPrompts,
                                draggingItem: $draggingItem
                            )
                        )
                    }
                }
                .padding(.vertical, 12)
                .padding(.horizontal, 16)
                HStack {
                    Image(systemName: "info.circle")
                        .font(.caption)
                        .foregroundColor(.secondary)
                    Text("Double-click to edit • Right-click for more options")
                        .font(.caption)
                        .foregroundColor(.secondary)
                }
                .padding(.top, 8)
                .padding(.horizontal, 16)
            }
        }
    }
}
// MARK: - Drop Delegates
/// Live-reorders `prompts` while a dragged tile hovers over another tile.
private struct PromptDropDelegate: DropDelegate {
    let item: CustomPrompt
    @Binding var prompts: [CustomPrompt]
    @Binding var draggingItem: CustomPrompt?

    func dropEntered(info: DropInfo) {
        // Bail unless a different prompt is in flight and both ends are locatable.
        guard
            let dragged = draggingItem,
            dragged != item,
            let sourceIndex = prompts.firstIndex(of: dragged),
            let targetIndex = prompts.firstIndex(of: item),
            prompts[targetIndex].id != dragged.id
        else { return }

        // Move eagerly on hover so the grid animates before the drop lands.
        withAnimation(.easeInOut(duration: 0.12)) {
            prompts.move(
                fromOffsets: IndexSet(integer: sourceIndex),
                toOffset: targetIndex > sourceIndex ? targetIndex + 1 : targetIndex
            )
        }
    }

    func dropUpdated(info: DropInfo) -> DropProposal? {
        DropProposal(operation: .move)
    }

    func performDrop(info: DropInfo) -> Bool {
        // Reordering already happened in dropEntered; just clear the drag state.
        draggingItem = nil
        return true
    }
}
/// Handles drops on the trailing "Add New" tile: sends the dragged prompt
/// to the end of the list.
private struct PromptEndDropDelegate: DropDelegate {
    @Binding var prompts: [CustomPrompt]
    /// The prompt being dragged; cleared whenever the drop completes or fails.
    @Binding var draggingItem: CustomPrompt?

    func validateDrop(info: DropInfo) -> Bool { true }

    func dropUpdated(info: DropInfo) -> DropProposal? {
        DropProposal(operation: .move)
    }

    func performDrop(info: DropInfo) -> Bool {
        // The drag session ends on every exit path.
        defer { draggingItem = nil }

        guard let dragged = draggingItem,
              let sourceIndex = prompts.firstIndex(of: dragged) else {
            return false
        }

        // Dropped on the trailing "Add New" tile: append to the end.
        withAnimation(.easeInOut(duration: 0.12)) {
            prompts.move(fromOffsets: IndexSet(integer: sourceIndex),
                         toOffset: prompts.endIndex)
        }
        return true
    }
}

View File

@ -99,7 +99,7 @@ struct ModelSettingsView: View {
InfoTip(
title: "Voice Activity Detection",
message: "Detects speech segments and filters out silence to reduce hallucinations in local Whisper models."
message: "Detect speech segments and filter out silence to improve accuracy of local models."
)
}

View File

@ -37,7 +37,7 @@ struct ExperimentalFeaturesSection: View {
if isExperimentalFeaturesEnabled {
Toggle(isOn: $playbackController.isPauseMediaEnabled) {
Text("Pause Media on Playback")
Text("Pause Media during recording")
}
.toggleStyle(.switch)
.help("Automatically pause active media playback during recordings and resume afterward.")

View File

@ -130,6 +130,8 @@ struct SettingsView: View {
Divider()
// Custom Cancel Shortcut
VStack(alignment: .leading, spacing: 12) {
HStack(spacing: 8) {

View File

@ -114,6 +114,15 @@ struct VoiceInkApp: App {
if !UserDefaults.standard.bool(forKey: "IsTranscriptionCleanupEnabled") {
audioCleanupManager.startAutomaticCleanup(modelContext: container.mainContext)
}
// Process any pending open-file request now that the main ContentView is ready.
if let pendingURL = appDelegate.pendingOpenFileURL {
NotificationCenter.default.post(name: .navigateToDestination, object: nil, userInfo: ["destination": "Transcribe Audio"])
DispatchQueue.main.asyncAfter(deadline: .now() + 0.3) {
NotificationCenter.default.post(name: .openFileForTranscription, object: nil, userInfo: ["url": pendingURL])
}
appDelegate.pendingOpenFileURL = nil
}
}
.background(WindowAccessor { window in
WindowManager.shared.configureWindow(window)

View File

@ -4,49 +4,48 @@ import os
/// Post-processes Whisper transcriptions, stripping bracketed hallucinations
/// and spoken filler words. (Reconstructed post-merge version: the diff
/// rendering interleaved old and new lines without markers.)
struct WhisperHallucinationFilter {
    private static let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "WhisperHallucinationFilter")

    // Bracketed text is the most reliable format indicator of a hallucination.
    private static let hallucinationPatterns = [
        #"\[.*?\]"#, // Square brackets
        #"\(.*?\)"#, // Parentheses
        #"\{.*?\}"#  // Curly braces
    ]

    // Spoken filler words removed as standalone words (with an optional
    // trailing comma/period). NOTE(review): single letters "h" and "m" will
    // also strip legitimate standalone "h"/"m" tokens — confirm intended.
    private static let fillerWords = [
        "uh", "um", "uhm", "umm", "uhh", "uhhh", "er", "ah", "eh",
        "hmm", "hm", "h", "m", "mmm", "mm", "mh", "ha", "ehh"
    ]

    /// Removes hallucinations and filler words from transcription text.
    /// - Parameter text: Original transcription text from Whisper.
    /// - Returns: Filtered text with bracketed spans and filler words removed
    ///   and whitespace normalized.
    static func filter(_ text: String) -> String {
        logger.notice("🧹 Filtering hallucinations and filler words")
        var filteredText = text

        // Remove bracketed hallucinations
        for pattern in hallucinationPatterns {
            if let regex = try? NSRegularExpression(pattern: pattern) {
                let range = NSRange(filteredText.startIndex..., in: filteredText)
                filteredText = regex.stringByReplacingMatches(in: filteredText, options: [], range: range, withTemplate: "")
            }
        }

        // Remove filler words (case-insensitive, whole-word, optional trailing punctuation)
        for fillerWord in fillerWords {
            let pattern = "\\b\(NSRegularExpression.escapedPattern(for: fillerWord))\\b[,.]?"
            if let regex = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) {
                let range = NSRange(filteredText.startIndex..., in: filteredText)
                filteredText = regex.stringByReplacingMatches(in: filteredText, options: [], range: range, withTemplate: "")
            }
        }

        // Clean whitespace left behind by the removals
        filteredText = filteredText.replacingOccurrences(of: #"\s{2,}"#, with: " ", options: .regularExpression)
        filteredText = filteredText.trimmingCharacters(in: .whitespacesAndNewlines)

        // Log results
        if filteredText != text {
            logger.notice("✅ Removed hallucinations and filler words")
        } else {
            logger.notice("✅ No hallucinations or filler words found")
        }

        return filteredText
    }
}