Merge branch 'main' into feat/paste-last-enhancement
This commit is contained in:
commit
8293bc27b7
@ -459,7 +459,7 @@
|
||||
"CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development";
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
COMBINE_HIDPI_IMAGES = YES;
|
||||
CURRENT_PROJECT_VERSION = 152;
|
||||
CURRENT_PROJECT_VERSION = 153;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = V6J6A3VWY2;
|
||||
ENABLE_HARDENED_RUNTIME = YES;
|
||||
@ -474,7 +474,7 @@
|
||||
"@executable_path/../Frameworks",
|
||||
);
|
||||
MACOSX_DEPLOYMENT_TARGET = 14.0;
|
||||
MARKETING_VERSION = 1.52;
|
||||
MARKETING_VERSION = 1.53;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";
|
||||
@ -493,7 +493,7 @@
|
||||
"CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development";
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
COMBINE_HIDPI_IMAGES = YES;
|
||||
CURRENT_PROJECT_VERSION = 152;
|
||||
CURRENT_PROJECT_VERSION = 153;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = V6J6A3VWY2;
|
||||
ENABLE_HARDENED_RUNTIME = YES;
|
||||
@ -508,7 +508,7 @@
|
||||
"@executable_path/../Frameworks",
|
||||
);
|
||||
MACOSX_DEPLOYMENT_TARGET = 14.0;
|
||||
MARKETING_VERSION = 1.52;
|
||||
MARKETING_VERSION = 1.53;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "ENABLE_NATIVE_SPEECH_ANALYZER $(inherited)";
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
"location" : "https://github.com/FluidInference/FluidAudio",
|
||||
"state" : {
|
||||
"branch" : "main",
|
||||
"revision" : "abf7d9ef3f53a693e3721069071971eff84c002f"
|
||||
"revision" : "052cbb27cf073a9407251d74ef3459ea258e41b3"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import Cocoa
|
||||
import SwiftUI
|
||||
import UniformTypeIdentifiers
|
||||
|
||||
class AppDelegate: NSObject, NSApplicationDelegate {
|
||||
func applicationDidFinishLaunching(_ notification: Notification) {
|
||||
@ -49,4 +50,28 @@ class AppDelegate: NSObject, NSApplicationDelegate {
|
||||
defaults.removeObject(forKey: "defaultPowerModeConfigV2")
|
||||
defaults.removeObject(forKey: "isPowerModeEnabled")
|
||||
}
|
||||
|
||||
// Stash URL when app cold-starts to avoid spawning a new window/tab
|
||||
var pendingOpenFileURL: URL?
|
||||
|
||||
/// Handles files handed to the app by the system.
///
/// Only the first URL accepted by `SupportedMedia.isSupported(url:)` is used;
/// when none of `urls` qualifies the call does nothing.
///
/// - Parameters:
///   - application: The application object delivering the request.
///   - urls: The URLs the app was asked to open.
func application(_ application: NSApplication, open urls: [URL]) {
    guard let mediaURL = urls.first(where: { candidate in SupportedMedia.isSupported(url: candidate) }) else {
        return
    }

    NSApp.activate(ignoringOtherApps: true)

    guard !NSApp.windows.isEmpty else {
        // Cold start: no window is created here, to avoid an extra window/tab.
        // The URL is stashed so the SwiftUI WindowGroup-created ContentView can process it later.
        pendingOpenFileURL = mediaURL
        return
    }

    // Already running: bring the current window forward and route in place to Transcribe Audio.
    NSApp.windows.first?.makeKeyAndOrderFront(nil)

    let center = NotificationCenter.default
    center.post(
        name: .navigateToDestination,
        object: nil,
        userInfo: ["destination": "Transcribe Audio"]
    )
    // The file notification is queued on the main queue so it is posted after the navigation request.
    DispatchQueue.main.async {
        center.post(
            name: .openFileForTranscription,
            object: nil,
            userInfo: ["url": mediaURL]
        )
    }
}
|
||||
}
|
||||
|
||||
@ -154,13 +154,6 @@ class HotkeyManager: ObservableObject {
|
||||
}
|
||||
}
|
||||
|
||||
KeyboardShortcuts.onKeyUp(for: .pasteLastEnhancement) { [weak self] in
|
||||
guard let self = self else { return }
|
||||
Task { @MainActor in
|
||||
LastTranscriptionService.pasteLastEnhancement(from: self.whisperState.modelContext)
|
||||
}
|
||||
}
|
||||
|
||||
KeyboardShortcuts.onKeyUp(for: .retryLastTranscription) { [weak self] in
|
||||
guard let self = self else { return }
|
||||
Task { @MainActor in
|
||||
@ -442,4 +435,3 @@ class HotkeyManager: ObservableObject {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -18,5 +18,33 @@
|
||||
<string>VoiceInk needs to interact with your browser to detect the current website for applying website-specific configurations.</string>
|
||||
<key>NSScreenCaptureUsageDescription</key>
|
||||
<string>VoiceInk needs screen recording access to understand context from your screen for improved transcription accuracy.</string>
|
||||
<key>CFBundleDocumentTypes</key>
|
||||
<array>
|
||||
<dict>
|
||||
<key>CFBundleTypeName</key>
|
||||
<string>Audio/Video File</string>
|
||||
<key>CFBundleTypeRole</key>
|
||||
<string>Viewer</string>
|
||||
<key>LSHandlerRank</key>
|
||||
<string>Alternate</string>
|
||||
<key>LSItemContentTypes</key>
|
||||
<array>
|
||||
<string>public.audio</string>
|
||||
<string>public.movie</string>
|
||||
</array>
|
||||
<key>CFBundleTypeExtensions</key>
|
||||
<array>
|
||||
<string>wav</string>
|
||||
<string>mp3</string>
|
||||
<string>m4a</string>
|
||||
<string>aiff</string>
|
||||
<string>mp4</string>
|
||||
<string>mov</string>
|
||||
<string>aac</string>
|
||||
<string>flac</string>
|
||||
<string>caf</string>
|
||||
</array>
|
||||
</dict>
|
||||
</array>
|
||||
</dict>
|
||||
</plist>
|
||||
|
||||
@ -14,4 +14,5 @@ extension Notification.Name {
|
||||
static let powerModeConfigurationApplied = Notification.Name("powerModeConfigurationApplied")
|
||||
static let transcriptionCreated = Notification.Name("transcriptionCreated")
|
||||
static let enhancementToggleChanged = Notification.Name("enhancementToggleChanged")
|
||||
static let openFileForTranscription = Notification.Name("openFileForTranscription")
|
||||
}
|
||||
|
||||
@ -182,18 +182,16 @@ struct ConfigurationView: View {
|
||||
}
|
||||
|
||||
// Default Power Mode Toggle
|
||||
if !powerModeManager.hasDefaultConfiguration() || isCurrentConfigDefault {
|
||||
HStack {
|
||||
Toggle("Set as default power mode", isOn: $isDefault)
|
||||
.font(.system(size: 14))
|
||||
|
||||
InfoTip(
|
||||
title: "Default Power Mode",
|
||||
message: "Default power mode is used when no specific app or website matches are found"
|
||||
)
|
||||
|
||||
Spacer()
|
||||
}
|
||||
HStack {
|
||||
Toggle("Set as default power mode", isOn: $isDefault)
|
||||
.font(.system(size: 14))
|
||||
|
||||
InfoTip(
|
||||
title: "Default Power Mode",
|
||||
message: "Default power mode is used when no specific app or website matches are found"
|
||||
)
|
||||
|
||||
Spacer()
|
||||
}
|
||||
}
|
||||
.padding(.horizontal, 20)
|
||||
|
||||
@ -203,7 +203,7 @@ struct ConfigurationRow: View {
|
||||
.padding(.vertical, 12)
|
||||
.padding(.horizontal, 14)
|
||||
|
||||
if selectedModel != nil || selectedLanguage != nil || config.isAIEnhancementEnabled {
|
||||
if selectedModel != nil || selectedLanguage != nil || config.isAIEnhancementEnabled || config.isAutoSendEnabled {
|
||||
Divider()
|
||||
.padding(.horizontal, 16)
|
||||
|
||||
@ -259,6 +259,22 @@ struct ConfigurationRow: View {
|
||||
)
|
||||
}
|
||||
|
||||
if config.isAutoSendEnabled {
|
||||
HStack(spacing: 4) {
|
||||
Image(systemName: "keyboard")
|
||||
.font(.system(size: 10))
|
||||
Text("Auto Send")
|
||||
.font(.caption)
|
||||
}
|
||||
.padding(.horizontal, 8)
|
||||
.padding(.vertical, 4)
|
||||
.background(Capsule()
|
||||
.fill(Color(NSColor.controlBackgroundColor)))
|
||||
.overlay(
|
||||
Capsule()
|
||||
.stroke(Color(NSColor.separatorColor), lineWidth: 0.5)
|
||||
)
|
||||
}
|
||||
if config.isAIEnhancementEnabled {
|
||||
if config.useScreenCapture {
|
||||
HStack(spacing: 4) {
|
||||
@ -289,7 +305,7 @@ struct ConfigurationRow: View {
|
||||
.fill(Color.accentColor.opacity(0.1)))
|
||||
.foregroundColor(.accentColor)
|
||||
}
|
||||
|
||||
|
||||
Spacer()
|
||||
}
|
||||
.padding(.vertical, 10)
|
||||
@ -376,4 +392,4 @@ struct AppGridItem: View {
|
||||
}
|
||||
.buttonStyle(.plain)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -76,15 +76,7 @@ class Recorder: NSObject, ObservableObject, AVAudioRecorderDelegate {
|
||||
UserDefaults.standard.set(String(currentDeviceID), forKey: "lastUsedMicrophoneDeviceID")
|
||||
|
||||
hasDetectedAudioInCurrentSession = false
|
||||
|
||||
// Coordinate media control and system audio sequentially for better reliability
|
||||
await playbackController.pauseMedia()
|
||||
|
||||
// Small delay to allow media command to process before muting system audio
|
||||
try? await Task.sleep(nanoseconds: 100_000_000) // 100ms
|
||||
|
||||
_ = await mediaController.muteSystemAudio()
|
||||
|
||||
|
||||
let deviceID = deviceManager.getCurrentDevice()
|
||||
if deviceID != 0 {
|
||||
do {
|
||||
@ -114,6 +106,12 @@ class Recorder: NSObject, ObservableObject, AVAudioRecorderDelegate {
|
||||
throw RecorderError.couldNotStartRecording
|
||||
}
|
||||
|
||||
Task { [weak self] in
|
||||
guard let self = self else { return }
|
||||
await self.playbackController.pauseMedia()
|
||||
_ = await self.mediaController.muteSystemAudio()
|
||||
}
|
||||
|
||||
audioLevelCheckTask?.cancel()
|
||||
audioMeterUpdateTask?.cancel()
|
||||
|
||||
|
||||
@ -261,6 +261,8 @@ class AIEnhancementService: ObservableObject {
|
||||
|
||||
let filteredText = AIEnhancementOutputFilter.filter(enhancedText.trimmingCharacters(in: .whitespacesAndNewlines))
|
||||
return filteredText
|
||||
} else if httpResponse.statusCode == 429 {
|
||||
throw EnhancementError.rateLimitExceeded
|
||||
} else if (500...599).contains(httpResponse.statusCode) {
|
||||
throw EnhancementError.serverError
|
||||
} else {
|
||||
@ -316,6 +318,8 @@ class AIEnhancementService: ObservableObject {
|
||||
|
||||
let filteredText = AIEnhancementOutputFilter.filter(enhancedText.trimmingCharacters(in: .whitespacesAndNewlines))
|
||||
return filteredText
|
||||
} else if httpResponse.statusCode == 429 {
|
||||
throw EnhancementError.rateLimitExceeded
|
||||
} else if (500...599).contains(httpResponse.statusCode) {
|
||||
throw EnhancementError.serverError
|
||||
} else {
|
||||
@ -342,7 +346,7 @@ class AIEnhancementService: ObservableObject {
|
||||
return try await makeRequest(text: text, mode: mode)
|
||||
} catch let error as EnhancementError {
|
||||
switch error {
|
||||
case .networkError, .serverError:
|
||||
case .networkError, .serverError, .rateLimitExceeded:
|
||||
retries += 1
|
||||
if retries < maxRetries {
|
||||
logger.warning("Request failed, retrying in \(currentDelay)s... (Attempt \(retries)/\(maxRetries))")
|
||||
@ -458,6 +462,7 @@ enum EnhancementError: Error {
|
||||
case enhancementFailed
|
||||
case networkError
|
||||
case serverError
|
||||
case rateLimitExceeded
|
||||
case customError(String)
|
||||
}
|
||||
|
||||
@ -474,6 +479,8 @@ extension EnhancementError: LocalizedError {
|
||||
return "Network connection failed. Check your internet."
|
||||
case .serverError:
|
||||
return "The AI provider's server encountered an error. Please try again later."
|
||||
case .rateLimitExceeded:
|
||||
return "Rate limit exceeded. Please try again later."
|
||||
case .customError(let message):
|
||||
return message
|
||||
}
|
||||
|
||||
@ -82,7 +82,7 @@ enum AIProvider: String, CaseIterable {
|
||||
case .groq:
|
||||
return [
|
||||
"llama-3.3-70b-versatile",
|
||||
"moonshotai/kimi-k2-instruct",
|
||||
"moonshotai/kimi-k2-instruct-0905",
|
||||
"qwen/qwen3-32b",
|
||||
"meta-llama/llama-4-maverick-17b-128e-instruct",
|
||||
"openai/gpt-oss-120b"
|
||||
|
||||
@ -6,7 +6,7 @@ class DictionaryContextService {
|
||||
|
||||
private init() {}
|
||||
|
||||
private let predefinedWords = "VoiceInk, chatGPT, GPT-4o, GPT-5-mini, Kimi-K2, GLM V4.5, Claude, Claude 4 sonnet, Claude opus, ultrathink, Vibe-coding, groq, cerebras, gpt-oss-120B, Wispr flow, deepseek, gemini-2.5, Veo 3, elevenlabs, Kyutai"
|
||||
private let predefinedWords = "VoiceInk, chatGPT, GPT-4o, GPT-5-mini, Kimi-K2, GLM V4.5, Claude, Claude 4 sonnet, Claude opus, ultrathink, Vibe-coding, groq, cerebras, gpt-oss-120B, deepseek, gemini-2.5, Veo 3, elevenlabs, Kyutai"
|
||||
|
||||
func getDictionaryContext() -> String {
|
||||
var allWords: [String] = []
|
||||
|
||||
@ -7,6 +7,7 @@ import LaunchAtLogin
|
||||
struct GeneralSettings: Codable {
|
||||
let toggleMiniRecorderShortcut: KeyboardShortcuts.Shortcut?
|
||||
let toggleMiniRecorderShortcut2: KeyboardShortcuts.Shortcut?
|
||||
let retryLastTranscriptionShortcut: KeyboardShortcuts.Shortcut?
|
||||
let selectedHotkey1RawValue: String?
|
||||
let selectedHotkey2RawValue: String?
|
||||
let launchAtLoginEnabled: Bool?
|
||||
@ -86,6 +87,7 @@ class ImportExportService {
|
||||
let generalSettingsToExport = GeneralSettings(
|
||||
toggleMiniRecorderShortcut: KeyboardShortcuts.getShortcut(for: .toggleMiniRecorder),
|
||||
toggleMiniRecorderShortcut2: KeyboardShortcuts.getShortcut(for: .toggleMiniRecorder2),
|
||||
retryLastTranscriptionShortcut: KeyboardShortcuts.getShortcut(for: .retryLastTranscription),
|
||||
selectedHotkey1RawValue: hotkeyManager.selectedHotkey1.rawValue,
|
||||
selectedHotkey2RawValue: hotkeyManager.selectedHotkey2.rawValue,
|
||||
launchAtLoginEnabled: LaunchAtLogin.isEnabled,
|
||||
@ -218,6 +220,9 @@ class ImportExportService {
|
||||
if let shortcut2 = general.toggleMiniRecorderShortcut2 {
|
||||
KeyboardShortcuts.setShortcut(shortcut2, for: .toggleMiniRecorder2)
|
||||
}
|
||||
if let retryShortcut = general.retryLastTranscriptionShortcut {
|
||||
KeyboardShortcuts.setShortcut(retryShortcut, for: .retryLastTranscription)
|
||||
}
|
||||
if let hotkeyRaw = general.selectedHotkey1RawValue,
|
||||
let hotkey = HotkeyManager.HotkeyOption(rawValue: hotkeyRaw) {
|
||||
hotkeyManager.selectedHotkey1 = hotkey
|
||||
|
||||
@ -71,13 +71,60 @@ class ParakeetTranscriptionService: TranscriptionService {
|
||||
|
||||
let audioSamples = try readAudioSamples(from: audioURL)
|
||||
|
||||
// Validate audio data before transcription
|
||||
guard audioSamples.count >= 16000 else {
|
||||
logger.notice("🦜 Audio too short for transcription: \(audioSamples.count) samples")
|
||||
// Validate audio data before VAD
|
||||
guard !audioSamples.isEmpty else {
|
||||
logger.notice("🦜 Audio is empty, skipping transcription.")
|
||||
throw ASRError.invalidAudioData
|
||||
}
|
||||
|
||||
// Use VAD to get speech segments
|
||||
var speechAudio: [Float] = []
|
||||
let isVADEnabled = UserDefaults.standard.object(forKey: "IsVADEnabled") as? Bool ?? true
|
||||
|
||||
if isVADEnabled {
|
||||
if let modelPath = await VADModelManager.shared.getModelPath() {
|
||||
if let vad = VoiceActivityDetector(modelPath: modelPath) {
|
||||
let speechSegments = vad.process(audioSamples: audioSamples)
|
||||
logger.notice("🦜 VAD detected \(speechSegments.count) speech segments.")
|
||||
|
||||
let sampleRate = 16000 // Assuming 16kHz sample rate
|
||||
for segment in speechSegments {
|
||||
let startSample = Int(segment.start * Double(sampleRate))
|
||||
var endSample = Int(segment.end * Double(sampleRate))
|
||||
|
||||
// Cap endSample to the audio buffer size
|
||||
if endSample > audioSamples.count {
|
||||
endSample = audioSamples.count
|
||||
}
|
||||
|
||||
if startSample < endSample {
|
||||
speechAudio.append(contentsOf: audioSamples[startSample..<endSample])
|
||||
} else {
|
||||
logger.warning("🦜 Invalid sample range for segment: start=\(startSample), end=\(endSample). Skipping.")
|
||||
}
|
||||
}
|
||||
logger.notice("🦜 Extracted \(speechAudio.count) samples from VAD segments.")
|
||||
} else {
|
||||
logger.warning("🦜 VAD could not be initialized. Transcribing original audio.")
|
||||
speechAudio = audioSamples
|
||||
}
|
||||
} else {
|
||||
logger.warning("🦜 VAD model path not found. Transcribing original audio.")
|
||||
speechAudio = audioSamples
|
||||
}
|
||||
} else {
|
||||
logger.notice("🦜 VAD is disabled by user setting. Transcribing original audio.")
|
||||
speechAudio = audioSamples
|
||||
}
|
||||
|
||||
// Validate audio data after VAD
|
||||
guard speechAudio.count >= 16000 else {
|
||||
logger.notice("🦜 Audio too short for transcription after VAD: \(speechAudio.count) samples")
|
||||
throw ASRError.invalidAudioData
|
||||
}
|
||||
|
||||
let result = try await asrManager.transcribe(audioSamples)
|
||||
let result = try await asrManager.transcribe(speechAudio)
|
||||
print(result.text)
|
||||
|
||||
// Reset decoder state and cleanup after transcription to avoid blocking the transcription start
|
||||
Task {
|
||||
@ -91,10 +138,16 @@ class ParakeetTranscriptionService: TranscriptionService {
|
||||
logger.notice("🦜 Warning: Empty transcription result for \(audioSamples.count) samples - possible vocabulary issue")
|
||||
}
|
||||
|
||||
var text = result.text
|
||||
|
||||
if UserDefaults.standard.object(forKey: "IsTextFormattingEnabled") as? Bool ?? true {
|
||||
return WhisperTextFormatter.format(result.text)
|
||||
text = WhisperTextFormatter.format(text)
|
||||
}
|
||||
return result.text
|
||||
|
||||
// Apply hallucination and filler word filtering
|
||||
text = WhisperHallucinationFilter.filter(text)
|
||||
|
||||
return text
|
||||
}
|
||||
|
||||
private func readAudioSamples(from url: URL) throws -> [Float] {
|
||||
|
||||
28
VoiceInk/Services/SupportedMedia.swift
Normal file
28
VoiceInk/Services/SupportedMedia.swift
Normal file
@ -0,0 +1,28 @@
|
||||
import Foundation
|
||||
import UniformTypeIdentifiers
|
||||
|
||||
/// Single source of truth for which media files the app accepts.
struct SupportedMedia {
    /// Lower-cased file extensions that are accepted without inspecting the file itself.
    static let extensions: Set<String> = [
        "wav", "mp3", "m4a", "aiff", "mp4", "mov", "aac", "flac", "caf"
    ]

    /// Uniform types a file may conform to when its extension is not in `extensions`.
    static let contentTypes: [UTType] = [
        .audio, .movie
    ]

    /// Reports whether `url` points at a supported media file.
    ///
    /// The path extension is checked first (case-insensitively). If it is missing or
    /// not listed, the URL's content type resource value is consulted instead.
    ///
    /// - Parameter url: The URL to classify.
    /// - Returns: `true` when the extension is listed or the content type conforms to
    ///   one of `contentTypes`; `false` otherwise, including when the content type
    ///   cannot be read.
    static func isSupported(url: URL) -> Bool {
        let ext = url.pathExtension.lowercased()
        guard ext.isEmpty || !extensions.contains(ext) else {
            return true
        }

        guard let detectedType = (try? url.resourceValues(forKeys: [.contentTypeKey]))?.contentType else {
            return false
        }

        for candidate in contentTypes where detectedType.conforms(to: candidate) {
            return true
        }
        return false
    }
}
|
||||
|
||||
|
||||
88
VoiceInk/Services/VoiceActivityDetector.swift
Normal file
88
VoiceInk/Services/VoiceActivityDetector.swift
Normal file
@ -0,0 +1,88 @@
|
||||
import Foundation
|
||||
import AVFoundation
|
||||
import os.log
|
||||
#if canImport(whisper)
|
||||
import whisper
|
||||
#else
|
||||
#error("Unable to import whisper module. Please check your project configuration.")
|
||||
#endif
|
||||
|
||||
// MARK: - C API Bridge
|
||||
|
||||
// Opaque pointers for the C contexts
|
||||
fileprivate typealias WhisperVADContext = OpaquePointer
|
||||
fileprivate typealias WhisperVADSegments = OpaquePointer
|
||||
|
||||
|
||||
// MARK: - VoiceActivityDetector Class
|
||||
|
||||
/// Swift wrapper around the whisper VAD (voice activity detection) C API.
///
/// An instance owns one VAD context: it is created in `init?(modelPath:)` and
/// released with `whisper_vad_free` in `deinit`.
class VoiceActivityDetector {
    private var vadContext: WhisperVADContext
    private let logger = Logger(subsystem: "com.voiceink.app", category: "VoiceActivityDetector")

    /// Creates a detector backed by the VAD model stored at `modelPath`.
    ///
    /// - Parameter modelPath: File-system path of the VAD model.
    /// - Returns: `nil` when the C library fails to create a context.
    init?(modelPath: String) {
        var contextParams = whisper_vad_default_context_params()
        // Leave two cores free, but always use between 1 and 8 threads.
        contextParams.n_threads = max(1, min(8, Int32(ProcessInfo.processInfo.processorCount) - 2))

        let contextOpt: WhisperVADContext? = modelPath.withCString { cPath in
            whisper_vad_init_from_file_with_params(cPath, contextParams)
        }

        guard let context = contextOpt else {
            logger.error("Failed to initialize VAD context.")
            return nil
        }
        self.vadContext = context
        logger.notice("VAD context initialized successfully.")
    }

    deinit {
        whisper_vad_free(vadContext)
        logger.notice("VAD context freed.")
    }

    /// Processes audio samples to detect speech segments.
    ///
    /// - Parameter audioSamples: The audio samples to analyse.
    /// - Returns: One `(start, end)` tuple per detected segment, or an empty array when
    ///   the input is empty, too large for the C API, or detection fails.
    func process(audioSamples: [Float]) -> [(start: TimeInterval, end: TimeInterval)] {
        // An empty array may expose a nil base address, so never hand it to the C API.
        guard !audioSamples.isEmpty else {
            logger.notice("No audio samples provided; skipping VAD.")
            return []
        }

        // The C API takes a 32-bit count; refuse instead of trapping on overflow.
        guard let sampleCount = Int32(exactly: audioSamples.count) else {
            logger.error("Audio buffer too large for VAD: \(audioSamples.count) samples.")
            return []
        }

        // 1. Detect speech and get probabilities internally in the context
        let success = audioSamples.withUnsafeBufferPointer { buffer -> Bool in
            guard let baseAddress = buffer.baseAddress else { return false }
            return whisper_vad_detect_speech(vadContext, baseAddress, sampleCount)
        }

        guard success else {
            logger.error("Failed to detect speech probabilities.")
            return []
        }

        // 2. Get segments from probabilities
        var vadParams = whisper_vad_default_params()
        vadParams.threshold = 0.45
        vadParams.min_speech_duration_ms = 150
        vadParams.min_silence_duration_ms = 750
        vadParams.max_speech_duration_s = Float.greatestFiniteMagnitude // Largest Float value, i.e. no maximum duration
        vadParams.speech_pad_ms = 100
        vadParams.samples_overlap = 0.1

        guard let segments = whisper_vad_segments_from_probs(vadContext, vadParams) else {
            logger.error("Failed to get VAD segments from probabilities.")
            return []
        }
        defer {
            // Ensure segments are freed on every exit path
            whisper_vad_free_segments(segments)
        }

        let nSegments = whisper_vad_segments_n_segments(segments)
        logger.notice("Detected \(nSegments) speech segments.")

        var speechSegments: [(start: TimeInterval, end: TimeInterval)] = []
        for i in 0..<nSegments {
            // The C timestamps are divided by 100 to obtain seconds.
            // NOTE(review): presumably the library reports centiseconds — confirm against the bundled whisper version.
            let startTimeSec = whisper_vad_segments_get_segment_t0(segments, i) / 100.0
            let endTimeSec = whisper_vad_segments_get_segment_t1(segments, i) / 100.0
            speechSegments.append((start: TimeInterval(startTimeSec), end: TimeInterval(endTimeSec)))
        }

        logger.notice("Returning \(speechSegments.count) speech segments.")
        return speechSegments
    }
}
|
||||
@ -14,22 +14,31 @@ class WordReplacementService {
|
||||
var modifiedText = text
|
||||
|
||||
// Apply replacements (case-insensitive)
|
||||
for (original, replacement) in replacements {
|
||||
let isPhrase = original.contains(" ") || original.trimmingCharacters(in: .whitespacesAndNewlines) != original
|
||||
for (originalGroup, replacement) in replacements {
|
||||
// Split comma-separated originals at apply time only
|
||||
let variants = originalGroup
|
||||
.split(separator: ",")
|
||||
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
|
||||
.filter { !$0.isEmpty }
|
||||
|
||||
if isPhrase || !usesWordBoundaries(for: original) {
|
||||
modifiedText = modifiedText.replacingOccurrences(of: original, with: replacement, options: .caseInsensitive)
|
||||
} else {
|
||||
// Use word boundaries for spaced languages
|
||||
let pattern = "\\b\(NSRegularExpression.escapedPattern(for: original))\\b"
|
||||
if let regex = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) {
|
||||
let range = NSRange(modifiedText.startIndex..., in: modifiedText)
|
||||
modifiedText = regex.stringByReplacingMatches(
|
||||
in: modifiedText,
|
||||
options: [],
|
||||
range: range,
|
||||
withTemplate: replacement
|
||||
)
|
||||
for original in variants {
|
||||
let usesBoundaries = usesWordBoundaries(for: original)
|
||||
|
||||
if usesBoundaries {
|
||||
// Word-boundary regex for full original string
|
||||
let pattern = "\\b\(NSRegularExpression.escapedPattern(for: original))\\b"
|
||||
if let regex = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) {
|
||||
let range = NSRange(modifiedText.startIndex..., in: modifiedText)
|
||||
modifiedText = regex.stringByReplacingMatches(
|
||||
in: modifiedText,
|
||||
options: [],
|
||||
range: range,
|
||||
withTemplate: replacement
|
||||
)
|
||||
}
|
||||
} else {
|
||||
// Fallback substring replace for non-spaced scripts
|
||||
modifiedText = modifiedText.replacingOccurrences(of: original, with: replacement, options: .caseInsensitive)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -112,6 +112,12 @@ struct AudioTranscribeView: View {
|
||||
Text(errorMessage)
|
||||
}
|
||||
}
|
||||
.onReceive(NotificationCenter.default.publisher(for: .openFileForTranscription)) { notification in
|
||||
if let url = notification.userInfo?["url"] as? URL {
|
||||
// Do not auto-start; only select file for manual transcription
|
||||
validateAndSetAudioFile(url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var dropZoneView: some View {
|
||||
@ -347,29 +353,8 @@ struct AudioTranscribeView: View {
|
||||
}
|
||||
}
|
||||
|
||||
// Validate file type by extension
|
||||
let supportedExtensions = ["wav", "mp3", "m4a", "aiff", "mp4", "mov", "aac", "flac", "caf"]
|
||||
let fileExtension = url.pathExtension.lowercased()
|
||||
|
||||
// Check file extension first
|
||||
if !fileExtension.isEmpty && supportedExtensions.contains(fileExtension) {
|
||||
print("File type validated by extension: \(fileExtension)")
|
||||
} else {
|
||||
print("Unsupported file extension: \(fileExtension)")
|
||||
// Try to validate by UTType as well
|
||||
if let resourceValues = try? url.resourceValues(forKeys: [.contentTypeKey]),
|
||||
let contentType = resourceValues.contentType {
|
||||
if contentType.conforms(to: .audio) || contentType.conforms(to: .movie) {
|
||||
print("File type validated by UTType: \(contentType.identifier)")
|
||||
} else {
|
||||
print("File does not conform to audio or movie type: \(contentType.identifier)")
|
||||
return
|
||||
}
|
||||
} else {
|
||||
print("Could not validate file type")
|
||||
return
|
||||
}
|
||||
}
|
||||
// Validate file type
|
||||
guard SupportedMedia.isSupported(url: url) else { return }
|
||||
|
||||
print("File validated successfully: \(url.lastPathComponent)")
|
||||
selectedAudioURL = url
|
||||
@ -381,4 +366,4 @@ struct AudioTranscribeView: View {
|
||||
let seconds = Int(duration) % 60
|
||||
return String(format: "%d:%02d", minutes, seconds)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -165,6 +165,7 @@ struct ContentView: View {
|
||||
let appVersion = Bundle.main.infoDictionary?["CFBundleShortVersionString"] as? String ?? "1.0.0"
|
||||
@StateObject private var licenseViewModel = LicenseViewModel()
|
||||
|
||||
|
||||
private var isSetupComplete: Bool {
|
||||
hasLoadedData &&
|
||||
whisperState.currentTranscriptionModel != nil &&
|
||||
@ -192,6 +193,7 @@ struct ContentView: View {
|
||||
.onAppear {
|
||||
hasLoadedData = true
|
||||
}
|
||||
// inside ContentView body:
|
||||
.onReceive(NotificationCenter.default.publisher(for: .navigateToDestination)) { notification in
|
||||
print("ContentView: Received navigation notification")
|
||||
if let destination = notification.userInfo?["destination"] as? String {
|
||||
@ -215,6 +217,10 @@ struct ContentView: View {
|
||||
case "Enhancement":
|
||||
print("ContentView: Navigating to Enhancement")
|
||||
selectedView = .enhancement
|
||||
case "Transcribe Audio":
|
||||
// Ensure we switch to the Transcribe Audio view in-place
|
||||
print("ContentView: Navigating to Transcribe Audio")
|
||||
selectedView = .transcribeAudio
|
||||
default:
|
||||
print("ContentView: No matching destination found for: \(destination)")
|
||||
break
|
||||
@ -259,3 +265,5 @@ struct ContentView: View {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,8 +1,5 @@
|
||||
import SwiftUI
|
||||
|
||||
/// A reusable sheet for editing an existing word replacement entry.
|
||||
/// Mirrors the UI of `AddReplacementSheet` for consistency while pre-populating
|
||||
/// the fields with the existing values.
|
||||
// Edit existing word replacement entry
|
||||
struct EditReplacementSheet: View {
|
||||
@ObservedObject var manager: WordReplacementManager
|
||||
let originalKey: String
|
||||
@ -26,7 +23,7 @@ struct EditReplacementSheet: View {
|
||||
Divider()
|
||||
formContent
|
||||
}
|
||||
.frame(width: 460, height: 480)
|
||||
.frame(width: 460, height: 560)
|
||||
}
|
||||
|
||||
// MARK: – Subviews
|
||||
@ -65,7 +62,7 @@ struct EditReplacementSheet: View {
|
||||
}
|
||||
|
||||
private var descriptionSection: some View {
|
||||
Text("Update the word or phrase that should be automatically replaced during AI enhancement.")
|
||||
Text("Update the word or phrase that should be automatically replaced.")
|
||||
.font(.subheadline)
|
||||
.foregroundColor(.secondary)
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
@ -84,8 +81,9 @@ struct EditReplacementSheet: View {
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
}
|
||||
TextField("Enter word or phrase to replace", text: $originalWord)
|
||||
TextField("Enter word or phrase to replace (use commas for multiple)", text: $originalWord)
|
||||
.textFieldStyle(.roundedBorder)
|
||||
|
||||
}
|
||||
.padding(.horizontal)
|
||||
|
||||
@ -117,7 +115,12 @@ struct EditReplacementSheet: View {
|
||||
private func saveChanges() {
|
||||
let newOriginal = originalWord.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
let newReplacement = replacementWord.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
guard !newOriginal.isEmpty, !newReplacement.isEmpty else { return }
|
||||
// Ensure at least one non-empty token
|
||||
let tokens = newOriginal
|
||||
.split(separator: ",")
|
||||
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
|
||||
.filter { !$0.isEmpty }
|
||||
guard !tokens.isEmpty, !newReplacement.isEmpty else { return }
|
||||
|
||||
manager.updateReplacement(oldOriginal: originalKey, newOriginal: newOriginal, newReplacement: newReplacement)
|
||||
dismiss()
|
||||
|
||||
@ -23,7 +23,10 @@ class WordReplacementManager: ObservableObject {
|
||||
}
|
||||
|
||||
/// Adds a replacement rule, overwriting any existing rule stored under the same key.
///
/// The key is stored once, with surrounding whitespace and newlines removed; a
/// comma-separated list of originals is kept as a single entry. A blank `original`
/// is ignored.
///
/// - Parameters:
///   - original: The text (or comma-separated texts) to be replaced.
///   - replacement: The text to substitute.
func addReplacement(original: String, replacement: String) {
    // Validate before writing so an untrimmed or blank key is never stored.
    let trimmed = original.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !trimmed.isEmpty else { return }
    replacements[trimmed] = replacement
}
|
||||
|
||||
func removeReplacement(original: String) {
|
||||
@ -31,12 +34,11 @@ class WordReplacementManager: ObservableObject {
|
||||
}
|
||||
|
||||
/// Replaces the rule stored under `oldOriginal` with a rule keyed by `newOriginal`.
///
/// The new key is stored with surrounding whitespace and newlines removed; a
/// comma-separated list of originals is kept as a single entry. When `newOriginal`
/// is blank nothing changes, so the existing rule is not lost.
///
/// - Parameters:
///   - oldOriginal: The key of the rule being edited.
///   - newOriginal: The new text (or comma-separated texts) to be replaced.
///   - newReplacement: The text to substitute.
func updateReplacement(oldOriginal: String, newOriginal: String, newReplacement: String) {
    // Validate first: removing the old key before this check would delete the rule on blank input.
    let trimmed = newOriginal.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !trimmed.isEmpty else { return }

    replacements.removeValue(forKey: oldOriginal)
    replacements[trimmed] = newReplacement
}
|
||||
}
|
||||
|
||||
@ -142,7 +144,7 @@ struct EmptyStateView: View {
|
||||
Text("No Replacements")
|
||||
.font(.headline)
|
||||
|
||||
Text("Add word replacements to automatically replace text during AI enhancement.")
|
||||
Text("Add word replacements to automatically replace text.")
|
||||
.font(.subheadline)
|
||||
.foregroundColor(.secondary)
|
||||
.multilineTextAlignment(.center)
|
||||
@ -200,7 +202,7 @@ struct AddReplacementSheet: View {
|
||||
ScrollView {
|
||||
VStack(spacing: 20) {
|
||||
// Description
|
||||
Text("Define a word or phrase to be automatically replaced during AI enhancement.")
|
||||
Text("Define a word or phrase to be automatically replaced.")
|
||||
.font(.subheadline)
|
||||
.foregroundColor(.secondary)
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
@ -221,9 +223,12 @@ struct AddReplacementSheet: View {
|
||||
.foregroundColor(.secondary)
|
||||
}
|
||||
|
||||
TextField("Enter word or phrase to replace", text: $originalWord)
|
||||
TextField("Enter word or phrase to replace (use commas for multiple)", text: $originalWord)
|
||||
.textFieldStyle(.roundedBorder)
|
||||
.font(.body)
|
||||
Text("Separate multiple originals with commas, e.g. Voicing, Voice ink, Voiceing")
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
}
|
||||
.padding(.horizontal)
|
||||
|
||||
@ -255,10 +260,11 @@ struct AddReplacementSheet: View {
|
||||
|
||||
// Example Section
|
||||
VStack(alignment: .leading, spacing: 8) {
|
||||
Text("Example")
|
||||
Text("Examples")
|
||||
.font(.subheadline)
|
||||
.foregroundColor(.secondary)
|
||||
|
||||
// Single original -> replacement
|
||||
HStack(spacing: 12) {
|
||||
VStack(alignment: .leading, spacing: 4) {
|
||||
Text("Original:")
|
||||
@ -280,6 +286,34 @@ struct AddReplacementSheet: View {
|
||||
.font(.callout)
|
||||
}
|
||||
}
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
.padding(12)
|
||||
.background(Color(.textBackgroundColor))
|
||||
.cornerRadius(8)
|
||||
|
||||
// Comma-separated originals -> single replacement
|
||||
HStack(spacing: 12) {
|
||||
VStack(alignment: .leading, spacing: 4) {
|
||||
Text("Original:")
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
Text("Voicing, Voice ink, Voiceing")
|
||||
.font(.callout)
|
||||
}
|
||||
|
||||
Image(systemName: "arrow.right")
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
|
||||
VStack(alignment: .leading, spacing: 4) {
|
||||
Text("Replacement:")
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
Text("VoiceInk")
|
||||
.font(.callout)
|
||||
}
|
||||
}
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
.padding(12)
|
||||
.background(Color(.textBackgroundColor))
|
||||
.cornerRadius(8)
|
||||
@ -290,14 +324,19 @@ struct AddReplacementSheet: View {
|
||||
.padding(.vertical)
|
||||
}
|
||||
}
|
||||
.frame(width: 460, height: 480)
|
||||
.frame(width: 460, height: 520)
|
||||
}
|
||||
|
||||
private func addReplacement() {
|
||||
let original = originalWord
|
||||
let replacement = replacementWord
|
||||
|
||||
guard !original.isEmpty && !replacement.isEmpty else { return }
|
||||
// Validate that at least one non-empty token exists
|
||||
let tokens = original
|
||||
.split(separator: ",")
|
||||
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
|
||||
.filter { !$0.isEmpty }
|
||||
guard !tokens.isEmpty && !replacement.isEmpty else { return }
|
||||
|
||||
manager.addReplacement(original: original, replacement: replacement)
|
||||
dismiss()
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import SwiftUI
|
||||
import UniformTypeIdentifiers
|
||||
|
||||
struct EnhancementSettingsView: View {
|
||||
@EnvironmentObject private var enhancementService: AIEnhancementService
|
||||
@ -79,25 +80,22 @@ struct EnhancementSettingsView: View {
|
||||
Text("Enhancement Prompt")
|
||||
.font(.headline)
|
||||
|
||||
// Prompts Section
|
||||
VStack(alignment: .leading, spacing: 12) {
|
||||
PromptSelectionGrid(
|
||||
prompts: enhancementService.allPrompts,
|
||||
selectedPromptId: enhancementService.selectedPromptId,
|
||||
onPromptSelected: { prompt in
|
||||
enhancementService.setActivePrompt(prompt)
|
||||
},
|
||||
onEditPrompt: { prompt in
|
||||
selectedPromptForEdit = prompt
|
||||
},
|
||||
onDeletePrompt: { prompt in
|
||||
enhancementService.deletePrompt(prompt)
|
||||
},
|
||||
onAddNewPrompt: {
|
||||
isEditingPrompt = true
|
||||
}
|
||||
)
|
||||
}
|
||||
// Reorderable prompts grid with drag-and-drop
|
||||
ReorderablePromptGrid(
|
||||
selectedPromptId: enhancementService.selectedPromptId,
|
||||
onPromptSelected: { prompt in
|
||||
enhancementService.setActivePrompt(prompt)
|
||||
},
|
||||
onEditPrompt: { prompt in
|
||||
selectedPromptForEdit = prompt
|
||||
},
|
||||
onDeletePrompt: { prompt in
|
||||
enhancementService.deletePrompt(prompt)
|
||||
},
|
||||
onAddNewPrompt: {
|
||||
isEditingPrompt = true
|
||||
}
|
||||
)
|
||||
}
|
||||
.padding()
|
||||
.background(CardBackground(isSelected: false))
|
||||
@ -115,3 +113,151 @@ struct EnhancementSettingsView: View {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Drag & Drop Reorderable Grid
|
||||
/// Grid of the prompts in `enhancementService.customPrompts` whose tiles can be
/// dragged onto each other (or onto the trailing "add" tile) to reorder them.
private struct ReorderablePromptGrid: View {
    @EnvironmentObject private var enhancementService: AIEnhancementService

    let selectedPromptId: UUID?
    let onPromptSelected: (CustomPrompt) -> Void
    let onEditPrompt: ((CustomPrompt) -> Void)?
    let onDeletePrompt: ((CustomPrompt) -> Void)?
    let onAddNewPrompt: (() -> Void)?

    /// Prompt currently being dragged; set in `onDrag`, cleared by the drop delegates.
    @State private var draggingItem: CustomPrompt?

    var body: some View {
        VStack(alignment: .leading, spacing: 12) {
            if enhancementService.customPrompts.isEmpty {
                Text("No prompts available")
                    .foregroundColor(.secondary)
                    .font(.caption)
            } else {
                promptGrid
                    .padding(.vertical, 12)
                    .padding(.horizontal, 16)

                usageHint
                    .padding(.top, 8)
                    .padding(.horizontal, 16)
            }
        }
    }

    /// Single adaptive column definition used by the grid.
    private var gridColumns: [GridItem] {
        [GridItem(.adaptive(minimum: 80, maximum: 100), spacing: 36)]
    }

    /// One tile per prompt, followed by the optional "add new" tile.
    private var promptGrid: some View {
        LazyVGrid(columns: gridColumns, spacing: 16) {
            ForEach(enhancementService.customPrompts) { prompt in
                tile(for: prompt)
            }

            if let addAction = onAddNewPrompt {
                CustomPrompt.addNewButton {
                    addAction()
                }
                .help("Add new prompt")
                .onDrop(
                    of: [UTType.text],
                    delegate: PromptEndDropDelegate(
                        prompts: $enhancementService.customPrompts,
                        draggingItem: $draggingItem
                    )
                )
            }
        }
    }

    /// Caption row shown beneath the grid.
    private var usageHint: some View {
        HStack {
            Image(systemName: "info.circle")
                .font(.caption)
                .foregroundColor(.secondary)

            Text("Double-click to edit • Right-click for more options")
                .font(.caption)
                .foregroundColor(.secondary)
        }
    }

    /// Builds the draggable, droppable tile for a single prompt.
    private func tile(for prompt: CustomPrompt) -> some View {
        let isBeingDragged = draggingItem?.id == prompt.id
        // While a drag is in flight, every tile except the dragged one gets a faint outline.
        let outlineColor = (draggingItem != nil && !isBeingDragged)
            ? Color.accentColor.opacity(0.25)
            : Color.clear

        return prompt.promptIcon(
            isSelected: selectedPromptId == prompt.id,
            onTap: {
                withAnimation(.spring(response: 0.3, dampingFraction: 0.7)) {
                    onPromptSelected(prompt)
                }
            },
            onEdit: onEditPrompt,
            onDelete: onDeletePrompt
        )
        .opacity(isBeingDragged ? 0.3 : 1.0)
        .scaleEffect(isBeingDragged ? 1.05 : 1.0)
        .overlay(
            RoundedRectangle(cornerRadius: 14)
                .stroke(outlineColor, lineWidth: 1)
        )
        .animation(.easeInOut(duration: 0.15), value: isBeingDragged)
        .onDrag {
            draggingItem = prompt
            return NSItemProvider(object: prompt.id.uuidString as NSString)
        }
        .onDrop(
            of: [UTType.text],
            delegate: PromptDropDelegate(
                item: prompt,
                prompts: $enhancementService.customPrompts,
                draggingItem: $draggingItem
            )
        )
    }
}
|
||||
|
||||
// MARK: - Drop Delegates
|
||||
/// Drop delegate attached to one prompt tile: when a dragged prompt enters the
/// tile, the dragged prompt is moved to that tile's position in `prompts`.
private struct PromptDropDelegate: DropDelegate {
    /// Prompt represented by the tile this delegate is attached to.
    let item: CustomPrompt
    @Binding var prompts: [CustomPrompt]
    @Binding var draggingItem: CustomPrompt?

    /// Reorders `prompts` as soon as the drag hovers over this tile.
    func dropEntered(info: DropInfo) {
        guard
            let dragged = draggingItem,
            dragged != item,
            let sourceIndex = prompts.firstIndex(of: dragged),
            let targetIndex = prompts.firstIndex(of: item),
            prompts[targetIndex].id != dragged.id
        else { return }

        // `toOffset` is an insertion index in the pre-move array, so moving
        // forward has to aim one slot past the target to land after it.
        let destination = targetIndex > sourceIndex ? targetIndex + 1 : targetIndex

        // Move item as you hover for immediate visual update
        withAnimation(.easeInOut(duration: 0.12)) {
            prompts.move(fromOffsets: IndexSet(integer: sourceIndex), toOffset: destination)
        }
    }

    /// Always proposes a move operation.
    func dropUpdated(info: DropInfo) -> DropProposal? {
        return DropProposal(operation: .move)
    }

    /// The reorder already happened in `dropEntered`; only the drag state is cleared here.
    func performDrop(info: DropInfo) -> Bool {
        draggingItem = nil
        return true
    }
}
|
||||
|
||||
/// Drop delegate for the trailing "Add New" tile: dropping a dragged prompt on
/// it moves that prompt to the end of `prompts`.
private struct PromptEndDropDelegate: DropDelegate {
    @Binding var prompts: [CustomPrompt]
    @Binding var draggingItem: CustomPrompt?

    /// Accepts every drop.
    func validateDrop(info: DropInfo) -> Bool {
        return true
    }

    /// Always proposes a move operation.
    func dropUpdated(info: DropInfo) -> DropProposal? {
        return DropProposal(operation: .move)
    }

    /// Moves the dragged prompt to the end of the list.
    /// - Returns: `false` when no drag is in flight or the dragged prompt is not in `prompts`.
    func performDrop(info: DropInfo) -> Bool {
        // The drag state is cleared on every exit path, after any move has been applied.
        defer { self.draggingItem = nil }

        guard let dragged = self.draggingItem,
              let sourceIndex = prompts.firstIndex(of: dragged) else {
            return false
        }

        // Move to end if dropped on the trailing "Add New" tile
        withAnimation(.easeInOut(duration: 0.12)) {
            prompts.move(fromOffsets: IndexSet(integer: sourceIndex), toOffset: prompts.endIndex)
        }
        return true
    }
}
|
||||
|
||||
@ -99,7 +99,7 @@ struct ModelSettingsView: View {
|
||||
|
||||
InfoTip(
|
||||
title: "Voice Activity Detection",
|
||||
message: "Detects speech segments and filters out silence to reduce hallucinations in local Whisper models."
|
||||
message: "Detect speech segments and filter out silence to improve accuracy of local models."
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@ -37,7 +37,7 @@ struct ExperimentalFeaturesSection: View {
|
||||
|
||||
if isExperimentalFeaturesEnabled {
|
||||
Toggle(isOn: $playbackController.isPauseMediaEnabled) {
|
||||
Text("Pause Media on Playback")
|
||||
Text("Pause Media during recording")
|
||||
}
|
||||
.toggleStyle(.switch)
|
||||
.help("Automatically pause active media playback during recordings and resume afterward.")
|
||||
|
||||
@ -130,6 +130,8 @@ struct SettingsView: View {
|
||||
|
||||
Divider()
|
||||
|
||||
|
||||
|
||||
// Custom Cancel Shortcut
|
||||
VStack(alignment: .leading, spacing: 12) {
|
||||
HStack(spacing: 8) {
|
||||
|
||||
@ -114,6 +114,15 @@ struct VoiceInkApp: App {
|
||||
if !UserDefaults.standard.bool(forKey: "IsTranscriptionCleanupEnabled") {
|
||||
audioCleanupManager.startAutomaticCleanup(modelContext: container.mainContext)
|
||||
}
|
||||
|
||||
// Process any pending open-file request now that the main ContentView is ready.
|
||||
if let pendingURL = appDelegate.pendingOpenFileURL {
|
||||
NotificationCenter.default.post(name: .navigateToDestination, object: nil, userInfo: ["destination": "Transcribe Audio"])
|
||||
DispatchQueue.main.asyncAfter(deadline: .now() + 0.3) {
|
||||
NotificationCenter.default.post(name: .openFileForTranscription, object: nil, userInfo: ["url": pendingURL])
|
||||
}
|
||||
appDelegate.pendingOpenFileURL = nil
|
||||
}
|
||||
}
|
||||
.background(WindowAccessor { window in
|
||||
WindowManager.shared.configureWindow(window)
|
||||
|
||||
@ -4,49 +4,48 @@ import os
|
||||
/// Strips bracketed annotations and spoken filler words from transcription text.
struct WhisperHallucinationFilter {
    private static let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "WhisperHallucinationFilter")

    /// Non-greedy patterns for bracketed spans, which are removed wholesale.
    private static let hallucinationPatterns = [
        #"\[.*?\]"#,  // Square brackets
        #"\(.*?\)"#,  // Parentheses
        #"\{.*?\}"#   // Curly braces
    ]

    private static let fillerWords = [
        "uh", "um", "uhm", "umm", "uhh", "uhhh", "er", "ah", "eh",
        "hmm", "hm", "h", "m", "mmm", "mm", "mh", "ha", "ehh"
    ]

    /// Bracket regexes, compiled once instead of on every `filter(_:)` call.
    private static let hallucinationRegexes: [NSRegularExpression] =
        hallucinationPatterns.compactMap { Self.makeRegex($0) }

    /// One case-insensitive alternation over all filler words, optionally
    /// followed by a comma or period.
    ///
    /// Lookarounds are used instead of `\b` because `\b` treats an apostrophe as
    /// a boundary, which let the filler "m" match inside "I'm" and leave "I'".
    /// Here a filler only matches when it is not adjacent to a word character or
    /// an apostrophe (straight or curly).
    private static let fillerRegex: NSRegularExpression? = {
        let alternatives = fillerWords
            .map { NSRegularExpression.escapedPattern(for: $0) }
            .joined(separator: "|")
        let pattern = #"(?<![\w'’])(?:"# + alternatives + #")(?![\w'’])[,.]?"#
        return Self.makeRegex(pattern, options: .caseInsensitive)
    }()

    /// Compiles `pattern`, logging (rather than silently ignoring) a failure.
    private static func makeRegex(
        _ pattern: String,
        options: NSRegularExpression.Options = []
    ) -> NSRegularExpression? {
        do {
            return try NSRegularExpression(pattern: pattern, options: options)
        } catch {
            logger.error("Invalid filter pattern \(pattern, privacy: .public): \(error.localizedDescription, privacy: .public)")
            return nil
        }
    }

    /// Removes hallucinations from transcription text using pattern matching
    /// - Parameter text: Original transcription text from Whisper
    /// - Returns: Filtered text with hallucinations removed
    static func filter(_ text: String) -> String {
        logger.notice("🧹 Filtering hallucinations and filler words")

        var filteredText = text

        // Remove bracketed hallucinations
        for regex in hallucinationRegexes {
            let range = NSRange(filteredText.startIndex..., in: filteredText)
            filteredText = regex.stringByReplacingMatches(in: filteredText, options: [], range: range, withTemplate: "")
        }

        // Remove filler words
        if let fillerRegex = fillerRegex {
            let range = NSRange(filteredText.startIndex..., in: filteredText)
            filteredText = fillerRegex.stringByReplacingMatches(in: filteredText, options: [], range: range, withTemplate: "")
        }

        // Clean whitespace
        filteredText = filteredText.replacingOccurrences(of: #"\s{2,}"#, with: " ", options: .regularExpression)
        filteredText = filteredText.trimmingCharacters(in: .whitespacesAndNewlines)

        // Log results
        if filteredText != text {
            logger.notice("✅ Removed hallucinations and filler words")
        } else {
            logger.notice("✅ No hallucinations or filler words found")
        }

        return filteredText
    }
}
|
||||
Loading…
x
Reference in New Issue
Block a user