diff --git a/VoiceInk/Models/PredefinedModels.swift b/VoiceInk/Models/PredefinedModels.swift index 2b49a32..94f3376 100644 --- a/VoiceInk/Models/PredefinedModels.swift +++ b/VoiceInk/Models/PredefinedModels.swift @@ -35,11 +35,78 @@ import Foundation } } + // Apple Native Speech specific languages with proper BCP-47 format + // Based on actual supported locales from SpeechTranscriber.supportedLocales + static let appleNativeLanguages = [ + // English variants + "en-US": "English (United States)", + "en-GB": "English (United Kingdom)", + "en-CA": "English (Canada)", + "en-AU": "English (Australia)", + "en-IN": "English (India)", + "en-IE": "English (Ireland)", + "en-NZ": "English (New Zealand)", + "en-ZA": "English (South Africa)", + "en-SA": "English (Saudi Arabia)", + "en-AE": "English (UAE)", + "en-SG": "English (Singapore)", + "en-PH": "English (Philippines)", + "en-ID": "English (Indonesia)", + + // Spanish variants + "es-ES": "Spanish (Spain)", + "es-MX": "Spanish (Mexico)", + "es-US": "Spanish (United States)", + "es-CO": "Spanish (Colombia)", + "es-CL": "Spanish (Chile)", + "es-419": "Spanish (Latin America)", + + // French variants + "fr-FR": "French (France)", + "fr-CA": "French (Canada)", + "fr-BE": "French (Belgium)", + "fr-CH": "French (Switzerland)", + + // German variants + "de-DE": "German (Germany)", + "de-AT": "German (Austria)", + "de-CH": "German (Switzerland)", + + // Chinese variants + "zh-CN": "Chinese Simplified (China)", + "zh-TW": "Chinese Traditional (Taiwan)", + "zh-HK": "Chinese Traditional (Hong Kong)", + + // Other Asian languages + "ja-JP": "Japanese (Japan)", + "ko-KR": "Korean (South Korea)", + "yue-CN": "Cantonese (China)", + + // Portuguese variants + "pt-BR": "Portuguese (Brazil)", + "pt-PT": "Portuguese (Portugal)", + + // Italian variants + "it-IT": "Italian (Italy)", + "it-CH": "Italian (Switzerland)", + + // Arabic + "ar-SA": "Arabic (Saudi Arabia)" + ] + static var models: [any TranscriptionModel] { return predefinedModels + 
CustomModelManager.shared.customModels } private static let predefinedModels: [any TranscriptionModel] = [ + // Native Apple Model + NativeAppleModel( + name: "apple-speech", + displayName: "Apple Speech", + description: "Uses the native Apple Speech framework for transcription. Requires macOS 26 or later.", + isMultilingualModel: true, + supportedLanguages: appleNativeLanguages + ), // Local Models LocalModel( name: "ggml-tiny", diff --git a/VoiceInk/Models/TranscriptionModel.swift b/VoiceInk/Models/TranscriptionModel.swift index 3a36c92..dae0816 100644 --- a/VoiceInk/Models/TranscriptionModel.swift +++ b/VoiceInk/Models/TranscriptionModel.swift @@ -7,6 +7,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable { case elevenLabs = "ElevenLabs" case deepgram = "Deepgram" case custom = "Custom" + case nativeApple = "Native Apple" // Future providers can be added here } @@ -33,6 +34,17 @@ extension TranscriptionModel { } } +// A new struct for Apple's native models +struct NativeAppleModel: TranscriptionModel { + let id = UUID() + let name: String + let displayName: String + let description: String + let provider: ModelProvider = .nativeApple + let isMultilingualModel: Bool + let supportedLanguages: [String: String] +} + // A new struct for cloud models struct CloudModel: TranscriptionModel { let id: UUID diff --git a/VoiceInk/Services/AudioFileTranscriptionManager.swift b/VoiceInk/Services/AudioFileTranscriptionManager.swift index 283388e..aff0bca 100644 --- a/VoiceInk/Services/AudioFileTranscriptionManager.swift +++ b/VoiceInk/Services/AudioFileTranscriptionManager.swift @@ -21,6 +21,7 @@ class AudioTranscriptionManager: ObservableObject { // Transcription services - will be initialized when needed private var localTranscriptionService: LocalTranscriptionService? 
private let cloudTranscriptionService = CloudTranscriptionService() + private let nativeAppleTranscriptionService = NativeAppleTranscriptionService() enum ProcessingPhase { case idle @@ -93,9 +94,12 @@ class AudioTranscriptionManager: ObservableObject { processingPhase = .transcribing var text: String - if currentModel.provider == .local { + switch currentModel.provider { + case .local: text = try await localTranscriptionService!.transcribe(audioURL: permanentURL, model: currentModel) - } else { + case .nativeApple: + text = try await nativeAppleTranscriptionService.transcribe(audioURL: permanentURL, model: currentModel) + default: // Cloud models text = try await cloudTranscriptionService.transcribe(audioURL: permanentURL, model: currentModel) } diff --git a/VoiceInk/Services/AudioFileTranscriptionService.swift b/VoiceInk/Services/AudioFileTranscriptionService.swift index 759cbf6..5bfb041 100644 --- a/VoiceInk/Services/AudioFileTranscriptionService.swift +++ b/VoiceInk/Services/AudioFileTranscriptionService.swift @@ -18,6 +18,7 @@ class AudioTranscriptionService: ObservableObject { // Transcription services private let localTranscriptionService: LocalTranscriptionService private let cloudTranscriptionService = CloudTranscriptionService() + private let nativeAppleTranscriptionService = NativeAppleTranscriptionService() enum TranscriptionError: Error { case noAudioFile @@ -47,11 +48,16 @@ class AudioTranscriptionService: ObservableObject { // Delegate transcription to appropriate service var text: String - if model.provider == .local { + switch model.provider { + case .local: messageLog += "Using local transcription service...\n" text = try await localTranscriptionService.transcribe(audioURL: url, model: model) messageLog += "Local transcription completed.\n" - } else { + case .nativeApple: + messageLog += "Using Native Apple transcription service...\n" + text = try await nativeAppleTranscriptionService.transcribe(audioURL: url, model: model) + messageLog += "Native 
Apple transcription completed.\n" + default: // Cloud models messageLog += "Using cloud transcription service...\n" text = try await cloudTranscriptionService.transcribe(audioURL: url, model: model) messageLog += "Cloud transcription completed.\n" diff --git a/VoiceInk/Services/NativeAppleTranscriptionService.swift b/VoiceInk/Services/NativeAppleTranscriptionService.swift new file mode 100644 index 0000000..62d4c85 --- /dev/null +++ b/VoiceInk/Services/NativeAppleTranscriptionService.swift @@ -0,0 +1,139 @@ +import Foundation +import AVFoundation +import os + +#if canImport(Speech) +import Speech +#endif + +/// Transcription service that leverages the new SpeechAnalyzer / SpeechTranscriber API available on macOS 26 (Tahoe). +/// Falls back with an unsupported-provider error on earlier OS versions so the application can gracefully degrade. +class NativeAppleTranscriptionService: TranscriptionService { + private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "NativeAppleTranscriptionService") + + enum ServiceError: Error, LocalizedError { + case unsupportedOS + case transcriptionFailed + case localeNotSupported + case invalidModel + + var errorDescription: String? { + switch self { + case .unsupportedOS: + return "SpeechAnalyzer requires macOS 26 or later." + case .transcriptionFailed: + return "Transcription failed using SpeechAnalyzer." + case .localeNotSupported: + return "The selected language is not supported by SpeechAnalyzer." + case .invalidModel: + return "Invalid model type provided for Native Apple transcription." 
+ } + } + } + + func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { + guard model is NativeAppleModel else { + throw ServiceError.invalidModel + } + + guard #available(macOS 26, *) else { + logger.error("SpeechAnalyzer is not available on this macOS version") + throw ServiceError.unsupportedOS + } + + #if canImport(Speech) + logger.notice("Starting Apple native transcription with SpeechAnalyzer.") + + let audioFile = try AVAudioFile(forReading: audioURL) + + // Use the user's selected language directly, assuming BCP-47 format. + let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "en-US" + let locale = Locale(identifier: selectedLanguage) + + // Check for locale support and asset installation status. + let supportedLocales = await SpeechTranscriber.supportedLocales + let installedLocales = await SpeechTranscriber.installedLocales + let isLocaleSupported = supportedLocales.contains(locale) + let isLocaleInstalled = installedLocales.contains(locale) + + // Create the detailed log message + let supportedIdentifiers = supportedLocales.map { $0.identifier }.sorted().joined(separator: ", ") + let installedIdentifiers = installedLocales.map { $0.identifier }.sorted().joined(separator: ", ") + let availableForDownload = Set(supportedLocales).subtracting(Set(installedLocales)).map { $0.identifier }.sorted().joined(separator: ", ") + + var statusMessage: String + if isLocaleInstalled { + statusMessage = "✅ Installed" + } else if isLocaleSupported { + statusMessage = "❌ Not Installed (Available for download)" + } else { + statusMessage = "❌ Not Supported" + } + + let logMessage = """ + + --- Native Speech Transcription --- + Locale: '\(locale.identifier)' + Status: \(statusMessage) + ------------------------------------ + Supported Locales: [\(supportedIdentifiers)] + Installed Locales: [\(installedIdentifiers)] + Available for Download: [\(availableForDownload)] + ------------------------------------ + """ + 
logger.notice("\(logMessage)") + + guard isLocaleSupported else { + logger.error("Transcription failed: Locale '\(locale.identifier)' is not supported by SpeechTranscriber.") + throw ServiceError.localeNotSupported + } + + let transcriber = SpeechTranscriber( + locale: locale, + transcriptionOptions: [], + reportingOptions: [], + attributeOptions: [] + ) + + // Ensure model assets are available, triggering a system download prompt if necessary. + try await ensureModelIsAvailable(for: transcriber, locale: locale) + + let analyzer = SpeechAnalyzer(modules: [transcriber]) + + try await analyzer.start(inputAudioFile: audioFile, finishAfterFile: true) + + var transcript: AttributedString = "" + for try await result in transcriber.results { + transcript += result.text + } + + let finalTranscription = String(transcript.characters).trimmingCharacters(in: .whitespacesAndNewlines) + + logger.notice("Native transcription successful. Length: \(finalTranscription.count) characters.") + return finalTranscription + + #else + logger.error("Speech framework is not available") + throw ServiceError.unsupportedOS + #endif + } + + @available(macOS 26, *) + private func ensureModelIsAvailable(for transcriber: SpeechTranscriber, locale: Locale) async throws { + #if canImport(Speech) + let isInstalled = await SpeechTranscriber.installedLocales.contains(locale) + + if !isInstalled { + logger.notice("Assets for '\(locale.identifier)' not installed. Requesting system download.") + + if let request = try await AssetInventory.assetInstallationRequest(supporting: [transcriber]) { + try await request.downloadAndInstall() + logger.notice("Asset download for '\(locale.identifier)' complete.") + } else { + logger.error("Asset download for '\(locale.identifier)' failed: Could not create installation request.") + // Note: We don't throw an error here, as transcription might still work with a base model. 
+ } + } + #endif + } +} diff --git a/VoiceInk/Views/ModelCardRowView.swift b/VoiceInk/Views/ModelCardRowView.swift index f50f2d7..67a2b11 100644 --- a/VoiceInk/Views/ModelCardRowView.swift +++ b/VoiceInk/Views/ModelCardRowView.swift @@ -30,6 +30,14 @@ struct ModelCardRowView: View { downloadAction: downloadAction ) } + case .nativeApple: + if let nativeAppleModel = model as? NativeAppleModel { + NativeAppleModelCardView( + model: nativeAppleModel, + isCurrent: isCurrent, + setDefaultAction: setDefaultAction + ) + } case .groq, .elevenLabs, .deepgram: if let cloudModel = model as? CloudModel { CloudModelCardView( @@ -715,4 +723,115 @@ struct CustomModelCardView: View { .frame(width: 20, height: 20) } } +} + +// MARK: - Native Apple Model Card View +struct NativeAppleModelCardView: View { + let model: NativeAppleModel + let isCurrent: Bool + var setDefaultAction: () -> Void + + var body: some View { + HStack(alignment: .top, spacing: 16) { + // Main Content + VStack(alignment: .leading, spacing: 6) { + headerSection + metadataSection + descriptionSection + } + .frame(maxWidth: .infinity, alignment: .leading) + + // Action Controls + actionSection + } + .padding(16) + .background(CardBackground(isSelected: isCurrent, useAccentGradientWhenSelected: isCurrent)) + } + + private var headerSection: some View { + HStack(alignment: .firstTextBaseline) { + Text(model.displayName) + .font(.system(size: 13, weight: .semibold)) + .foregroundColor(Color(.labelColor)) + + statusBadge + + Spacer() + } + } + + private var statusBadge: some View { + Group { + if isCurrent { + Text("Default") + .font(.system(size: 11, weight: .medium)) + .padding(.horizontal, 6) + .padding(.vertical, 2) + .background(Capsule().fill(Color.accentColor)) + .foregroundColor(.white) + } else { + Text("Built-in") + .font(.system(size: 11, weight: .medium)) + .padding(.horizontal, 6) + .padding(.vertical, 2) + .background(Capsule().fill(Color.blue.opacity(0.2))) + .foregroundColor(Color.blue) + } + } + } + + 
private var metadataSection: some View { + HStack(spacing: 12) { + // Native Apple + Label("Native Apple", systemImage: "apple.logo") + .font(.system(size: 11)) + .foregroundColor(Color(.secondaryLabelColor)) + .lineLimit(1) + + // Language + Label(model.language, systemImage: "globe") + .font(.system(size: 11)) + .foregroundColor(Color(.secondaryLabelColor)) + .lineLimit(1) + + // On-Device + Label("On-Device", systemImage: "checkmark.shield") + .font(.system(size: 11)) + .foregroundColor(Color(.secondaryLabelColor)) + .lineLimit(1) + + // Requires macOS 26+ + Label("macOS 26+", systemImage: "macbook") + .font(.system(size: 11)) + .foregroundColor(Color(.secondaryLabelColor)) + .lineLimit(1) + } + .lineLimit(1) + } + + private var descriptionSection: some View { + Text(model.description) + .font(.system(size: 11)) + .foregroundColor(Color(.secondaryLabelColor)) + .lineLimit(2) + .fixedSize(horizontal: false, vertical: true) + .padding(.top, 4) + } + + private var actionSection: some View { + HStack(spacing: 8) { + if isCurrent { + Text("Default Model") + .font(.system(size: 12)) + .foregroundColor(Color(.secondaryLabelColor)) + } else { + Button(action: setDefaultAction) { + Text("Set as Default") + .font(.system(size: 12)) + } + .buttonStyle(.bordered) + .controlSize(.small) + } + } + } } diff --git a/VoiceInk/Whisper/WhisperState+ModelQueries.swift b/VoiceInk/Whisper/WhisperState+ModelQueries.swift index 689b28c..48961c8 100644 --- a/VoiceInk/Whisper/WhisperState+ModelQueries.swift +++ b/VoiceInk/Whisper/WhisperState+ModelQueries.swift @@ -6,6 +6,9 @@ extension WhisperState { switch model.provider { case .local: return availableModels.contains { $0.name == model.name } + case .nativeApple: + // Native Apple models are always available (though they require macOS 26+) + return true case .groq: let key = UserDefaults.standard.string(forKey: "GROQAPIKey") return key != nil && !key!.isEmpty diff --git a/VoiceInk/Whisper/WhisperState.swift 
b/VoiceInk/Whisper/WhisperState.swift index cb1ff03..b56bc19 100644 --- a/VoiceInk/Whisper/WhisperState.swift +++ b/VoiceInk/Whisper/WhisperState.swift @@ -59,6 +59,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate { // Transcription Services private var localTranscriptionService: LocalTranscriptionService private let cloudTranscriptionService = CloudTranscriptionService() + private let nativeAppleTranscriptionService = NativeAppleTranscriptionService() private var modelUrl: URL? { let possibleURLs = [ @@ -294,8 +295,16 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate { throw WhisperStateError.transcriptionFailed } + let transcriptionService: TranscriptionService + switch model.provider { + case .local: + transcriptionService = localTranscriptionService + case .nativeApple: + transcriptionService = nativeAppleTranscriptionService + default: + transcriptionService = cloudTranscriptionService + } - let transcriptionService: TranscriptionService = (model.provider == .local) ? localTranscriptionService : cloudTranscriptionService var text = try await transcriptionService.transcribe(audioURL: url, model: model) text = text.trimmingCharacters(in: .whitespacesAndNewlines)