feat: add native Apple transcription service
This commit is contained in:
parent
b918979e83
commit
d1edb47d87
@ -35,11 +35,78 @@ import Foundation
|
||||
}
|
||||
}
|
||||
|
||||
// Languages supported by Apple's native speech transcription, keyed by
// BCP-47 locale identifier with a human-readable display name as the value.
// Entries mirror the locales reported by SpeechTranscriber.supportedLocales.
static let appleNativeLanguages = [
    // English variants
    "en-US": "English (United States)",
    "en-GB": "English (United Kingdom)",
    "en-CA": "English (Canada)",
    "en-AU": "English (Australia)",
    "en-IN": "English (India)",
    "en-IE": "English (Ireland)",
    "en-NZ": "English (New Zealand)",
    "en-ZA": "English (South Africa)",
    "en-SA": "English (Saudi Arabia)",
    "en-AE": "English (UAE)",
    "en-SG": "English (Singapore)",
    "en-PH": "English (Philippines)",
    "en-ID": "English (Indonesia)",

    // Spanish variants (including the regional "Latin America" locale es-419)
    "es-ES": "Spanish (Spain)",
    "es-MX": "Spanish (Mexico)",
    "es-US": "Spanish (United States)",
    "es-CO": "Spanish (Colombia)",
    "es-CL": "Spanish (Chile)",
    "es-419": "Spanish (Latin America)",

    // French variants
    "fr-FR": "French (France)",
    "fr-CA": "French (Canada)",
    "fr-BE": "French (Belgium)",
    "fr-CH": "French (Switzerland)",

    // German variants
    "de-DE": "German (Germany)",
    "de-AT": "German (Austria)",
    "de-CH": "German (Switzerland)",

    // Chinese variants
    "zh-CN": "Chinese Simplified (China)",
    "zh-TW": "Chinese Traditional (Taiwan)",
    "zh-HK": "Chinese Traditional (Hong Kong)",

    // Other Asian languages
    "ja-JP": "Japanese (Japan)",
    "ko-KR": "Korean (South Korea)",
    "yue-CN": "Cantonese (China)",

    // Portuguese variants
    "pt-BR": "Portuguese (Brazil)",
    "pt-PT": "Portuguese (Portugal)",

    // Italian variants
    "it-IT": "Italian (Italy)",
    "it-CH": "Italian (Switzerland)",

    // Arabic
    "ar-SA": "Arabic (Saudi Arabia)"
]
|
||||
|
||||
static var models: [any TranscriptionModel] {
|
||||
return predefinedModels + CustomModelManager.shared.customModels
|
||||
}
|
||||
|
||||
private static let predefinedModels: [any TranscriptionModel] = [
|
||||
// Native Apple Model
|
||||
NativeAppleModel(
|
||||
name: "apple-speech",
|
||||
displayName: "Apple Speech",
|
||||
description: "Uses the native Apple Speech framework for transcription. Available on macOS Sonoma 14+.",
|
||||
isMultilingualModel: true,
|
||||
supportedLanguages: appleNativeLanguages
|
||||
),
|
||||
// Local Models
|
||||
LocalModel(
|
||||
name: "ggml-tiny",
|
||||
|
||||
@ -7,6 +7,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable {
|
||||
case elevenLabs = "ElevenLabs"
|
||||
case deepgram = "Deepgram"
|
||||
case custom = "Custom"
|
||||
case nativeApple = "Native Apple"
|
||||
// Future providers can be added here
|
||||
}
|
||||
|
||||
@ -33,6 +34,17 @@ extension TranscriptionModel {
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Native Apple model

/// Describes Apple's built-in, on-device transcription model.
/// Conforms to `TranscriptionModel` so it can sit alongside local and
/// cloud models in the shared model list.
struct NativeAppleModel: TranscriptionModel {
    // Fresh identity per instance; not persisted here.
    let id = UUID()
    let name: String
    let displayName: String
    let description: String
    // Fixed provider tag for this model kind.
    let provider: ModelProvider = .nativeApple
    let isMultilingualModel: Bool
    // BCP-47 locale identifier -> human-readable language name.
    let supportedLanguages: [String: String]
}
|
||||
|
||||
// A new struct for cloud models
|
||||
struct CloudModel: TranscriptionModel {
|
||||
let id: UUID
|
||||
|
||||
@ -21,6 +21,7 @@ class AudioTranscriptionManager: ObservableObject {
|
||||
// Transcription services - will be initialized when needed
|
||||
private var localTranscriptionService: LocalTranscriptionService?
|
||||
private let cloudTranscriptionService = CloudTranscriptionService()
|
||||
private let nativeAppleTranscriptionService = NativeAppleTranscriptionService()
|
||||
|
||||
enum ProcessingPhase {
|
||||
case idle
|
||||
@ -93,9 +94,12 @@ class AudioTranscriptionManager: ObservableObject {
|
||||
processingPhase = .transcribing
|
||||
var text: String
|
||||
|
||||
if currentModel.provider == .local {
|
||||
switch currentModel.provider {
|
||||
case .local:
|
||||
text = try await localTranscriptionService!.transcribe(audioURL: permanentURL, model: currentModel)
|
||||
} else {
|
||||
case .nativeApple:
|
||||
text = try await nativeAppleTranscriptionService.transcribe(audioURL: permanentURL, model: currentModel)
|
||||
default: // Cloud models
|
||||
text = try await cloudTranscriptionService.transcribe(audioURL: permanentURL, model: currentModel)
|
||||
}
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ class AudioTranscriptionService: ObservableObject {
|
||||
// Transcription services
|
||||
private let localTranscriptionService: LocalTranscriptionService
|
||||
private let cloudTranscriptionService = CloudTranscriptionService()
|
||||
private let nativeAppleTranscriptionService = NativeAppleTranscriptionService()
|
||||
|
||||
enum TranscriptionError: Error {
|
||||
case noAudioFile
|
||||
@ -47,11 +48,16 @@ class AudioTranscriptionService: ObservableObject {
|
||||
// Delegate transcription to appropriate service
|
||||
var text: String
|
||||
|
||||
if model.provider == .local {
|
||||
switch model.provider {
|
||||
case .local:
|
||||
messageLog += "Using local transcription service...\n"
|
||||
text = try await localTranscriptionService.transcribe(audioURL: url, model: model)
|
||||
messageLog += "Local transcription completed.\n"
|
||||
} else {
|
||||
case .nativeApple:
|
||||
messageLog += "Using Native Apple transcription service...\n"
|
||||
text = try await nativeAppleTranscriptionService.transcribe(audioURL: url, model: model)
|
||||
messageLog += "Native Apple transcription completed.\n"
|
||||
default: // Cloud models
|
||||
messageLog += "Using cloud transcription service...\n"
|
||||
text = try await cloudTranscriptionService.transcribe(audioURL: url, model: model)
|
||||
messageLog += "Cloud transcription completed.\n"
|
||||
|
||||
139
VoiceInk/Services/NativeAppleTranscriptionService.swift
Normal file
139
VoiceInk/Services/NativeAppleTranscriptionService.swift
Normal file
@ -0,0 +1,139 @@
|
||||
import Foundation
|
||||
import AVFoundation
|
||||
import os
|
||||
|
||||
#if canImport(Speech)
|
||||
import Speech
|
||||
#endif
|
||||
|
||||
/// Transcription service that leverages the new SpeechAnalyzer / SpeechTranscriber API available on macOS 26 (Tahoe).
/// Falls back with an unsupported-provider error on earlier OS versions so the application can gracefully degrade.
class NativeAppleTranscriptionService: TranscriptionService {
    private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "NativeAppleTranscriptionService")

    /// Errors surfaced to callers; `LocalizedError` so the UI can show `errorDescription` directly.
    enum ServiceError: Error, LocalizedError {
        case unsupportedOS
        case transcriptionFailed
        case localeNotSupported
        case invalidModel

        var errorDescription: String? {
            switch self {
            case .unsupportedOS:
                return "SpeechAnalyzer requires macOS 26 or later."
            case .transcriptionFailed:
                return "Transcription failed using SpeechAnalyzer."
            case .localeNotSupported:
                return "The selected language is not supported by SpeechAnalyzer."
            case .invalidModel:
                return "Invalid model type provided for Native Apple transcription."
            }
        }
    }

    /// Transcribes the audio file at `audioURL` using the on-device SpeechTranscriber.
    /// - Parameters:
    ///   - audioURL: Location of a readable audio file (opened via `AVAudioFile`).
    ///   - model: Must be a `NativeAppleModel`; any other type throws `.invalidModel`.
    /// - Returns: The whitespace-trimmed transcript text.
    /// - Throws: `ServiceError` for unsupported OS / locale / model, plus any
    ///   errors propagated from AVFoundation or the Speech framework.
    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
        // Only the native Apple model is valid for this service.
        guard model is NativeAppleModel else {
            throw ServiceError.invalidModel
        }

        // SpeechAnalyzer/SpeechTranscriber exist only on macOS 26+; the guard also
        // makes those APIs available to the rest of this scope.
        guard #available(macOS 26, *) else {
            logger.error("SpeechAnalyzer is not available on this macOS version")
            throw ServiceError.unsupportedOS
        }

        #if canImport(Speech)
        logger.notice("Starting Apple native transcription with SpeechAnalyzer.")

        let audioFile = try AVAudioFile(forReading: audioURL)

        // Use the user's selected language directly, assuming BCP-47 format.
        // NOTE(review): "SelectedLanguage" is read from UserDefaults — confirm it is
        // always stored as a BCP-47 identifier by the settings UI.
        let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "en-US"
        let locale = Locale(identifier: selectedLanguage)

        // Check for locale support and asset installation status.
        let supportedLocales = await SpeechTranscriber.supportedLocales
        let installedLocales = await SpeechTranscriber.installedLocales
        let isLocaleSupported = supportedLocales.contains(locale)
        let isLocaleInstalled = installedLocales.contains(locale)

        // Create the detailed log message (sorted identifier lists for readability).
        let supportedIdentifiers = supportedLocales.map { $0.identifier }.sorted().joined(separator: ", ")
        let installedIdentifiers = installedLocales.map { $0.identifier }.sorted().joined(separator: ", ")
        let availableForDownload = Set(supportedLocales).subtracting(Set(installedLocales)).map { $0.identifier }.sorted().joined(separator: ", ")

        var statusMessage: String
        if isLocaleInstalled {
            statusMessage = "✅ Installed"
        } else if isLocaleSupported {
            statusMessage = "❌ Not Installed (Available for download)"
        } else {
            statusMessage = "❌ Not Supported"
        }

        let logMessage = """

        --- Native Speech Transcription ---
        Locale: '\(locale.identifier)'
        Status: \(statusMessage)
        ------------------------------------
        Supported Locales: [\(supportedIdentifiers)]
        Installed Locales: [\(installedIdentifiers)]
        Available for Download: [\(availableForDownload)]
        ------------------------------------
        """
        logger.notice("\(logMessage)")

        // An unsupported locale can never succeed, so fail fast before setup.
        guard isLocaleSupported else {
            logger.error("Transcription failed: Locale '\(locale.identifier)' is not supported by SpeechTranscriber.")
            throw ServiceError.localeNotSupported
        }

        // Empty option sets: plain transcription, no partial-result reporting or
        // attributed-text options requested.
        let transcriber = SpeechTranscriber(
            locale: locale,
            transcriptionOptions: [],
            reportingOptions: [],
            attributeOptions: []
        )

        // Ensure model assets are available, triggering a system download prompt if necessary.
        try await ensureModelIsAvailable(for: transcriber, locale: locale)

        let analyzer = SpeechAnalyzer(modules: [transcriber])

        // finishAfterFile: true — the analyzer ends the session once the file is consumed,
        // which lets the results stream below terminate.
        try await analyzer.start(inputAudioFile: audioFile, finishAfterFile: true)

        // Accumulate every result segment into a single attributed transcript.
        var transcript: AttributedString = ""
        for try await result in transcriber.results {
            transcript += result.text
        }

        let finalTranscription = String(transcript.characters).trimmingCharacters(in: .whitespacesAndNewlines)

        logger.notice("Native transcription successful. Length: \(finalTranscription.count) characters.")
        return finalTranscription

        #else
        // Build target lacks the Speech framework entirely.
        logger.error("Speech framework is not available")
        throw ServiceError.unsupportedOS
        #endif
    }

    /// Downloads the on-device speech assets for `locale` if they are not already
    /// installed. Intentionally does NOT throw on a nil installation request —
    /// transcription may still work with a base model.
    @available(macOS 26, *)
    private func ensureModelIsAvailable(for transcriber: SpeechTranscriber, locale: Locale) async throws {
        #if canImport(Speech)
        let isInstalled = await SpeechTranscriber.installedLocales.contains(locale)

        if !isInstalled {
            logger.notice("Assets for '\(locale.identifier)' not installed. Requesting system download.")

            // The system may present its own download UI; awaiting here blocks until
            // the install finishes (or the request fails to be created).
            if let request = try await AssetInventory.assetInstallationRequest(supporting: [transcriber]) {
                try await request.downloadAndInstall()
                logger.notice("Asset download for '\(locale.identifier)' complete.")
            } else {
                logger.error("Asset download for '\(locale.identifier)' failed: Could not create installation request.")
                // Note: We don't throw an error here, as transcription might still work with a base model.
            }
        }
        #endif
    }
}
|
||||
@ -30,6 +30,14 @@ struct ModelCardRowView: View {
|
||||
downloadAction: downloadAction
|
||||
)
|
||||
}
|
||||
case .nativeApple:
|
||||
if let nativeAppleModel = model as? NativeAppleModel {
|
||||
NativeAppleModelCardView(
|
||||
model: nativeAppleModel,
|
||||
isCurrent: isCurrent,
|
||||
setDefaultAction: setDefaultAction
|
||||
)
|
||||
}
|
||||
case .groq, .elevenLabs, .deepgram:
|
||||
if let cloudModel = model as? CloudModel {
|
||||
CloudModelCardView(
|
||||
@ -715,4 +723,115 @@ struct CustomModelCardView: View {
|
||||
.frame(width: 20, height: 20)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Native Apple Model Card View

/// Card row presenting the built-in Apple Speech model: name, status badge,
/// metadata labels, description, and a "Set as Default" action.
struct NativeAppleModelCardView: View {
    let model: NativeAppleModel
    // Whether this model is the currently selected default.
    let isCurrent: Bool
    // Invoked when the user taps "Set as Default".
    var setDefaultAction: () -> Void

    var body: some View {
        HStack(alignment: .top, spacing: 16) {
            // Main Content
            VStack(alignment: .leading, spacing: 6) {
                headerSection
                metadataSection
                descriptionSection
            }
            .frame(maxWidth: .infinity, alignment: .leading)

            // Action Controls
            actionSection
        }
        .padding(16)
        .background(CardBackground(isSelected: isCurrent, useAccentGradientWhenSelected: isCurrent))
    }

    /// Display name plus the Default/Built-in badge.
    private var headerSection: some View {
        HStack(alignment: .firstTextBaseline) {
            Text(model.displayName)
                .font(.system(size: 13, weight: .semibold))
                .foregroundColor(Color(.labelColor))

            statusBadge

            Spacer()
        }
    }

    /// Capsule badge: accent "Default" when selected, blue "Built-in" otherwise.
    private var statusBadge: some View {
        Group {
            if isCurrent {
                Text("Default")
                    .font(.system(size: 11, weight: .medium))
                    .padding(.horizontal, 6)
                    .padding(.vertical, 2)
                    .background(Capsule().fill(Color.accentColor))
                    .foregroundColor(.white)
            } else {
                Text("Built-in")
                    .font(.system(size: 11, weight: .medium))
                    .padding(.horizontal, 6)
                    .padding(.vertical, 2)
                    .background(Capsule().fill(Color.blue.opacity(0.2)))
                    .foregroundColor(Color.blue)
            }
        }
    }

    /// Row of small metadata labels (provider, language, privacy, OS requirement).
    private var metadataSection: some View {
        HStack(spacing: 12) {
            // Native Apple
            Label("Native Apple", systemImage: "apple.logo")
                .font(.system(size: 11))
                .foregroundColor(Color(.secondaryLabelColor))
                .lineLimit(1)

            // Language
            // NOTE(review): `language` is not a stored property of NativeAppleModel —
            // presumably provided by a TranscriptionModel protocol extension; confirm.
            Label(model.language, systemImage: "globe")
                .font(.system(size: 11))
                .foregroundColor(Color(.secondaryLabelColor))
                .lineLimit(1)

            // On-Device
            Label("On-Device", systemImage: "checkmark.shield")
                .font(.system(size: 11))
                .foregroundColor(Color(.secondaryLabelColor))
                .lineLimit(1)

            // Requires macOS 26+
            Label("macOS 26+", systemImage: "macbook")
                .font(.system(size: 11))
                .foregroundColor(Color(.secondaryLabelColor))
                .lineLimit(1)
        }
        .lineLimit(1)
    }

    /// Two-line, top-padded description text.
    private var descriptionSection: some View {
        Text(model.description)
            .font(.system(size: 11))
            .foregroundColor(Color(.secondaryLabelColor))
            .lineLimit(2)
            .fixedSize(horizontal: false, vertical: true)
            .padding(.top, 4)
    }

    /// Shows static "Default Model" text when current, otherwise the set-default button.
    private var actionSection: some View {
        HStack(spacing: 8) {
            if isCurrent {
                Text("Default Model")
                    .font(.system(size: 12))
                    .foregroundColor(Color(.secondaryLabelColor))
            } else {
                Button(action: setDefaultAction) {
                    Text("Set as Default")
                        .font(.system(size: 12))
                }
                .buttonStyle(.bordered)
                .controlSize(.small)
            }
        }
    }
}
|
||||
|
||||
@ -6,6 +6,9 @@ extension WhisperState {
|
||||
switch model.provider {
|
||||
case .local:
|
||||
return availableModels.contains { $0.name == model.name }
|
||||
case .nativeApple:
|
||||
// Native Apple models are always available (though they require macOS 26+)
|
||||
return true
|
||||
case .groq:
|
||||
let key = UserDefaults.standard.string(forKey: "GROQAPIKey")
|
||||
return key != nil && !key!.isEmpty
|
||||
|
||||
@ -59,6 +59,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
|
||||
// Transcription Services
|
||||
private var localTranscriptionService: LocalTranscriptionService
|
||||
private let cloudTranscriptionService = CloudTranscriptionService()
|
||||
private let nativeAppleTranscriptionService = NativeAppleTranscriptionService()
|
||||
|
||||
private var modelUrl: URL? {
|
||||
let possibleURLs = [
|
||||
@ -294,8 +295,16 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
|
||||
throw WhisperStateError.transcriptionFailed
|
||||
}
|
||||
|
||||
let transcriptionService: TranscriptionService
|
||||
switch model.provider {
|
||||
case .local:
|
||||
transcriptionService = localTranscriptionService
|
||||
case .nativeApple:
|
||||
transcriptionService = nativeAppleTranscriptionService
|
||||
default:
|
||||
transcriptionService = cloudTranscriptionService
|
||||
}
|
||||
|
||||
let transcriptionService: TranscriptionService = (model.provider == .local) ? localTranscriptionService : cloudTranscriptionService
|
||||
var text = try await transcriptionService.transcribe(audioURL: url, model: model)
|
||||
text = text.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user