feat: Add native Apple transcription service

This commit is contained in:
Beingpax 2025-06-19 17:11:36 +05:45
parent b918979e83
commit d1edb47d87
8 changed files with 364 additions and 5 deletions

View File

@ -35,11 +35,78 @@ import Foundation
}
}
// Apple Native Speech specific languages with proper BCP-47 format
// Based on actual supported locales from SpeechTranscriber.supportedLocales
static let appleNativeLanguages = [
// English variants
"en-US": "English (United States)",
"en-GB": "English (United Kingdom)",
"en-CA": "English (Canada)",
"en-AU": "English (Australia)",
"en-IN": "English (India)",
"en-IE": "English (Ireland)",
"en-NZ": "English (New Zealand)",
"en-ZA": "English (South Africa)",
"en-SA": "English (Saudi Arabia)",
"en-AE": "English (UAE)",
"en-SG": "English (Singapore)",
"en-PH": "English (Philippines)",
"en-ID": "English (Indonesia)",
// Spanish variants
"es-ES": "Spanish (Spain)",
"es-MX": "Spanish (Mexico)",
"es-US": "Spanish (United States)",
"es-CO": "Spanish (Colombia)",
"es-CL": "Spanish (Chile)",
"es-419": "Spanish (Latin America)",
// French variants
"fr-FR": "French (France)",
"fr-CA": "French (Canada)",
"fr-BE": "French (Belgium)",
"fr-CH": "French (Switzerland)",
// German variants
"de-DE": "German (Germany)",
"de-AT": "German (Austria)",
"de-CH": "German (Switzerland)",
// Chinese variants
"zh-CN": "Chinese Simplified (China)",
"zh-TW": "Chinese Traditional (Taiwan)",
"zh-HK": "Chinese Traditional (Hong Kong)",
// Other Asian languages
"ja-JP": "Japanese (Japan)",
"ko-KR": "Korean (South Korea)",
"yue-CN": "Cantonese (China)",
// Portuguese variants
"pt-BR": "Portuguese (Brazil)",
"pt-PT": "Portuguese (Portugal)",
// Italian variants
"it-IT": "Italian (Italy)",
"it-CH": "Italian (Switzerland)",
// Arabic
"ar-SA": "Arabic (Saudi Arabia)"
]
static var models: [any TranscriptionModel] {
return predefinedModels + CustomModelManager.shared.customModels
}
private static let predefinedModels: [any TranscriptionModel] = [
// Native Apple Model
NativeAppleModel(
name: "apple-speech",
displayName: "Apple Speech",
description: "Uses the native Apple Speech framework for transcription. Available on macOS Sonoma 14+.",
isMultilingualModel: true,
supportedLanguages: appleNativeLanguages
),
// Local Models
LocalModel(
name: "ggml-tiny",

View File

@ -7,6 +7,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable {
case elevenLabs = "ElevenLabs"
case deepgram = "Deepgram"
case custom = "Custom"
case nativeApple = "Native Apple"
// Future providers can be added here
}
@ -33,6 +34,17 @@ extension TranscriptionModel {
}
}
// A new struct for Apple's native models
struct NativeAppleModel: TranscriptionModel {
let id = UUID()
let name: String
let displayName: String
let description: String
let provider: ModelProvider = .nativeApple
let isMultilingualModel: Bool
let supportedLanguages: [String: String]
}
// A new struct for cloud models
struct CloudModel: TranscriptionModel {
let id: UUID

View File

@ -21,6 +21,7 @@ class AudioTranscriptionManager: ObservableObject {
// Transcription services - will be initialized when needed
private var localTranscriptionService: LocalTranscriptionService?
private let cloudTranscriptionService = CloudTranscriptionService()
private let nativeAppleTranscriptionService = NativeAppleTranscriptionService()
enum ProcessingPhase {
case idle
@ -93,9 +94,12 @@ class AudioTranscriptionManager: ObservableObject {
processingPhase = .transcribing
var text: String
if currentModel.provider == .local {
switch currentModel.provider {
case .local:
text = try await localTranscriptionService!.transcribe(audioURL: permanentURL, model: currentModel)
} else {
case .nativeApple:
text = try await nativeAppleTranscriptionService.transcribe(audioURL: permanentURL, model: currentModel)
default: // Cloud models
text = try await cloudTranscriptionService.transcribe(audioURL: permanentURL, model: currentModel)
}

View File

@ -18,6 +18,7 @@ class AudioTranscriptionService: ObservableObject {
// Transcription services
private let localTranscriptionService: LocalTranscriptionService
private let cloudTranscriptionService = CloudTranscriptionService()
private let nativeAppleTranscriptionService = NativeAppleTranscriptionService()
enum TranscriptionError: Error {
case noAudioFile
@ -47,11 +48,16 @@ class AudioTranscriptionService: ObservableObject {
// Delegate transcription to appropriate service
var text: String
if model.provider == .local {
switch model.provider {
case .local:
messageLog += "Using local transcription service...\n"
text = try await localTranscriptionService.transcribe(audioURL: url, model: model)
messageLog += "Local transcription completed.\n"
} else {
case .nativeApple:
messageLog += "Using Native Apple transcription service...\n"
text = try await nativeAppleTranscriptionService.transcribe(audioURL: url, model: model)
messageLog += "Native Apple transcription completed.\n"
default: // Cloud models
messageLog += "Using cloud transcription service...\n"
text = try await cloudTranscriptionService.transcribe(audioURL: url, model: model)
messageLog += "Cloud transcription completed.\n"

View File

@ -0,0 +1,139 @@
import Foundation
import AVFoundation
import os
#if canImport(Speech)
import Speech
#endif
/// Transcription service that leverages the new SpeechAnalyzer / SpeechTranscriber API available on macOS 26 (Tahoe).
/// Falls back with an unsupported-provider error on earlier OS versions so the application can gracefully degrade.
class NativeAppleTranscriptionService: TranscriptionService {
private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "NativeAppleTranscriptionService")
enum ServiceError: Error, LocalizedError {
case unsupportedOS
case transcriptionFailed
case localeNotSupported
case invalidModel
var errorDescription: String? {
switch self {
case .unsupportedOS:
return "SpeechAnalyzer requires macOS 26 or later."
case .transcriptionFailed:
return "Transcription failed using SpeechAnalyzer."
case .localeNotSupported:
return "The selected language is not supported by SpeechAnalyzer."
case .invalidModel:
return "Invalid model type provided for Native Apple transcription."
}
}
}
func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
guard model is NativeAppleModel else {
throw ServiceError.invalidModel
}
guard #available(macOS 26, *) else {
logger.error("SpeechAnalyzer is not available on this macOS version")
throw ServiceError.unsupportedOS
}
#if canImport(Speech)
logger.notice("Starting Apple native transcription with SpeechAnalyzer.")
let audioFile = try AVAudioFile(forReading: audioURL)
// Use the user's selected language directly, assuming BCP-47 format.
let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "en-US"
let locale = Locale(identifier: selectedLanguage)
// Check for locale support and asset installation status.
let supportedLocales = await SpeechTranscriber.supportedLocales
let installedLocales = await SpeechTranscriber.installedLocales
let isLocaleSupported = supportedLocales.contains(locale)
let isLocaleInstalled = installedLocales.contains(locale)
// Create the detailed log message
let supportedIdentifiers = supportedLocales.map { $0.identifier }.sorted().joined(separator: ", ")
let installedIdentifiers = installedLocales.map { $0.identifier }.sorted().joined(separator: ", ")
let availableForDownload = Set(supportedLocales).subtracting(Set(installedLocales)).map { $0.identifier }.sorted().joined(separator: ", ")
var statusMessage: String
if isLocaleInstalled {
statusMessage = "✅ Installed"
} else if isLocaleSupported {
statusMessage = "❌ Not Installed (Available for download)"
} else {
statusMessage = "❌ Not Supported"
}
let logMessage = """
--- Native Speech Transcription ---
Locale: '\(locale.identifier)'
Status: \(statusMessage)
------------------------------------
Supported Locales: [\(supportedIdentifiers)]
Installed Locales: [\(installedIdentifiers)]
Available for Download: [\(availableForDownload)]
------------------------------------
"""
logger.notice("\(logMessage)")
guard isLocaleSupported else {
logger.error("Transcription failed: Locale '\(locale.identifier)' is not supported by SpeechTranscriber.")
throw ServiceError.localeNotSupported
}
let transcriber = SpeechTranscriber(
locale: locale,
transcriptionOptions: [],
reportingOptions: [],
attributeOptions: []
)
// Ensure model assets are available, triggering a system download prompt if necessary.
try await ensureModelIsAvailable(for: transcriber, locale: locale)
let analyzer = SpeechAnalyzer(modules: [transcriber])
try await analyzer.start(inputAudioFile: audioFile, finishAfterFile: true)
var transcript: AttributedString = ""
for try await result in transcriber.results {
transcript += result.text
}
let finalTranscription = String(transcript.characters).trimmingCharacters(in: .whitespacesAndNewlines)
logger.notice("Native transcription successful. Length: \(finalTranscription.count) characters.")
return finalTranscription
#else
logger.error("Speech framework is not available")
throw ServiceError.unsupportedOS
#endif
}
@available(macOS 26, *)
private func ensureModelIsAvailable(for transcriber: SpeechTranscriber, locale: Locale) async throws {
#if canImport(Speech)
let isInstalled = await SpeechTranscriber.installedLocales.contains(locale)
if !isInstalled {
logger.notice("Assets for '\(locale.identifier)' not installed. Requesting system download.")
if let request = try await AssetInventory.assetInstallationRequest(supporting: [transcriber]) {
try await request.downloadAndInstall()
logger.notice("Asset download for '\(locale.identifier)' complete.")
} else {
logger.error("Asset download for '\(locale.identifier)' failed: Could not create installation request.")
// Note: We don't throw an error here, as transcription might still work with a base model.
}
}
#endif
}
}

View File

@ -30,6 +30,14 @@ struct ModelCardRowView: View {
downloadAction: downloadAction
)
}
case .nativeApple:
if let nativeAppleModel = model as? NativeAppleModel {
NativeAppleModelCardView(
model: nativeAppleModel,
isCurrent: isCurrent,
setDefaultAction: setDefaultAction
)
}
case .groq, .elevenLabs, .deepgram:
if let cloudModel = model as? CloudModel {
CloudModelCardView(
@ -715,4 +723,115 @@ struct CustomModelCardView: View {
.frame(width: 20, height: 20)
}
}
}
// MARK: - Native Apple Model Card View
struct NativeAppleModelCardView: View {
let model: NativeAppleModel
let isCurrent: Bool
var setDefaultAction: () -> Void
var body: some View {
HStack(alignment: .top, spacing: 16) {
// Main Content
VStack(alignment: .leading, spacing: 6) {
headerSection
metadataSection
descriptionSection
}
.frame(maxWidth: .infinity, alignment: .leading)
// Action Controls
actionSection
}
.padding(16)
.background(CardBackground(isSelected: isCurrent, useAccentGradientWhenSelected: isCurrent))
}
private var headerSection: some View {
HStack(alignment: .firstTextBaseline) {
Text(model.displayName)
.font(.system(size: 13, weight: .semibold))
.foregroundColor(Color(.labelColor))
statusBadge
Spacer()
}
}
private var statusBadge: some View {
Group {
if isCurrent {
Text("Default")
.font(.system(size: 11, weight: .medium))
.padding(.horizontal, 6)
.padding(.vertical, 2)
.background(Capsule().fill(Color.accentColor))
.foregroundColor(.white)
} else {
Text("Built-in")
.font(.system(size: 11, weight: .medium))
.padding(.horizontal, 6)
.padding(.vertical, 2)
.background(Capsule().fill(Color.blue.opacity(0.2)))
.foregroundColor(Color.blue)
}
}
}
private var metadataSection: some View {
HStack(spacing: 12) {
// Native Apple
Label("Native Apple", systemImage: "apple.logo")
.font(.system(size: 11))
.foregroundColor(Color(.secondaryLabelColor))
.lineLimit(1)
// Language
Label(model.language, systemImage: "globe")
.font(.system(size: 11))
.foregroundColor(Color(.secondaryLabelColor))
.lineLimit(1)
// On-Device
Label("On-Device", systemImage: "checkmark.shield")
.font(.system(size: 11))
.foregroundColor(Color(.secondaryLabelColor))
.lineLimit(1)
// Requires macOS 26+
Label("macOS 26+", systemImage: "macbook")
.font(.system(size: 11))
.foregroundColor(Color(.secondaryLabelColor))
.lineLimit(1)
}
.lineLimit(1)
}
private var descriptionSection: some View {
Text(model.description)
.font(.system(size: 11))
.foregroundColor(Color(.secondaryLabelColor))
.lineLimit(2)
.fixedSize(horizontal: false, vertical: true)
.padding(.top, 4)
}
private var actionSection: some View {
HStack(spacing: 8) {
if isCurrent {
Text("Default Model")
.font(.system(size: 12))
.foregroundColor(Color(.secondaryLabelColor))
} else {
Button(action: setDefaultAction) {
Text("Set as Default")
.font(.system(size: 12))
}
.buttonStyle(.bordered)
.controlSize(.small)
}
}
}
}

View File

@ -6,6 +6,9 @@ extension WhisperState {
switch model.provider {
case .local:
return availableModels.contains { $0.name == model.name }
case .nativeApple:
// Native Apple models are always available (though they require macOS 26+)
return true
case .groq:
let key = UserDefaults.standard.string(forKey: "GROQAPIKey")
return key != nil && !key!.isEmpty

View File

@ -59,6 +59,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
// Transcription Services
private var localTranscriptionService: LocalTranscriptionService
private let cloudTranscriptionService = CloudTranscriptionService()
private let nativeAppleTranscriptionService = NativeAppleTranscriptionService()
private var modelUrl: URL? {
let possibleURLs = [
@ -294,8 +295,16 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
throw WhisperStateError.transcriptionFailed
}
let transcriptionService: TranscriptionService
switch model.provider {
case .local:
transcriptionService = localTranscriptionService
case .nativeApple:
transcriptionService = nativeAppleTranscriptionService
default:
transcriptionService = cloudTranscriptionService
}
let transcriptionService: TranscriptionService = (model.provider == .local) ? localTranscriptionService : cloudTranscriptionService
var text = try await transcriptionService.transcribe(audioURL: url, model: model)
text = text.trimmingCharacters(in: .whitespacesAndNewlines)