feat: add native Apple transcription service
This commit is contained in:
parent
b918979e83
commit
d1edb47d87
@ -35,11 +35,78 @@ import Foundation
|
||||
}
|
||||
}
|
||||
|
||||
// Languages supported by Apple's native speech transcription, keyed by
// BCP-47 locale identifier with a human-readable display name as the value.
// Entries mirror the locales reported by SpeechTranscriber.supportedLocales.
static let appleNativeLanguages = [
    // English variants
    "en-US": "English (United States)",
    "en-GB": "English (United Kingdom)",
    "en-CA": "English (Canada)",
    "en-AU": "English (Australia)",
    "en-IN": "English (India)",
    "en-IE": "English (Ireland)",
    "en-NZ": "English (New Zealand)",
    "en-ZA": "English (South Africa)",
    "en-SA": "English (Saudi Arabia)",
    "en-AE": "English (UAE)",
    "en-SG": "English (Singapore)",
    "en-PH": "English (Philippines)",
    "en-ID": "English (Indonesia)",

    // Spanish variants (including the regional "Latin America" locale es-419)
    "es-ES": "Spanish (Spain)",
    "es-MX": "Spanish (Mexico)",
    "es-US": "Spanish (United States)",
    "es-CO": "Spanish (Colombia)",
    "es-CL": "Spanish (Chile)",
    "es-419": "Spanish (Latin America)",

    // French variants
    "fr-FR": "French (France)",
    "fr-CA": "French (Canada)",
    "fr-BE": "French (Belgium)",
    "fr-CH": "French (Switzerland)",

    // German variants
    "de-DE": "German (Germany)",
    "de-AT": "German (Austria)",
    "de-CH": "German (Switzerland)",

    // Chinese variants
    "zh-CN": "Chinese Simplified (China)",
    "zh-TW": "Chinese Traditional (Taiwan)",
    "zh-HK": "Chinese Traditional (Hong Kong)",

    // Other Asian languages
    "ja-JP": "Japanese (Japan)",
    "ko-KR": "Korean (South Korea)",
    "yue-CN": "Cantonese (China)",

    // Portuguese variants
    "pt-BR": "Portuguese (Brazil)",
    "pt-PT": "Portuguese (Portugal)",

    // Italian variants
    "it-IT": "Italian (Italy)",
    "it-CH": "Italian (Switzerland)",

    // Arabic
    "ar-SA": "Arabic (Saudi Arabia)"
]
|
||||
|
||||
static var models: [any TranscriptionModel] {
|
||||
return predefinedModels + CustomModelManager.shared.customModels
|
||||
}
|
||||
|
||||
private static let predefinedModels: [any TranscriptionModel] = [
|
||||
// Native Apple Model
|
||||
NativeAppleModel(
|
||||
name: "apple-speech",
|
||||
displayName: "Apple Speech",
|
||||
description: "Uses the native Apple Speech framework for transcription. Available on macOS Sonoma 14+.",
|
||||
isMultilingualModel: true,
|
||||
supportedLanguages: appleNativeLanguages
|
||||
),
|
||||
// Local Models
|
||||
LocalModel(
|
||||
name: "ggml-tiny",
|
||||
|
||||
@ -7,6 +7,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable {
|
||||
case elevenLabs = "ElevenLabs"
|
||||
case deepgram = "Deepgram"
|
||||
case custom = "Custom"
|
||||
case nativeApple = "Native Apple"
|
||||
// Future providers can be added here
|
||||
}
|
||||
|
||||
@ -33,6 +34,17 @@ extension TranscriptionModel {
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Native Apple model

/// Describes Apple's built-in, on-device transcription model.
/// Conforms to `TranscriptionModel` so it can sit alongside local and
/// cloud models in the shared model list.
struct NativeAppleModel: TranscriptionModel {
    // Fresh identity per instance; not persisted here.
    let id = UUID()
    let name: String
    let displayName: String
    let description: String
    // Fixed provider tag for this model kind.
    let provider: ModelProvider = .nativeApple
    let isMultilingualModel: Bool
    // BCP-47 locale identifier -> human-readable language name.
    let supportedLanguages: [String: String]
}
|
||||
|
||||
// A new struct for cloud models
|
||||
struct CloudModel: TranscriptionModel {
|
||||
let id: UUID
|
||||
|
||||
@ -21,6 +21,7 @@ class AudioTranscriptionManager: ObservableObject {
|
||||
// Transcription services - will be initialized when needed
|
||||
private var localTranscriptionService: LocalTranscriptionService?
|
||||
private let cloudTranscriptionService = CloudTranscriptionService()
|
||||
private let nativeAppleTranscriptionService = NativeAppleTranscriptionService()
|
||||
|
||||
enum ProcessingPhase {
|
||||
case idle
|
||||
@ -93,9 +94,12 @@ class AudioTranscriptionManager: ObservableObject {
|
||||
processingPhase = .transcribing
|
||||
var text: String
|
||||
|
||||
if currentModel.provider == .local {
|
||||
switch currentModel.provider {
|
||||
case .local:
|
||||
text = try await localTranscriptionService!.transcribe(audioURL: permanentURL, model: currentModel)
|
||||
} else {
|
||||
case .nativeApple:
|
||||
text = try await nativeAppleTranscriptionService.transcribe(audioURL: permanentURL, model: currentModel)
|
||||
default: // Cloud models
|
||||
text = try await cloudTranscriptionService.transcribe(audioURL: permanentURL, model: currentModel)
|
||||
}
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ class AudioTranscriptionService: ObservableObject {
|
||||
// Transcription services
|
||||
private let localTranscriptionService: LocalTranscriptionService
|
||||
private let cloudTranscriptionService = CloudTranscriptionService()
|
||||
private let nativeAppleTranscriptionService = NativeAppleTranscriptionService()
|
||||
|
||||
enum TranscriptionError: Error {
|
||||
case noAudioFile
|
||||
@ -47,11 +48,16 @@ class AudioTranscriptionService: ObservableObject {
|
||||
// Delegate transcription to appropriate service
|
||||
var text: String
|
||||
|
||||
if model.provider == .local {
|
||||
switch model.provider {
|
||||
case .local:
|
||||
messageLog += "Using local transcription service...\n"
|
||||
text = try await localTranscriptionService.transcribe(audioURL: url, model: model)
|
||||
messageLog += "Local transcription completed.\n"
|
||||
} else {
|
||||
case .nativeApple:
|
||||
messageLog += "Using Native Apple transcription service...\n"
|
||||
text = try await nativeAppleTranscriptionService.transcribe(audioURL: url, model: model)
|
||||
messageLog += "Native Apple transcription completed.\n"
|
||||
default: // Cloud models
|
||||
messageLog += "Using cloud transcription service...\n"
|
||||
text = try await cloudTranscriptionService.transcribe(audioURL: url, model: model)
|
||||
messageLog += "Cloud transcription completed.\n"
|
||||
|
||||
139
VoiceInk/Services/NativeAppleTranscriptionService.swift
Normal file
139
VoiceInk/Services/NativeAppleTranscriptionService.swift
Normal file
@ -0,0 +1,139 @@
|
||||
import Foundation
|
||||
import AVFoundation
|
||||
import os
|
||||
|
||||
#if canImport(Speech)
|
||||
import Speech
|
||||
#endif
|
||||
|
||||
/// Transcription service that leverages the new SpeechAnalyzer / SpeechTranscriber API available on macOS 26 (Tahoe).
/// Falls back with an unsupported-provider error on earlier OS versions so the application can gracefully degrade.
class NativeAppleTranscriptionService: TranscriptionService {
    private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "NativeAppleTranscriptionService")

    /// Errors surfaced to callers; `LocalizedError` so the UI can show `errorDescription` directly.
    enum ServiceError: Error, LocalizedError {
        case unsupportedOS
        case transcriptionFailed
        case localeNotSupported
        case invalidModel

        var errorDescription: String? {
            switch self {
            case .unsupportedOS:
                return "SpeechAnalyzer requires macOS 26 or later."
            case .transcriptionFailed:
                return "Transcription failed using SpeechAnalyzer."
            case .localeNotSupported:
                return "The selected language is not supported by SpeechAnalyzer."
            case .invalidModel:
                return "Invalid model type provided for Native Apple transcription."
            }
        }
    }

    /// Transcribes the audio file at `audioURL` using the on-device SpeechTranscriber.
    /// - Parameters:
    ///   - audioURL: Location of a readable audio file (opened via `AVAudioFile`).
    ///   - model: Must be a `NativeAppleModel`; any other type throws `.invalidModel`.
    /// - Returns: The whitespace-trimmed transcript text.
    /// - Throws: `ServiceError` for unsupported OS / locale / model, plus any
    ///   errors propagated from AVFoundation or the Speech framework.
    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
        // Only the native Apple model is valid for this service.
        guard model is NativeAppleModel else {
            throw ServiceError.invalidModel
        }

        // SpeechAnalyzer/SpeechTranscriber exist only on macOS 26+; the guard also
        // makes those APIs available to the rest of this scope.
        guard #available(macOS 26, *) else {
            logger.error("SpeechAnalyzer is not available on this macOS version")
            throw ServiceError.unsupportedOS
        }

        #if canImport(Speech)
        logger.notice("Starting Apple native transcription with SpeechAnalyzer.")

        let audioFile = try AVAudioFile(forReading: audioURL)

        // Use the user's selected language directly, assuming BCP-47 format.
        // NOTE(review): "SelectedLanguage" is read from UserDefaults — confirm it is
        // always stored as a BCP-47 identifier by the settings UI.
        let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "en-US"
        let locale = Locale(identifier: selectedLanguage)

        // Check for locale support and asset installation status.
        let supportedLocales = await SpeechTranscriber.supportedLocales
        let installedLocales = await SpeechTranscriber.installedLocales
        let isLocaleSupported = supportedLocales.contains(locale)
        let isLocaleInstalled = installedLocales.contains(locale)

        // Create the detailed log message (sorted identifier lists for readability).
        let supportedIdentifiers = supportedLocales.map { $0.identifier }.sorted().joined(separator: ", ")
        let installedIdentifiers = installedLocales.map { $0.identifier }.sorted().joined(separator: ", ")
        let availableForDownload = Set(supportedLocales).subtracting(Set(installedLocales)).map { $0.identifier }.sorted().joined(separator: ", ")

        var statusMessage: String
        if isLocaleInstalled {
            statusMessage = "✅ Installed"
        } else if isLocaleSupported {
            statusMessage = "❌ Not Installed (Available for download)"
        } else {
            statusMessage = "❌ Not Supported"
        }

        let logMessage = """

        --- Native Speech Transcription ---
        Locale: '\(locale.identifier)'
        Status: \(statusMessage)
        ------------------------------------
        Supported Locales: [\(supportedIdentifiers)]
        Installed Locales: [\(installedIdentifiers)]
        Available for Download: [\(availableForDownload)]
        ------------------------------------
        """
        logger.notice("\(logMessage)")

        // An unsupported locale can never succeed, so fail fast before setup.
        guard isLocaleSupported else {
            logger.error("Transcription failed: Locale '\(locale.identifier)' is not supported by SpeechTranscriber.")
            throw ServiceError.localeNotSupported
        }

        // Empty option sets: plain transcription, no partial-result reporting or
        // attributed-text options requested.
        let transcriber = SpeechTranscriber(
            locale: locale,
            transcriptionOptions: [],
            reportingOptions: [],
            attributeOptions: []
        )

        // Ensure model assets are available, triggering a system download prompt if necessary.
        try await ensureModelIsAvailable(for: transcriber, locale: locale)

        let analyzer = SpeechAnalyzer(modules: [transcriber])

        // finishAfterFile: true — the analyzer ends the session once the file is consumed,
        // which lets the results stream below terminate.
        try await analyzer.start(inputAudioFile: audioFile, finishAfterFile: true)

        // Accumulate every result segment into a single attributed transcript.
        var transcript: AttributedString = ""
        for try await result in transcriber.results {
            transcript += result.text
        }

        let finalTranscription = String(transcript.characters).trimmingCharacters(in: .whitespacesAndNewlines)

        logger.notice("Native transcription successful. Length: \(finalTranscription.count) characters.")
        return finalTranscription

        #else
        // Build target lacks the Speech framework entirely.
        logger.error("Speech framework is not available")
        throw ServiceError.unsupportedOS
        #endif
    }

    /// Downloads the on-device speech assets for `locale` if they are not already
    /// installed. Intentionally does NOT throw on a nil installation request —
    /// transcription may still work with a base model.
    @available(macOS 26, *)
    private func ensureModelIsAvailable(for transcriber: SpeechTranscriber, locale: Locale) async throws {
        #if canImport(Speech)
        let isInstalled = await SpeechTranscriber.installedLocales.contains(locale)

        if !isInstalled {
            logger.notice("Assets for '\(locale.identifier)' not installed. Requesting system download.")

            // The system may present its own download UI; awaiting here blocks until
            // the install finishes (or the request fails to be created).
            if let request = try await AssetInventory.assetInstallationRequest(supporting: [transcriber]) {
                try await request.downloadAndInstall()
                logger.notice("Asset download for '\(locale.identifier)' complete.")
            } else {
                logger.error("Asset download for '\(locale.identifier)' failed: Could not create installation request.")
                // Note: We don't throw an error here, as transcription might still work with a base model.
            }
        }
        #endif
    }
}
|
||||
@ -30,6 +30,14 @@ struct ModelCardRowView: View {
|
||||
downloadAction: downloadAction
|
||||
)
|
||||
}
|
||||
case .nativeApple:
|
||||
if let nativeAppleModel = model as? NativeAppleModel {
|
||||
NativeAppleModelCardView(
|
||||
model: nativeAppleModel,
|
||||
isCurrent: isCurrent,
|
||||
setDefaultAction: setDefaultAction
|
||||
)
|
||||
}
|
||||
case .groq, .elevenLabs, .deepgram:
|
||||
if let cloudModel = model as? CloudModel {
|
||||
CloudModelCardView(
|
||||
@ -715,4 +723,115 @@ struct CustomModelCardView: View {
|
||||
.frame(width: 20, height: 20)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Native Apple Model Card View

/// Card row presenting the built-in Apple Speech model: name, status badge,
/// metadata labels, description, and a "Set as Default" action.
struct NativeAppleModelCardView: View {
    let model: NativeAppleModel
    // Whether this model is the currently selected default.
    let isCurrent: Bool
    // Invoked when the user taps "Set as Default".
    var setDefaultAction: () -> Void

    var body: some View {
        HStack(alignment: .top, spacing: 16) {
            // Main Content
            VStack(alignment: .leading, spacing: 6) {
                headerSection
                metadataSection
                descriptionSection
            }
            .frame(maxWidth: .infinity, alignment: .leading)

            // Action Controls
            actionSection
        }
        .padding(16)
        .background(CardBackground(isSelected: isCurrent, useAccentGradientWhenSelected: isCurrent))
    }

    /// Display name plus the Default/Built-in badge.
    private var headerSection: some View {
        HStack(alignment: .firstTextBaseline) {
            Text(model.displayName)
                .font(.system(size: 13, weight: .semibold))
                .foregroundColor(Color(.labelColor))

            statusBadge

            Spacer()
        }
    }

    /// Capsule badge: accent "Default" when selected, blue "Built-in" otherwise.
    private var statusBadge: some View {
        Group {
            if isCurrent {
                Text("Default")
                    .font(.system(size: 11, weight: .medium))
                    .padding(.horizontal, 6)
                    .padding(.vertical, 2)
                    .background(Capsule().fill(Color.accentColor))
                    .foregroundColor(.white)
            } else {
                Text("Built-in")
                    .font(.system(size: 11, weight: .medium))
                    .padding(.horizontal, 6)
                    .padding(.vertical, 2)
                    .background(Capsule().fill(Color.blue.opacity(0.2)))
                    .foregroundColor(Color.blue)
            }
        }
    }

    /// Row of small metadata labels (provider, language, privacy, OS requirement).
    private var metadataSection: some View {
        HStack(spacing: 12) {
            // Native Apple
            Label("Native Apple", systemImage: "apple.logo")
                .font(.system(size: 11))
                .foregroundColor(Color(.secondaryLabelColor))
                .lineLimit(1)

            // Language
            // NOTE(review): `language` is not a stored property of NativeAppleModel —
            // presumably provided by a TranscriptionModel protocol extension; confirm.
            Label(model.language, systemImage: "globe")
                .font(.system(size: 11))
                .foregroundColor(Color(.secondaryLabelColor))
                .lineLimit(1)

            // On-Device
            Label("On-Device", systemImage: "checkmark.shield")
                .font(.system(size: 11))
                .foregroundColor(Color(.secondaryLabelColor))
                .lineLimit(1)

            // Requires macOS 26+
            Label("macOS 26+", systemImage: "macbook")
                .font(.system(size: 11))
                .foregroundColor(Color(.secondaryLabelColor))
                .lineLimit(1)
        }
        .lineLimit(1)
    }

    /// Two-line, top-padded description text.
    private var descriptionSection: some View {
        Text(model.description)
            .font(.system(size: 11))
            .foregroundColor(Color(.secondaryLabelColor))
            .lineLimit(2)
            .fixedSize(horizontal: false, vertical: true)
            .padding(.top, 4)
    }

    /// Shows static "Default Model" text when current, otherwise the set-default button.
    private var actionSection: some View {
        HStack(spacing: 8) {
            if isCurrent {
                Text("Default Model")
                    .font(.system(size: 12))
                    .foregroundColor(Color(.secondaryLabelColor))
            } else {
                Button(action: setDefaultAction) {
                    Text("Set as Default")
                        .font(.system(size: 12))
                }
                .buttonStyle(.bordered)
                .controlSize(.small)
            }
        }
    }
}
|
||||
|
||||
@ -6,6 +6,9 @@ extension WhisperState {
|
||||
switch model.provider {
|
||||
case .local:
|
||||
return availableModels.contains { $0.name == model.name }
|
||||
case .nativeApple:
|
||||
// Native Apple models are always available (though they require macOS 26+)
|
||||
return true
|
||||
case .groq:
|
||||
let key = UserDefaults.standard.string(forKey: "GROQAPIKey")
|
||||
return key != nil && !key!.isEmpty
|
||||
|
||||
@ -59,6 +59,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
|
||||
// Transcription Services
|
||||
private var localTranscriptionService: LocalTranscriptionService
|
||||
private let cloudTranscriptionService = CloudTranscriptionService()
|
||||
private let nativeAppleTranscriptionService = NativeAppleTranscriptionService()
|
||||
|
||||
private var modelUrl: URL? {
|
||||
let possibleURLs = [
|
||||
@ -294,8 +295,16 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
|
||||
throw WhisperStateError.transcriptionFailed
|
||||
}
|
||||
|
||||
let transcriptionService: TranscriptionService
|
||||
switch model.provider {
|
||||
case .local:
|
||||
transcriptionService = localTranscriptionService
|
||||
case .nativeApple:
|
||||
transcriptionService = nativeAppleTranscriptionService
|
||||
default:
|
||||
transcriptionService = cloudTranscriptionService
|
||||
}
|
||||
|
||||
let transcriptionService: TranscriptionService = (model.provider == .local) ? localTranscriptionService : cloudTranscriptionService
|
||||
var text = try await transcriptionService.transcribe(audioURL: url, model: model)
|
||||
text = text.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user