feat: Added Deepgram transcription support
This commit is contained in:
parent
fde8b168eb
commit
00e1d5e8da
@ -144,7 +144,16 @@ import Foundation
|
||||
isMultilingual: true,
|
||||
supportedLanguages: getLanguageDictionary(isMultilingual: true, isLargeV3: true)
|
||||
),
|
||||
|
||||
CloudModel(
|
||||
name: "nova-2",
|
||||
displayName: "Nova (Deepgram)",
|
||||
description: "Deepgram's Nova model for fast, accurate, and cost-effective transcription.",
|
||||
provider: .deepgram,
|
||||
speed: 0.9,
|
||||
accuracy: 0.95,
|
||||
isMultilingual: true,
|
||||
supportedLanguages: getLanguageDictionary(isMultilingual: true, isLargeV3: true)
|
||||
),
|
||||
]
|
||||
|
||||
static let allLanguages = [
|
||||
|
||||
@ -5,6 +5,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable {
|
||||
case local = "Local"
|
||||
case groq = "Groq"
|
||||
case elevenLabs = "ElevenLabs"
|
||||
case deepgram = "Deepgram"
|
||||
// Future providers can be added here
|
||||
}
|
||||
|
||||
|
||||
@ -10,6 +10,7 @@ enum AIProvider: String, CaseIterable {
|
||||
case mistral = "Mistral"
|
||||
case ollama = "Ollama"
|
||||
case elevenLabs = "ElevenLabs"
|
||||
case deepgram = "Deepgram"
|
||||
case custom = "Custom"
|
||||
|
||||
var baseURL: String {
|
||||
@ -30,6 +31,8 @@ enum AIProvider: String, CaseIterable {
|
||||
return "https://api.elevenlabs.io/v1/speech-to-text"
|
||||
case .ollama:
|
||||
return UserDefaults.standard.string(forKey: "ollamaBaseURL") ?? "http://localhost:11434"
|
||||
case .deepgram:
|
||||
return "https://api.deepgram.com/v1/listen"
|
||||
case .custom:
|
||||
return UserDefaults.standard.string(forKey: "customProviderBaseURL") ?? ""
|
||||
}
|
||||
@ -53,6 +56,8 @@ enum AIProvider: String, CaseIterable {
|
||||
return "scribe_v1"
|
||||
case .ollama:
|
||||
return UserDefaults.standard.string(forKey: "ollamaSelectedModel") ?? "mistral"
|
||||
case .deepgram:
|
||||
return "whisper-1"
|
||||
case .custom:
|
||||
return UserDefaults.standard.string(forKey: "customProviderModel") ?? ""
|
||||
}
|
||||
@ -97,6 +102,8 @@ enum AIProvider: String, CaseIterable {
|
||||
return ["scribe_v1", "scribe_v1_experimental"]
|
||||
case .ollama:
|
||||
return []
|
||||
case .deepgram:
|
||||
return ["whisper-1"]
|
||||
case .custom:
|
||||
return []
|
||||
}
|
||||
@ -266,6 +273,8 @@ class AIService: ObservableObject {
|
||||
verifyAnthropicAPIKey(key, completion: completion)
|
||||
case .elevenLabs:
|
||||
verifyElevenLabsAPIKey(key, completion: completion)
|
||||
case .deepgram:
|
||||
verifyDeepgramAPIKey(key, completion: completion)
|
||||
default:
|
||||
verifyOpenAICompatibleAPIKey(key, completion: completion)
|
||||
}
|
||||
@ -400,7 +409,26 @@ class AIService: ObservableObject {
|
||||
}.resume()
|
||||
}
|
||||
|
||||
|
||||
/// Checks a Deepgram API key by sending it to Deepgram's token endpoint.
///
/// - Parameters:
///   - key: The key to verify; sent as `Authorization: Token <key>`.
///   - completion: Called from the data task's completion handler with `true`
///     only when the response is HTTP 200. A transport error, a non-HTTP
///     response, or any other status code yields `false`.
private func verifyDeepgramAPIKey(_ key: String, completion: @escaping (Bool) -> Void) {
    var tokenRequest = URLRequest(url: URL(string: "https://api.deepgram.com/v1/auth/token")!)
    tokenRequest.httpMethod = "GET"
    tokenRequest.addValue("Token \(key)", forHTTPHeaderField: "Authorization")

    let verificationTask = URLSession.shared.dataTask(with: tokenRequest) { _, response, error in
        // A transport-level failure means the key could not be checked at all.
        if let failure = error {
            self.logger.error("Deepgram API key verification failed: \(failure.localizedDescription)")
            completion(false)
            return
        }

        // `nil` (non-HTTP response) never equals 200, so it reports `false` too.
        let statusCode = (response as? HTTPURLResponse)?.statusCode
        completion(statusCode == 200)
    }
    verificationTask.resume()
}
|
||||
|
||||
func clearAPIKey() {
|
||||
guard selectedProvider.requiresAPIKey else { return }
|
||||
|
||||
@ -0,0 +1,57 @@
|
||||
import Foundation
|
||||
import os
|
||||
|
||||
/// Errors thrown by the cloud transcription services.
///
/// Each case maps to a fixed, human-readable `errorDescription`.
enum CloudTranscriptionError: Error, LocalizedError {
    /// The model's provider is not handled by the transcription service.
    case unsupportedProvider
    /// No API key is configured for the provider.
    case missingAPIKey
    /// The configured API key was rejected as invalid.
    case invalidAPIKey
    /// The audio file to upload could not be read.
    case audioFileNotFound
    /// The API answered with a failing status code and the given message.
    case apiRequestFailed(statusCode: Int, message: String)
    /// A network-level failure, wrapping the underlying error.
    case networkError(Error)
    /// The API response was empty or could not be interpreted.
    case noTranscriptionReturned
    /// The request body could not be encoded.
    case dataEncodingError

    /// Localized, user-facing description of the failure.
    var errorDescription: String? {
        switch self {
        case let .apiRequestFailed(code, detail):
            return "The API request failed with status code \(code): \(detail)"
        case let .networkError(underlying):
            return "A network error occurred: \(underlying.localizedDescription)"
        case .missingAPIKey:
            return "API key for this service is missing. Please configure it in the settings."
        case .invalidAPIKey:
            return "The provided API key is invalid."
        case .unsupportedProvider:
            return "The model provider is not supported by this service."
        case .audioFileNotFound:
            return "The audio file to transcribe could not be found."
        case .noTranscriptionReturned:
            return "The API returned an empty or invalid response."
        case .dataEncodingError:
            return "Failed to encode the request body."
        }
    }
}
|
||||
|
||||
/// Routes a transcription request to the service matching the model's provider.
class CloudTranscriptionService: TranscriptionService {

    // One long-lived service instance per supported cloud provider.
    private let groqService = GroqTranscriptionService()
    private let elevenLabsService = ElevenLabsTranscriptionService()
    private let deepgramService = DeepgramTranscriptionService()

    /// Transcribes the audio at `audioURL` using the provider of `model`.
    ///
    /// - Parameters:
    ///   - audioURL: Location of the audio file to transcribe.
    ///   - model: The selected model; its `provider` decides which service runs.
    /// - Returns: The transcript produced by the provider-specific service.
    /// - Throws: `CloudTranscriptionError.unsupportedProvider` for any provider
    ///   other than Groq, ElevenLabs or Deepgram; otherwise whatever the
    ///   selected service throws.
    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
        let transcript: String

        switch model.provider {
        case .deepgram:
            transcript = try await deepgramService.transcribe(audioURL: audioURL, model: model)
        case .elevenLabs:
            transcript = try await elevenLabsService.transcribe(audioURL: audioURL, model: model)
        case .groq:
            transcript = try await groqService.transcribe(audioURL: audioURL, model: model)
        default:
            throw CloudTranscriptionError.unsupportedProvider
        }

        return transcript
    }
}
|
||||
@ -0,0 +1,103 @@
|
||||
import Foundation
|
||||
import os
|
||||
|
||||
/// Transcribes audio files through Deepgram's `/v1/listen` endpoint.
class DeepgramTranscriptionService {
    private let logger = Logger(subsystem: "com.voiceink.transcription", category: "DeepgramService")

    /// Uploads the audio file at `audioURL` to Deepgram and returns its transcript.
    ///
    /// - Parameters:
    ///   - audioURL: Location of the audio file. Its bytes are sent as the raw
    ///     request body with `Content-Type: audio/wav`.
    ///   - model: Stored in the API configuration. The model actually requested
    ///     from Deepgram is derived from the `SelectedLanguage` preference.
    /// - Returns: The transcript of the first alternative of the first channel.
    /// - Throws: `CloudTranscriptionError.missingAPIKey` or `.dataEncodingError`
    ///   when the request cannot be configured, `.audioFileNotFound` when the
    ///   file cannot be read, `.networkError` for a non-HTTP response,
    ///   `.apiRequestFailed` for a non-2xx status, and `.noTranscriptionReturned`
    ///   when the response cannot be decoded or holds no transcript text.
    ///   Errors thrown by `URLSession` propagate unchanged.
    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
        let config = try getAPIConfig(for: model)

        var request = URLRequest(url: config.url)
        request.httpMethod = "POST"
        request.setValue("Token \(config.apiKey)", forHTTPHeaderField: "Authorization")
        request.setValue("audio/wav", forHTTPHeaderField: "Content-Type")

        // Keep the underlying read error in the log instead of discarding it;
        // callers still receive the same `audioFileNotFound` error as before.
        let audioData: Data
        do {
            audioData = try Data(contentsOf: audioURL)
        } catch {
            logger.error("Failed to read audio file for Deepgram upload: \(error.localizedDescription)")
            throw CloudTranscriptionError.audioFileNotFound
        }

        let (data, response) = try await URLSession.shared.upload(for: request, from: audioData)
        guard let httpResponse = response as? HTTPURLResponse else {
            throw CloudTranscriptionError.networkError(URLError(.badServerResponse))
        }

        guard (200...299).contains(httpResponse.statusCode) else {
            let errorMessage = String(data: data, encoding: .utf8) ?? "No error message"
            logger.error("Deepgram API request failed with status \(httpResponse.statusCode): \(errorMessage, privacy: .public)")
            throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage)
        }

        // Decode on its own so that only genuine decoding failures are logged
        // as such. The empty-transcript check lives outside the do/catch;
        // otherwise its error would be caught and logged again as a decode failure.
        let decoded: DeepgramResponse
        do {
            decoded = try JSONDecoder().decode(DeepgramResponse.self, from: data)
        } catch {
            logger.error("Failed to decode Deepgram API response: \(error.localizedDescription)")
            throw CloudTranscriptionError.noTranscriptionReturned
        }

        guard let transcript = decoded.results.channels.first?.alternatives.first?.transcript,
              !transcript.isEmpty else {
            logger.error("No transcript found in Deepgram response")
            throw CloudTranscriptionError.noTranscriptionReturned
        }

        return transcript
    }

    /// Builds the endpoint URL (with query parameters) and reads the API key.
    ///
    /// - Parameter model: The selected model; only its `name` is recorded in the
    ///   returned configuration.
    /// - Returns: The request URL, the stored API key and the model name.
    /// - Throws: `CloudTranscriptionError.missingAPIKey` when no non-empty key is
    ///   stored under `DeepgramAPIKey`; `.dataEncodingError` when the URL cannot
    ///   be assembled.
    private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig {
        guard let apiKey = UserDefaults.standard.string(forKey: "DeepgramAPIKey"), !apiKey.isEmpty else {
            throw CloudTranscriptionError.missingAPIKey
        }

        // Fail with a thrown error rather than a crash if the base URL is rejected.
        guard var components = URLComponents(string: "https://api.deepgram.com/v1/listen") else {
            throw CloudTranscriptionError.dataEncodingError
        }

        let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto"

        // Choose the Deepgram model from the language: "nova-3" for English,
        // "nova-2" for everything else (including "auto").
        let modelName = selectedLanguage == "en" ? "nova-3" : "nova-2"

        var queryItems: [URLQueryItem] = [
            URLQueryItem(name: "model", value: modelName),
            URLQueryItem(name: "smart_format", value: "true"),
            URLQueryItem(name: "dictation", value: "true"),
            URLQueryItem(name: "punctuate", value: "true"),
            URLQueryItem(name: "paragraphs", value: "true"),
            URLQueryItem(name: "filler_words", value: "false")
        ]

        // Send a language hint only when a specific language is selected.
        // NOTE(review): for "auto" nothing language-related is sent; confirm that
        // Deepgram's default behaviour matches the intended auto-detection.
        if selectedLanguage != "auto" && !selectedLanguage.isEmpty {
            queryItems.append(URLQueryItem(name: "language", value: selectedLanguage))
        }

        components.queryItems = queryItems

        guard let apiURL = components.url else {
            throw CloudTranscriptionError.dataEncodingError
        }

        return APIConfig(url: apiURL, apiKey: apiKey, modelName: model.name)
    }

    /// Everything needed to issue one Deepgram request.
    private struct APIConfig {
        let url: URL
        let apiKey: String
        let modelName: String
    }

    /// The subset of Deepgram's JSON response that this service reads.
    private struct DeepgramResponse: Decodable {
        let results: Results

        struct Results: Decodable {
            let channels: [Channel]

            struct Channel: Decodable {
                let alternatives: [Alternative]

                struct Alternative: Decodable {
                    let transcript: String
                    let confidence: Double?
                }
            }
        }
    }
}
|
||||
@ -0,0 +1,93 @@
|
||||
import Foundation
|
||||
|
||||
/// Transcribes audio files through ElevenLabs' speech-to-text endpoint.
class ElevenLabsTranscriptionService {

    /// Uploads the audio file as multipart form data and returns the transcript text.
    ///
    /// - Parameters:
    ///   - audioURL: Location of the audio file to upload.
    ///   - model: The selected model; its `name` is sent as `model_id`.
    /// - Returns: The `text` field of the decoded response.
    /// - Throws: `CloudTranscriptionError.missingAPIKey`, `.audioFileNotFound`,
    ///   `.networkError` (non-HTTP response), `.apiRequestFailed` (non-2xx
    ///   status) or `.noTranscriptionReturned` (undecodable response). Errors
    ///   thrown by `URLSession` propagate unchanged.
    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
        let config = try getAPIConfig(for: model)
        let boundary = "Boundary-\(UUID().uuidString)"

        var uploadRequest = URLRequest(url: config.url)
        uploadRequest.httpMethod = "POST"
        uploadRequest.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type")
        uploadRequest.setValue(config.apiKey, forHTTPHeaderField: "xi-api-key")

        let payload = try createElevenLabsRequestBody(audioURL: audioURL, modelName: config.modelName, boundary: boundary)
        let (responseData, urlResponse) = try await URLSession.shared.upload(for: uploadRequest, from: payload)

        guard let http = urlResponse as? HTTPURLResponse else {
            throw CloudTranscriptionError.networkError(URLError(.badServerResponse))
        }

        guard (200...299).contains(http.statusCode) else {
            let errorMessage = String(data: responseData, encoding: .utf8) ?? "No error message"
            throw CloudTranscriptionError.apiRequestFailed(statusCode: http.statusCode, message: errorMessage)
        }

        // Any decoding failure is reported as "no transcription returned".
        guard let decoded = try? JSONDecoder().decode(TranscriptionResponse.self, from: responseData) else {
            throw CloudTranscriptionError.noTranscriptionReturned
        }
        return decoded.text
    }

    /// Reads the stored ElevenLabs API key and pairs it with the endpoint URL.
    ///
    /// - Throws: `CloudTranscriptionError.missingAPIKey` when no non-empty key is
    ///   stored under `ElevenLabsAPIKey`.
    private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig {
        let storedKey = UserDefaults.standard.string(forKey: "ElevenLabsAPIKey") ?? ""
        if storedKey.isEmpty {
            throw CloudTranscriptionError.missingAPIKey
        }

        return APIConfig(
            url: URL(string: "https://api.elevenlabs.io/v1/speech-to-text")!,
            apiKey: storedKey,
            modelName: model.name
        )
    }

    /// Assembles the multipart body: the audio file, the model id and, when a
    /// specific language is selected, a `language_code` field.
    ///
    /// - Throws: `CloudTranscriptionError.audioFileNotFound` when the file cannot be read.
    private func createElevenLabsRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data {
        guard let audioData = try? Data(contentsOf: audioURL) else {
            throw CloudTranscriptionError.audioFileNotFound
        }

        let crlf = "\r\n"
        var body = Data()

        // Appends the UTF-8 bytes of `text` to the body under construction.
        func append(_ text: String) {
            body.append(Data(text.utf8))
        }

        // File
        append("--\(boundary)\(crlf)")
        append("Content-Disposition: form-data; name=\"file\"; filename=\"\(audioURL.lastPathComponent)\"\(crlf)")
        append("Content-Type: audio/wav\(crlf)\(crlf)")
        body.append(audioData)
        append(crlf)

        // Plain text fields, in the order they are written: the model id first,
        // then the language code if one is selected.
        var textFields: [(header: String, value: String)] = [
            ("Content-Disposition: form-data; name=\"model_id\"\(crlf)\(crlf)", modelName)
        ]

        let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto"
        if selectedLanguage != "auto", !selectedLanguage.isEmpty {
            textFields.append(("Content-Disposition: form-data; name=\"language_code\"\(crlf)\(crlf)", selectedLanguage))
        }

        for field in textFields {
            append("--\(boundary)\(crlf)")
            append(field.header)
            append(field.value)
            append(crlf)
        }

        // Closing boundary.
        append("--\(boundary)--\(crlf)")

        return body
    }

    /// Everything needed to issue one ElevenLabs request.
    private struct APIConfig {
        let url: URL
        let apiKey: String
        let modelName: String
    }

    /// Response shape decoded by this service; only `text` is read.
    private struct TranscriptionResponse: Decodable {
        let text: String
        let language: String?
        let duration: Double?
        let x_groq: GroqMetadata?

        struct GroqMetadata: Decodable {
            let id: String?
        }
    }
}
|
||||
@ -0,0 +1,114 @@
|
||||
import Foundation
|
||||
import os
|
||||
|
||||
/// Transcribes audio files through Groq's OpenAI-compatible transcription endpoint.
class GroqTranscriptionService {
    private let logger = Logger(subsystem: "com.voiceink.transcription", category: "GroqService")

    /// Uploads the audio file as multipart form data and returns the transcript text.
    ///
    /// - Parameters:
    ///   - audioURL: Location of the audio file to upload.
    ///   - model: The selected model; its `name` is sent as the `model` field.
    /// - Returns: The `text` field of the decoded response.
    /// - Throws: `CloudTranscriptionError.missingAPIKey`, `.audioFileNotFound`,
    ///   `.networkError` (non-HTTP response), `.apiRequestFailed` (non-2xx
    ///   status) or `.noTranscriptionReturned` (undecodable response). Errors
    ///   thrown by `URLSession` propagate unchanged.
    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
        let config = try getAPIConfig(for: model)
        let boundary = "Boundary-\(UUID().uuidString)"

        var uploadRequest = URLRequest(url: config.url)
        uploadRequest.httpMethod = "POST"
        uploadRequest.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type")
        uploadRequest.setValue("Bearer \(config.apiKey)", forHTTPHeaderField: "Authorization")

        let payload = try createOpenAICompatibleRequestBody(audioURL: audioURL, modelName: config.modelName, boundary: boundary)
        let (responseData, urlResponse) = try await URLSession.shared.upload(for: uploadRequest, from: payload)

        guard let http = urlResponse as? HTTPURLResponse else {
            throw CloudTranscriptionError.networkError(URLError(.badServerResponse))
        }

        guard (200...299).contains(http.statusCode) else {
            let errorMessage = String(data: responseData, encoding: .utf8) ?? "No error message"
            logger.error("Groq API request failed with status \(http.statusCode): \(errorMessage, privacy: .public)")
            throw CloudTranscriptionError.apiRequestFailed(statusCode: http.statusCode, message: errorMessage)
        }

        do {
            return try JSONDecoder().decode(TranscriptionResponse.self, from: responseData).text
        } catch {
            logger.error("Failed to decode Groq API response: \(error.localizedDescription)")
            throw CloudTranscriptionError.noTranscriptionReturned
        }
    }

    /// Reads the stored Groq API key and pairs it with the endpoint URL.
    ///
    /// - Throws: `CloudTranscriptionError.missingAPIKey` when no non-empty key is
    ///   stored under `GROQAPIKey`.
    private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig {
        let storedKey = UserDefaults.standard.string(forKey: "GROQAPIKey") ?? ""
        if storedKey.isEmpty {
            throw CloudTranscriptionError.missingAPIKey
        }

        return APIConfig(
            url: URL(string: "https://api.groq.com/openai/v1/audio/transcriptions")!,
            apiKey: storedKey,
            modelName: model.name
        )
    }

    /// Assembles the multipart body: the audio file followed by the `model`,
    /// optional `language`, optional `prompt`, `response_format` and
    /// `temperature` fields.
    ///
    /// - Throws: `CloudTranscriptionError.audioFileNotFound` when the file cannot be read.
    private func createOpenAICompatibleRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data {
        guard let audioData = try? Data(contentsOf: audioURL) else {
            throw CloudTranscriptionError.audioFileNotFound
        }

        let crlf = "\r\n"
        let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto"
        let prompt = UserDefaults.standard.string(forKey: "TranscriptionPrompt") ?? ""
        var body = Data()

        // Appends the UTF-8 bytes of `text` to the body under construction.
        func append(_ text: String) {
            body.append(Data(text.utf8))
        }

        // Audio file part.
        append("--\(boundary)\(crlf)")
        append("Content-Disposition: form-data; name=\"file\"; filename=\"\(audioURL.lastPathComponent)\"\(crlf)")
        append("Content-Type: audio/wav\(crlf)\(crlf)")
        body.append(audioData)
        append(crlf)

        // Plain text fields, collected in the order they are written.
        var textFields: [(header: String, value: String)] = [
            ("Content-Disposition: form-data; name=\"model\"\(crlf)\(crlf)", modelName)
        ]

        if selectedLanguage != "auto", !selectedLanguage.isEmpty {
            textFields.append(("Content-Disposition: form-data; name=\"language\"\(crlf)\(crlf)", selectedLanguage))
        }

        // Include prompt for OpenAI-compatible APIs
        if !prompt.isEmpty {
            textFields.append(("Content-Disposition: form-data; name=\"prompt\"\(crlf)\(crlf)", prompt))
        }

        textFields.append(("Content-Disposition: form-data; name=\"response_format\"\(crlf)\(crlf)", "json"))
        textFields.append(("Content-Disposition: form-data; name=\"temperature\"\(crlf)\(crlf)", "0"))

        for field in textFields {
            append("--\(boundary)\(crlf)")
            append(field.header)
            append(field.value)
            append(crlf)
        }

        // Closing boundary.
        append("--\(boundary)--\(crlf)")

        return body
    }

    /// Everything needed to issue one Groq request.
    private struct APIConfig {
        let url: URL
        let apiKey: String
        let modelName: String
    }

    /// Response shape decoded by this service; only `text` is read.
    private struct TranscriptionResponse: Decodable {
        let text: String
        let language: String?
        let duration: Double?
        let x_groq: GroqMetadata?

        struct GroqMetadata: Decodable {
            let id: String?
        }
    }
}
|
||||
@ -1,211 +0,0 @@
|
||||
import Foundation
|
||||
import os
|
||||
|
||||
enum CloudTranscriptionError: Error, LocalizedError {
|
||||
case unsupportedProvider
|
||||
case missingAPIKey
|
||||
case invalidAPIKey
|
||||
case audioFileNotFound
|
||||
case apiRequestFailed(statusCode: Int, message: String)
|
||||
case networkError(Error)
|
||||
case noTranscriptionReturned
|
||||
case dataEncodingError
|
||||
|
||||
var errorDescription: String? {
|
||||
switch self {
|
||||
case .unsupportedProvider:
|
||||
return "The model provider is not supported by this service."
|
||||
case .missingAPIKey:
|
||||
return "API key for this service is missing. Please configure it in the settings."
|
||||
case .invalidAPIKey:
|
||||
return "The provided API key is invalid."
|
||||
case .audioFileNotFound:
|
||||
return "The audio file to transcribe could not be found."
|
||||
case .apiRequestFailed(let statusCode, let message):
|
||||
return "The API request failed with status code \(statusCode): \(message)"
|
||||
case .networkError(let error):
|
||||
return "A network error occurred: \(error.localizedDescription)"
|
||||
case .noTranscriptionReturned:
|
||||
return "The API returned an empty or invalid response."
|
||||
case .dataEncodingError:
|
||||
return "Failed to encode the request body."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class CloudTranscriptionService: TranscriptionService {
|
||||
|
||||
private struct APIConfig {
|
||||
let url: URL
|
||||
let apiKey: String
|
||||
let modelName: String
|
||||
}
|
||||
|
||||
func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
|
||||
let config = try getAPIConfig(for: model)
|
||||
|
||||
var request: URLRequest
|
||||
var body: Data
|
||||
|
||||
switch model.provider {
|
||||
case .elevenLabs:
|
||||
let boundary = "Boundary-\(UUID().uuidString)"
|
||||
var elevenLabsRequest = URLRequest(url: config.url)
|
||||
elevenLabsRequest.httpMethod = "POST"
|
||||
elevenLabsRequest.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type")
|
||||
elevenLabsRequest.setValue(config.apiKey, forHTTPHeaderField: "xi-api-key")
|
||||
body = try createElevenLabsRequestBody(audioURL: audioURL, modelName: config.modelName, boundary: boundary)
|
||||
request = elevenLabsRequest
|
||||
|
||||
case .groq:
|
||||
let boundary = "Boundary-\(UUID().uuidString)"
|
||||
var openAICompatibleRequest = URLRequest(url: config.url)
|
||||
openAICompatibleRequest.httpMethod = "POST"
|
||||
openAICompatibleRequest.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type")
|
||||
openAICompatibleRequest.setValue("Bearer \(config.apiKey)", forHTTPHeaderField: "Authorization")
|
||||
body = try createOpenAICompatibleRequestBody(audioURL: audioURL, modelName: config.modelName, boundary: boundary)
|
||||
request = openAICompatibleRequest
|
||||
|
||||
|
||||
|
||||
default:
|
||||
throw CloudTranscriptionError.unsupportedProvider
|
||||
}
|
||||
|
||||
let (data, response) = try await URLSession.shared.upload(for: request, from: body)
|
||||
guard let httpResponse = response as? HTTPURLResponse else {
|
||||
throw CloudTranscriptionError.networkError(URLError(.badServerResponse))
|
||||
}
|
||||
|
||||
if !(200...299).contains(httpResponse.statusCode) {
|
||||
let errorMessage = String(data: data, encoding: .utf8) ?? "No error message"
|
||||
throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage)
|
||||
}
|
||||
|
||||
do {
|
||||
let transcriptionResponse = try JSONDecoder().decode(TranscriptionResponse.self, from: data)
|
||||
return transcriptionResponse.text
|
||||
} catch {
|
||||
throw CloudTranscriptionError.noTranscriptionReturned
|
||||
}
|
||||
}
|
||||
|
||||
private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig {
|
||||
let providerKey: String
|
||||
let apiURL: URL
|
||||
|
||||
switch model.provider {
|
||||
case .groq:
|
||||
providerKey = "GROQ"
|
||||
apiURL = URL(string: "https://api.groq.com/openai/v1/audio/transcriptions")!
|
||||
case .elevenLabs:
|
||||
providerKey = "ElevenLabs"
|
||||
apiURL = URL(string: "https://api.elevenlabs.io/v1/speech-to-text")!
|
||||
default:
|
||||
throw CloudTranscriptionError.unsupportedProvider
|
||||
}
|
||||
|
||||
guard let apiKey = UserDefaults.standard.string(forKey: "\(providerKey)APIKey"), !apiKey.isEmpty else {
|
||||
throw CloudTranscriptionError.missingAPIKey
|
||||
}
|
||||
|
||||
return APIConfig(url: apiURL, apiKey: apiKey, modelName: model.name)
|
||||
}
|
||||
|
||||
private func createElevenLabsRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data {
|
||||
var body = Data()
|
||||
let crlf = "\r\n"
|
||||
|
||||
guard let audioData = try? Data(contentsOf: audioURL) else {
|
||||
throw CloudTranscriptionError.audioFileNotFound
|
||||
}
|
||||
|
||||
// File
|
||||
body.append("--\(boundary)\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Disposition: form-data; name=\"file\"; filename=\"\(audioURL.lastPathComponent)\"\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Type: audio/wav\(crlf)\(crlf)".data(using: .utf8)!)
|
||||
body.append(audioData)
|
||||
body.append(crlf.data(using: .utf8)!)
|
||||
|
||||
// Model ID
|
||||
body.append("--\(boundary)\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Disposition: form-data; name=\"model_id\"\(crlf)\(crlf)".data(using: .utf8)!)
|
||||
body.append(modelName.data(using: .utf8)!)
|
||||
body.append(crlf.data(using: .utf8)!)
|
||||
|
||||
let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto"
|
||||
if selectedLanguage != "auto", !selectedLanguage.isEmpty {
|
||||
body.append("--\(boundary)\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Disposition: form-data; name=\"language_code\"\(crlf)\(crlf)".data(using: .utf8)!)
|
||||
body.append(selectedLanguage.data(using: .utf8)!)
|
||||
body.append(crlf.data(using: .utf8)!)
|
||||
}
|
||||
|
||||
body.append("--\(boundary)--\(crlf)".data(using: .utf8)!)
|
||||
|
||||
return body
|
||||
}
|
||||
|
||||
private func createOpenAICompatibleRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data {
|
||||
var body = Data()
|
||||
let crlf = "\r\n"
|
||||
|
||||
guard let audioData = try? Data(contentsOf: audioURL) else {
|
||||
throw CloudTranscriptionError.audioFileNotFound
|
||||
}
|
||||
|
||||
let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto"
|
||||
let prompt = UserDefaults.standard.string(forKey: "TranscriptionPrompt") ?? ""
|
||||
body.append("--\(boundary)\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Disposition: form-data; name=\"file\"; filename=\"\(audioURL.lastPathComponent)\"\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Type: audio/wav\(crlf)\(crlf)".data(using: .utf8)!)
|
||||
body.append(audioData)
|
||||
body.append(crlf.data(using: .utf8)!)
|
||||
|
||||
body.append("--\(boundary)\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Disposition: form-data; name=\"model\"\(crlf)\(crlf)".data(using: .utf8)!)
|
||||
body.append(modelName.data(using: .utf8)!)
|
||||
body.append(crlf.data(using: .utf8)!)
|
||||
|
||||
if selectedLanguage != "auto", !selectedLanguage.isEmpty {
|
||||
body.append("--\(boundary)\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Disposition: form-data; name=\"language\"\(crlf)\(crlf)".data(using: .utf8)!)
|
||||
body.append(selectedLanguage.data(using: .utf8)!)
|
||||
body.append(crlf.data(using: .utf8)!)
|
||||
}
|
||||
|
||||
// Include prompt for OpenAI-compatible APIs
|
||||
if !prompt.isEmpty {
|
||||
body.append("--\(boundary)\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Disposition: form-data; name=\"prompt\"\(crlf)\(crlf)".data(using: .utf8)!)
|
||||
body.append(prompt.data(using: .utf8)!)
|
||||
body.append(crlf.data(using: .utf8)!)
|
||||
}
|
||||
|
||||
body.append("--\(boundary)\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Disposition: form-data; name=\"response_format\"\(crlf)\(crlf)".data(using: .utf8)!)
|
||||
body.append("json".data(using: .utf8)!)
|
||||
body.append(crlf.data(using: .utf8)!)
|
||||
|
||||
body.append("--\(boundary)\(crlf)".data(using: .utf8)!)
|
||||
body.append("Content-Disposition: form-data; name=\"temperature\"\(crlf)\(crlf)".data(using: .utf8)!)
|
||||
body.append("0".data(using: .utf8)!)
|
||||
body.append(crlf.data(using: .utf8)!)
|
||||
body.append("--\(boundary)--\(crlf)".data(using: .utf8)!)
|
||||
|
||||
return body
|
||||
}
|
||||
|
||||
private struct TranscriptionResponse: Decodable {
|
||||
let text: String
|
||||
let language: String?
|
||||
let duration: Double?
|
||||
let x_groq: GroqMetadata?
|
||||
|
||||
struct GroqMetadata: Decodable {
|
||||
let id: String?
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -387,6 +387,8 @@ struct APIKeyManagementView: View {
|
||||
URL(string: "https://console.mistral.ai/api-keys")!
|
||||
case .elevenLabs:
|
||||
URL(string: "https://elevenlabs.io/speech-synthesis")!
|
||||
case .deepgram:
|
||||
URL(string: "https://console.deepgram.com/api-keys")!
|
||||
case .ollama, .custom:
|
||||
URL(string: "")! // This case should never be reached
|
||||
}
|
||||
|
||||
@ -29,7 +29,7 @@ struct ModelCardRowView: View {
|
||||
downloadAction: downloadAction
|
||||
)
|
||||
}
|
||||
case .groq, .elevenLabs:
|
||||
case .groq, .elevenLabs, .deepgram:
|
||||
if let cloudModel = model as? CloudModel {
|
||||
CloudModelCardView(
|
||||
model: cloudModel,
|
||||
@ -259,6 +259,8 @@ struct CloudModelCardView: View {
|
||||
return "GROQ"
|
||||
case .elevenLabs:
|
||||
return "ElevenLabs"
|
||||
case .deepgram:
|
||||
return "Deepgram"
|
||||
default:
|
||||
return model.provider.rawValue
|
||||
}
|
||||
@ -497,6 +499,8 @@ struct CloudModelCardView: View {
|
||||
aiService.selectedProvider = .groq
|
||||
} else if model.provider == .elevenLabs {
|
||||
aiService.selectedProvider = .elevenLabs
|
||||
} else if model.provider == .deepgram {
|
||||
aiService.selectedProvider = .deepgram
|
||||
}
|
||||
|
||||
aiService.verifyAPIKey(apiKey) { [self] isValid in
|
||||
|
||||
@ -12,6 +12,9 @@ extension WhisperState {
|
||||
case .elevenLabs:
|
||||
let key = UserDefaults.standard.string(forKey: "ElevenLabsAPIKey")
|
||||
return key != nil && !key!.isEmpty
|
||||
case .deepgram:
|
||||
let key = UserDefaults.standard.string(forKey: "DeepgramAPIKey")
|
||||
return key != nil && !key!.isEmpty
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user