diff --git a/VoiceInk/Models/PredefinedModels.swift b/VoiceInk/Models/PredefinedModels.swift index 2681aca..65526c4 100644 --- a/VoiceInk/Models/PredefinedModels.swift +++ b/VoiceInk/Models/PredefinedModels.swift @@ -144,7 +144,16 @@ import Foundation isMultilingual: true, supportedLanguages: getLanguageDictionary(isMultilingual: true, isLargeV3: true) ), - + CloudModel( + name: "nova-2", + displayName: "Nova (Deepgram)", + description: "Deepgram's Nova model for fast, accurate, and cost-effective transcription.", + provider: .deepgram, + speed: 0.9, + accuracy: 0.95, + isMultilingual: true, + supportedLanguages: getLanguageDictionary(isMultilingual: true, isLargeV3: true) + ), ] static let allLanguages = [ diff --git a/VoiceInk/Models/TranscriptionModel.swift b/VoiceInk/Models/TranscriptionModel.swift index 47d53e0..39941f1 100644 --- a/VoiceInk/Models/TranscriptionModel.swift +++ b/VoiceInk/Models/TranscriptionModel.swift @@ -5,6 +5,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable { case local = "Local" case groq = "Groq" case elevenLabs = "ElevenLabs" + case deepgram = "Deepgram" // Future providers can be added here } diff --git a/VoiceInk/Services/AIService.swift b/VoiceInk/Services/AIService.swift index c85125c..14070a1 100644 --- a/VoiceInk/Services/AIService.swift +++ b/VoiceInk/Services/AIService.swift @@ -10,6 +10,7 @@ enum AIProvider: String, CaseIterable { case mistral = "Mistral" case ollama = "Ollama" case elevenLabs = "ElevenLabs" + case deepgram = "Deepgram" case custom = "Custom" var baseURL: String { @@ -30,6 +31,8 @@ enum AIProvider: String, CaseIterable { return "https://api.elevenlabs.io/v1/speech-to-text" case .ollama: return UserDefaults.standard.string(forKey: "ollamaBaseURL") ?? "http://localhost:11434" + case .deepgram: + return "https://api.deepgram.com/v1/listen" case .custom: return UserDefaults.standard.string(forKey: "customProviderBaseURL") ?? 
"" } @@ -53,6 +56,8 @@ enum AIProvider: String, CaseIterable { return "scribe_v1" case .ollama: return UserDefaults.standard.string(forKey: "ollamaSelectedModel") ?? "mistral" + case .deepgram: + return "nova-2" + case .custom: return UserDefaults.standard.string(forKey: "customProviderModel") ?? "" } @@ -97,6 +102,8 @@ enum AIProvider: String, CaseIterable { return ["scribe_v1", "scribe_v1_experimental"] case .ollama: return [] + case .deepgram: + return ["nova-2", "nova-3"] + case .custom: return [] } @@ -266,6 +273,8 @@ class AIService: ObservableObject { verifyAnthropicAPIKey(key, completion: completion) case .elevenLabs: verifyElevenLabsAPIKey(key, completion: completion) + case .deepgram: + verifyDeepgramAPIKey(key, completion: completion) default: verifyOpenAICompatibleAPIKey(key, completion: completion) } @@ -400,7 +409,26 @@ class AIService: ObservableObject { }.resume() } - + private func verifyDeepgramAPIKey(_ key: String, completion: @escaping (Bool) -> Void) { + let url = URL(string: "https://api.deepgram.com/v1/auth/token")! + var request = URLRequest(url: url) + request.httpMethod = "GET" + request.addValue("Token \(key)", forHTTPHeaderField: "Authorization") + + URLSession.shared.dataTask(with: request) { data, response, error in + if let error = error { + self.logger.error("Deepgram API key verification failed: \(error.localizedDescription)") + completion(false) + return + } + + if let httpResponse = response as?
HTTPURLResponse { + completion(httpResponse.statusCode == 200) + } else { + completion(false) + } + }.resume() + } func clearAPIKey() { guard selectedProvider.requiresAPIKey else { return } diff --git a/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift b/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift new file mode 100644 index 0000000..28299c8 --- /dev/null +++ b/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift @@ -0,0 +1,57 @@ +import Foundation +import os + +enum CloudTranscriptionError: Error, LocalizedError { + case unsupportedProvider + case missingAPIKey + case invalidAPIKey + case audioFileNotFound + case apiRequestFailed(statusCode: Int, message: String) + case networkError(Error) + case noTranscriptionReturned + case dataEncodingError + + var errorDescription: String? { + switch self { + case .unsupportedProvider: + return "The model provider is not supported by this service." + case .missingAPIKey: + return "API key for this service is missing. Please configure it in the settings." + case .invalidAPIKey: + return "The provided API key is invalid." + case .audioFileNotFound: + return "The audio file to transcribe could not be found." + case .apiRequestFailed(let statusCode, let message): + return "The API request failed with status code \(statusCode): \(message)" + case .networkError(let error): + return "A network error occurred: \(error.localizedDescription)" + case .noTranscriptionReturned: + return "The API returned an empty or invalid response." + case .dataEncodingError: + return "Failed to encode the request body." 
+ } + } +} + +class CloudTranscriptionService: TranscriptionService { + + private let groqService = GroqTranscriptionService() + private let elevenLabsService = ElevenLabsTranscriptionService() + private let deepgramService = DeepgramTranscriptionService() + + func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { + switch model.provider { + case .groq: + return try await groqService.transcribe(audioURL: audioURL, model: model) + case .elevenLabs: + return try await elevenLabsService.transcribe(audioURL: audioURL, model: model) + case .deepgram: + return try await deepgramService.transcribe(audioURL: audioURL, model: model) + default: + throw CloudTranscriptionError.unsupportedProvider + } + } + + + +} \ No newline at end of file diff --git a/VoiceInk/Services/CloudTranscription/DeepgramTranscriptionService.swift b/VoiceInk/Services/CloudTranscription/DeepgramTranscriptionService.swift new file mode 100644 index 0000000..d2b6b91 --- /dev/null +++ b/VoiceInk/Services/CloudTranscription/DeepgramTranscriptionService.swift @@ -0,0 +1,103 @@ +import Foundation +import os + +class DeepgramTranscriptionService { + private let logger = Logger(subsystem: "com.voiceink.transcription", category: "DeepgramService") + + func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { + let config = try getAPIConfig(for: model) + + var request = URLRequest(url: config.url) + request.httpMethod = "POST" + request.setValue("Token \(config.apiKey)", forHTTPHeaderField: "Authorization") + request.setValue("audio/wav", forHTTPHeaderField: "Content-Type") + + guard let audioData = try? Data(contentsOf: audioURL) else { + throw CloudTranscriptionError.audioFileNotFound + } + + let (data, response) = try await URLSession.shared.upload(for: request, from: audioData) + guard let httpResponse = response as? 
HTTPURLResponse else { + throw CloudTranscriptionError.networkError(URLError(.badServerResponse)) + } + + if !(200...299).contains(httpResponse.statusCode) { + let errorMessage = String(data: data, encoding: .utf8) ?? "No error message" + logger.error("Deepgram API request failed with status \(httpResponse.statusCode): \(errorMessage, privacy: .public)") + throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage) + } + + do { + let transcriptionResponse = try JSONDecoder().decode(DeepgramResponse.self, from: data) + guard let transcript = transcriptionResponse.results.channels.first?.alternatives.first?.transcript, + !transcript.isEmpty else { + logger.error("No transcript found in Deepgram response") + throw CloudTranscriptionError.noTranscriptionReturned + } + return transcript + } catch { + logger.error("Failed to decode Deepgram API response: \(error.localizedDescription)") + throw CloudTranscriptionError.noTranscriptionReturned + } + } + + private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig { + guard let apiKey = UserDefaults.standard.string(forKey: "DeepgramAPIKey"), !apiKey.isEmpty else { + throw CloudTranscriptionError.missingAPIKey + } + + // Build the URL with query parameters + var components = URLComponents(string: "https://api.deepgram.com/v1/listen")! + var queryItems: [URLQueryItem] = [] + + // Add language parameter if not auto-detect + let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto" + + // Choose model based on language + let modelName = selectedLanguage == "en" ? 
"nova-3" : "nova-2" + queryItems.append(URLQueryItem(name: "model", value: modelName)) + + queryItems.append(contentsOf: [ + URLQueryItem(name: "smart_format", value: "true"), + URLQueryItem(name: "dictation", value: "true"), + URLQueryItem(name: "punctuate", value: "true"), + URLQueryItem(name: "paragraphs", value: "true"), + URLQueryItem(name: "filler_words", value: "false") + ]) + + if selectedLanguage != "auto" && !selectedLanguage.isEmpty { + queryItems.append(URLQueryItem(name: "language", value: selectedLanguage)) + } + + components.queryItems = queryItems + + guard let apiURL = components.url else { + throw CloudTranscriptionError.dataEncodingError + } + + return APIConfig(url: apiURL, apiKey: apiKey, modelName: model.name) + } + + private struct APIConfig { + let url: URL + let apiKey: String + let modelName: String + } + + private struct DeepgramResponse: Decodable { + let results: Results + + struct Results: Decodable { + let channels: [Channel] + + struct Channel: Decodable { + let alternatives: [Alternative] + + struct Alternative: Decodable { + let transcript: String + let confidence: Double? 
+ } + } + } + } +} \ No newline at end of file diff --git a/VoiceInk/Services/CloudTranscription/ElevenLabsTranscriptionService.swift b/VoiceInk/Services/CloudTranscription/ElevenLabsTranscriptionService.swift new file mode 100644 index 0000000..14c31ae --- /dev/null +++ b/VoiceInk/Services/CloudTranscription/ElevenLabsTranscriptionService.swift @@ -0,0 +1,93 @@ +import Foundation + +class ElevenLabsTranscriptionService { + + func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { + let config = try getAPIConfig(for: model) + + let boundary = "Boundary-\(UUID().uuidString)" + var request = URLRequest(url: config.url) + request.httpMethod = "POST" + request.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type") + request.setValue(config.apiKey, forHTTPHeaderField: "xi-api-key") + + let body = try createElevenLabsRequestBody(audioURL: audioURL, modelName: config.modelName, boundary: boundary) + + let (data, response) = try await URLSession.shared.upload(for: request, from: body) + guard let httpResponse = response as? HTTPURLResponse else { + throw CloudTranscriptionError.networkError(URLError(.badServerResponse)) + } + + if !(200...299).contains(httpResponse.statusCode) { + let errorMessage = String(data: data, encoding: .utf8) ?? "No error message" + throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage) + } + + do { + let transcriptionResponse = try JSONDecoder().decode(TranscriptionResponse.self, from: data) + return transcriptionResponse.text + } catch { + throw CloudTranscriptionError.noTranscriptionReturned + } + } + + private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig { + guard let apiKey = UserDefaults.standard.string(forKey: "ElevenLabsAPIKey"), !apiKey.isEmpty else { + throw CloudTranscriptionError.missingAPIKey + } + + let apiURL = URL(string: "https://api.elevenlabs.io/v1/speech-to-text")! 
+ return APIConfig(url: apiURL, apiKey: apiKey, modelName: model.name) + } + + private func createElevenLabsRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data { + var body = Data() + let crlf = "\r\n" + + guard let audioData = try? Data(contentsOf: audioURL) else { + throw CloudTranscriptionError.audioFileNotFound + } + + // File + body.append("--\(boundary)\(crlf)".data(using: .utf8)!) + body.append("Content-Disposition: form-data; name=\"file\"; filename=\"\(audioURL.lastPathComponent)\"\(crlf)".data(using: .utf8)!) + body.append("Content-Type: audio/wav\(crlf)\(crlf)".data(using: .utf8)!) + body.append(audioData) + body.append(crlf.data(using: .utf8)!) + + // Model ID + body.append("--\(boundary)\(crlf)".data(using: .utf8)!) + body.append("Content-Disposition: form-data; name=\"model_id\"\(crlf)\(crlf)".data(using: .utf8)!) + body.append(modelName.data(using: .utf8)!) + body.append(crlf.data(using: .utf8)!) + + let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto" + if selectedLanguage != "auto", !selectedLanguage.isEmpty { + body.append("--\(boundary)\(crlf)".data(using: .utf8)!) + body.append("Content-Disposition: form-data; name=\"language_code\"\(crlf)\(crlf)".data(using: .utf8)!) + body.append(selectedLanguage.data(using: .utf8)!) + body.append(crlf.data(using: .utf8)!) + } + + body.append("--\(boundary)--\(crlf)".data(using: .utf8)!) + + return body + } + + private struct APIConfig { + let url: URL + let apiKey: String + let modelName: String + } + + private struct TranscriptionResponse: Decodable { + let text: String + let language: String? + let duration: Double? + let x_groq: GroqMetadata? + + struct GroqMetadata: Decodable { + let id: String? 
+ } + } +} \ No newline at end of file diff --git a/VoiceInk/Services/CloudTranscription/GroqTranscriptionService.swift b/VoiceInk/Services/CloudTranscription/GroqTranscriptionService.swift new file mode 100644 index 0000000..a7587f7 --- /dev/null +++ b/VoiceInk/Services/CloudTranscription/GroqTranscriptionService.swift @@ -0,0 +1,114 @@ +import Foundation +import os + +class GroqTranscriptionService { + private let logger = Logger(subsystem: "com.voiceink.transcription", category: "GroqService") + + func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { + let config = try getAPIConfig(for: model) + + let boundary = "Boundary-\(UUID().uuidString)" + var request = URLRequest(url: config.url) + request.httpMethod = "POST" + request.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type") + request.setValue("Bearer \(config.apiKey)", forHTTPHeaderField: "Authorization") + + let body = try createOpenAICompatibleRequestBody(audioURL: audioURL, modelName: config.modelName, boundary: boundary) + + let (data, response) = try await URLSession.shared.upload(for: request, from: body) + guard let httpResponse = response as? HTTPURLResponse else { + throw CloudTranscriptionError.networkError(URLError(.badServerResponse)) + } + + if !(200...299).contains(httpResponse.statusCode) { + let errorMessage = String(data: data, encoding: .utf8) ?? 
"No error message" + logger.error("Groq API request failed with status \(httpResponse.statusCode): \(errorMessage, privacy: .public)") + throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage) + } + + do { + let transcriptionResponse = try JSONDecoder().decode(TranscriptionResponse.self, from: data) + return transcriptionResponse.text + } catch { + logger.error("Failed to decode Groq API response: \(error.localizedDescription)") + throw CloudTranscriptionError.noTranscriptionReturned + } + } + + private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig { + guard let apiKey = UserDefaults.standard.string(forKey: "GROQAPIKey"), !apiKey.isEmpty else { + throw CloudTranscriptionError.missingAPIKey + } + + let apiURL = URL(string: "https://api.groq.com/openai/v1/audio/transcriptions")! + return APIConfig(url: apiURL, apiKey: apiKey, modelName: model.name) + } + + private func createOpenAICompatibleRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data { + var body = Data() + let crlf = "\r\n" + + guard let audioData = try? Data(contentsOf: audioURL) else { + throw CloudTranscriptionError.audioFileNotFound + } + + let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto" + let prompt = UserDefaults.standard.string(forKey: "TranscriptionPrompt") ?? "" + + body.append("--\(boundary)\(crlf)".data(using: .utf8)!) + body.append("Content-Disposition: form-data; name=\"file\"; filename=\"\(audioURL.lastPathComponent)\"\(crlf)".data(using: .utf8)!) + body.append("Content-Type: audio/wav\(crlf)\(crlf)".data(using: .utf8)!) + body.append(audioData) + body.append(crlf.data(using: .utf8)!) + + body.append("--\(boundary)\(crlf)".data(using: .utf8)!) + body.append("Content-Disposition: form-data; name=\"model\"\(crlf)\(crlf)".data(using: .utf8)!) + body.append(modelName.data(using: .utf8)!) + body.append(crlf.data(using: .utf8)!) 
+ + if selectedLanguage != "auto", !selectedLanguage.isEmpty { + body.append("--\(boundary)\(crlf)".data(using: .utf8)!) + body.append("Content-Disposition: form-data; name=\"language\"\(crlf)\(crlf)".data(using: .utf8)!) + body.append(selectedLanguage.data(using: .utf8)!) + body.append(crlf.data(using: .utf8)!) + } + + // Include prompt for OpenAI-compatible APIs + if !prompt.isEmpty { + body.append("--\(boundary)\(crlf)".data(using: .utf8)!) + body.append("Content-Disposition: form-data; name=\"prompt\"\(crlf)\(crlf)".data(using: .utf8)!) + body.append(prompt.data(using: .utf8)!) + body.append(crlf.data(using: .utf8)!) + } + + body.append("--\(boundary)\(crlf)".data(using: .utf8)!) + body.append("Content-Disposition: form-data; name=\"response_format\"\(crlf)\(crlf)".data(using: .utf8)!) + body.append("json".data(using: .utf8)!) + body.append(crlf.data(using: .utf8)!) + + body.append("--\(boundary)\(crlf)".data(using: .utf8)!) + body.append("Content-Disposition: form-data; name=\"temperature\"\(crlf)\(crlf)".data(using: .utf8)!) + body.append("0".data(using: .utf8)!) + body.append(crlf.data(using: .utf8)!) + body.append("--\(boundary)--\(crlf)".data(using: .utf8)!) + + return body + } + + private struct APIConfig { + let url: URL + let apiKey: String + let modelName: String + } + + private struct TranscriptionResponse: Decodable { + let text: String + let language: String? + let duration: Double? + let x_groq: GroqMetadata? + + struct GroqMetadata: Decodable { + let id: String? 
+ } + } +} \ No newline at end of file diff --git a/VoiceInk/Services/CloudTranscriptionService.swift b/VoiceInk/Services/CloudTranscriptionService.swift deleted file mode 100644 index ee66625..0000000 --- a/VoiceInk/Services/CloudTranscriptionService.swift +++ /dev/null @@ -1,211 +0,0 @@ -import Foundation -import os - -enum CloudTranscriptionError: Error, LocalizedError { - case unsupportedProvider - case missingAPIKey - case invalidAPIKey - case audioFileNotFound - case apiRequestFailed(statusCode: Int, message: String) - case networkError(Error) - case noTranscriptionReturned - case dataEncodingError - - var errorDescription: String? { - switch self { - case .unsupportedProvider: - return "The model provider is not supported by this service." - case .missingAPIKey: - return "API key for this service is missing. Please configure it in the settings." - case .invalidAPIKey: - return "The provided API key is invalid." - case .audioFileNotFound: - return "The audio file to transcribe could not be found." - case .apiRequestFailed(let statusCode, let message): - return "The API request failed with status code \(statusCode): \(message)" - case .networkError(let error): - return "A network error occurred: \(error.localizedDescription)" - case .noTranscriptionReturned: - return "The API returned an empty or invalid response." - case .dataEncodingError: - return "Failed to encode the request body." 
- } - } -} - -class CloudTranscriptionService: TranscriptionService { - - private struct APIConfig { - let url: URL - let apiKey: String - let modelName: String - } - - func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { - let config = try getAPIConfig(for: model) - - var request: URLRequest - var body: Data - - switch model.provider { - case .elevenLabs: - let boundary = "Boundary-\(UUID().uuidString)" - var elevenLabsRequest = URLRequest(url: config.url) - elevenLabsRequest.httpMethod = "POST" - elevenLabsRequest.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type") - elevenLabsRequest.setValue(config.apiKey, forHTTPHeaderField: "xi-api-key") - body = try createElevenLabsRequestBody(audioURL: audioURL, modelName: config.modelName, boundary: boundary) - request = elevenLabsRequest - - case .groq: - let boundary = "Boundary-\(UUID().uuidString)" - var openAICompatibleRequest = URLRequest(url: config.url) - openAICompatibleRequest.httpMethod = "POST" - openAICompatibleRequest.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type") - openAICompatibleRequest.setValue("Bearer \(config.apiKey)", forHTTPHeaderField: "Authorization") - body = try createOpenAICompatibleRequestBody(audioURL: audioURL, modelName: config.modelName, boundary: boundary) - request = openAICompatibleRequest - - - - default: - throw CloudTranscriptionError.unsupportedProvider - } - - let (data, response) = try await URLSession.shared.upload(for: request, from: body) - guard let httpResponse = response as? HTTPURLResponse else { - throw CloudTranscriptionError.networkError(URLError(.badServerResponse)) - } - - if !(200...299).contains(httpResponse.statusCode) { - let errorMessage = String(data: data, encoding: .utf8) ?? 
"No error message" - throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage) - } - - do { - let transcriptionResponse = try JSONDecoder().decode(TranscriptionResponse.self, from: data) - return transcriptionResponse.text - } catch { - throw CloudTranscriptionError.noTranscriptionReturned - } - } - - private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig { - let providerKey: String - let apiURL: URL - - switch model.provider { - case .groq: - providerKey = "GROQ" - apiURL = URL(string: "https://api.groq.com/openai/v1/audio/transcriptions")! - case .elevenLabs: - providerKey = "ElevenLabs" - apiURL = URL(string: "https://api.elevenlabs.io/v1/speech-to-text")! - default: - throw CloudTranscriptionError.unsupportedProvider - } - - guard let apiKey = UserDefaults.standard.string(forKey: "\(providerKey)APIKey"), !apiKey.isEmpty else { - throw CloudTranscriptionError.missingAPIKey - } - - return APIConfig(url: apiURL, apiKey: apiKey, modelName: model.name) - } - - private func createElevenLabsRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data { - var body = Data() - let crlf = "\r\n" - - guard let audioData = try? Data(contentsOf: audioURL) else { - throw CloudTranscriptionError.audioFileNotFound - } - - // File - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"file\"; filename=\"\(audioURL.lastPathComponent)\"\(crlf)".data(using: .utf8)!) - body.append("Content-Type: audio/wav\(crlf)\(crlf)".data(using: .utf8)!) - body.append(audioData) - body.append(crlf.data(using: .utf8)!) - - // Model ID - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"model_id\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append(modelName.data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) 
- - let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto" - if selectedLanguage != "auto", !selectedLanguage.isEmpty { - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"language_code\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append(selectedLanguage.data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) - } - - body.append("--\(boundary)--\(crlf)".data(using: .utf8)!) - - return body - } - - private func createOpenAICompatibleRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data { - var body = Data() - let crlf = "\r\n" - - guard let audioData = try? Data(contentsOf: audioURL) else { - throw CloudTranscriptionError.audioFileNotFound - } - - let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto" - let prompt = UserDefaults.standard.string(forKey: "TranscriptionPrompt") ?? "" - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"file\"; filename=\"\(audioURL.lastPathComponent)\"\(crlf)".data(using: .utf8)!) - body.append("Content-Type: audio/wav\(crlf)\(crlf)".data(using: .utf8)!) - body.append(audioData) - body.append(crlf.data(using: .utf8)!) - - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"model\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append(modelName.data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) - - if selectedLanguage != "auto", !selectedLanguage.isEmpty { - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"language\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append(selectedLanguage.data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) - } - - // Include prompt for OpenAI-compatible APIs - if !prompt.isEmpty { - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) 
- body.append("Content-Disposition: form-data; name=\"prompt\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append(prompt.data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) - } - - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"response_format\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append("json".data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) - - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"temperature\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append("0".data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) - body.append("--\(boundary)--\(crlf)".data(using: .utf8)!) - - return body - } - - private struct TranscriptionResponse: Decodable { - let text: String - let language: String? - let duration: Double? - let x_groq: GroqMetadata? - - struct GroqMetadata: Decodable { - let id: String? - } - } - - -} \ No newline at end of file diff --git a/VoiceInk/Views/APIKeyManagementView.swift b/VoiceInk/Views/APIKeyManagementView.swift index 902a1c7..9ebf516 100644 --- a/VoiceInk/Views/APIKeyManagementView.swift +++ b/VoiceInk/Views/APIKeyManagementView.swift @@ -387,6 +387,8 @@ struct APIKeyManagementView: View { URL(string: "https://console.mistral.ai/api-keys")! case .elevenLabs: URL(string: "https://elevenlabs.io/speech-synthesis")! + case .deepgram: + URL(string: "https://console.deepgram.com/api-keys")! case .ollama, .custom: URL(string: "")! // This case should never be reached } diff --git a/VoiceInk/Views/ModelCardRowView.swift b/VoiceInk/Views/ModelCardRowView.swift index eb805e5..457a6da 100644 --- a/VoiceInk/Views/ModelCardRowView.swift +++ b/VoiceInk/Views/ModelCardRowView.swift @@ -29,7 +29,7 @@ struct ModelCardRowView: View { downloadAction: downloadAction ) } - case .groq, .elevenLabs: + case .groq, .elevenLabs, .deepgram: if let cloudModel = model as? 
CloudModel { CloudModelCardView( model: cloudModel, @@ -259,6 +259,8 @@ struct CloudModelCardView: View { return "GROQ" case .elevenLabs: return "ElevenLabs" + case .deepgram: + return "Deepgram" default: return model.provider.rawValue } @@ -497,6 +499,8 @@ struct CloudModelCardView: View { aiService.selectedProvider = .groq } else if model.provider == .elevenLabs { aiService.selectedProvider = .elevenLabs + } else if model.provider == .deepgram { + aiService.selectedProvider = .deepgram } aiService.verifyAPIKey(apiKey) { [self] isValid in diff --git a/VoiceInk/Whisper/WhisperState+ModelQueries.swift b/VoiceInk/Whisper/WhisperState+ModelQueries.swift index 4058b9b..b74666a 100644 --- a/VoiceInk/Whisper/WhisperState+ModelQueries.swift +++ b/VoiceInk/Whisper/WhisperState+ModelQueries.swift @@ -12,6 +12,9 @@ extension WhisperState { case .elevenLabs: let key = UserDefaults.standard.string(forKey: "ElevenLabsAPIKey") return key != nil && !key!.isEmpty + case .deepgram: + let key = UserDefaults.standard.string(forKey: "DeepgramAPIKey") + return key != nil && !key!.isEmpty } } }