diff --git a/VoiceInk/Models/PredefinedModels.swift b/VoiceInk/Models/PredefinedModels.swift index 2fee46d..9fb28f0 100644 --- a/VoiceInk/Models/PredefinedModels.swift +++ b/VoiceInk/Models/PredefinedModels.swift @@ -265,6 +265,17 @@ import Foundation isMultilingual: true, supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .gemini) ) + , + CloudModel( + name: "stt-async-v3", + displayName: "Soniox (stt-async-v3)", + description: "Soniox asynchronous transcription model v3.", + provider: .soniox, + speed: 0.8, + accuracy: 0.96, + isMultilingual: true, + supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .soniox) + ) ] static let allLanguages = [ diff --git a/VoiceInk/Models/TranscriptionModel.swift b/VoiceInk/Models/TranscriptionModel.swift index 8d308ee..2617682 100644 --- a/VoiceInk/Models/TranscriptionModel.swift +++ b/VoiceInk/Models/TranscriptionModel.swift @@ -9,6 +9,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable { case deepgram = "Deepgram" case mistral = "Mistral" case gemini = "Gemini" + case soniox = "Soniox" case custom = "Custom" case nativeApple = "Native Apple" // Future providers can be added here diff --git a/VoiceInk/Services/AIService.swift b/VoiceInk/Services/AIService.swift index 1fd8eb3..d4a1a87 100644 --- a/VoiceInk/Services/AIService.swift +++ b/VoiceInk/Services/AIService.swift @@ -11,6 +11,7 @@ enum AIProvider: String, CaseIterable { case mistral = "Mistral" case elevenLabs = "ElevenLabs" case deepgram = "Deepgram" + case soniox = "Soniox" case ollama = "Ollama" case custom = "Custom" @@ -35,6 +36,8 @@ enum AIProvider: String, CaseIterable { return "https://api.elevenlabs.io/v1/speech-to-text" case .deepgram: return "https://api.deepgram.com/v1/listen" + case .soniox: + return "https://api.soniox.com/v1" case .ollama: return UserDefaults.standard.string(forKey: "ollamaBaseURL") ?? 
"http://localhost:11434" case .custom: @@ -60,6 +63,8 @@ enum AIProvider: String, CaseIterable { return "scribe_v1" case .deepgram: return "whisper-1" + case .soniox: + return "stt-async-v3" case .ollama: return UserDefaults.standard.string(forKey: "ollamaSelectedModel") ?? "mistral" case .custom: @@ -122,6 +127,8 @@ enum AIProvider: String, CaseIterable { return ["scribe_v1", "scribe_v1_experimental"] case .deepgram: return ["whisper-1"] + case .soniox: + return ["stt-async-v3"] case .ollama: return [] case .custom: @@ -308,6 +315,8 @@ class AIService: ObservableObject { verifyDeepgramAPIKey(key, completion: completion) case .mistral: verifyMistralAPIKey(key, completion: completion) + case .soniox: + verifySonioxAPIKey(key, completion: completion) default: verifyOpenAICompatibleAPIKey(key, completion: completion) } @@ -462,6 +471,38 @@ class AIService: ObservableObject { }.resume() }
+    /// Verifies a Soniox API key by listing files with it.
+    ///
+    /// Sends `GET https://api.soniox.com/v1/files` with the key as a bearer
+    /// token and reports `true` only for an HTTP 200 response.
+    ///
+    /// - Parameters:
+    ///   - key: API key to verify.
+    ///   - completion: Called once with the verification result.
+    private func verifySonioxAPIKey(_ key: String, completion: @escaping (Bool) -> Void) {
+        guard let endpoint = URL(string: "https://api.soniox.com/v1/files") else {
+            completion(false)
+            return
+        }
+
+        var verification = URLRequest(url: endpoint)
+        verification.httpMethod = "GET"
+        verification.addValue("Bearer \(key)", forHTTPHeaderField: "Authorization")
+        verification.addValue("application/json", forHTTPHeaderField: "Accept")
+
+        let task = URLSession.shared.dataTask(with: verification) { _, response, error in
+            if let failure = error {
+                self.logger.error("Soniox API key verification failed: \(failure.localizedDescription)")
+                completion(false)
+                return
+            }
+            // A non-HTTP response yields a nil status code and counts as a failed verification.
+            let statusCode = (response as? HTTPURLResponse)?.statusCode
+            completion(statusCode == 200)
+        }
+        task.resume()
+    }
+
 func clearAPIKey() { guard selectedProvider.requiresAPIKey else { return } diff --git a/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift b/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift index f7037c3..75af540 100644 --- a/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift +++ b/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift @@ -41,6 +41,7 @@ class CloudTranscriptionService: TranscriptionService { private lazy var mistralService = MistralTranscriptionService() private lazy var geminiService = GeminiTranscriptionService() private lazy var openAICompatibleService = OpenAICompatibleTranscriptionService() + private lazy var sonioxService = SonioxTranscriptionService() func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { var text: String @@ -56,6 +57,8 @@ class CloudTranscriptionService: TranscriptionService { text = try await mistralService.transcribe(audioURL: audioURL, model: model) case .gemini: text = try await geminiService.transcribe(audioURL: audioURL, model: model) + case .soniox: + text = try await sonioxService.transcribe(audioURL: audioURL, model: model) case .custom: guard let customModel = model as? 
CustomCloudModel else { throw CloudTranscriptionError.unsupportedProvider
diff --git a/VoiceInk/Services/CloudTranscription/SonioxTranscriptionService.swift b/VoiceInk/Services/CloudTranscription/SonioxTranscriptionService.swift
new file mode 100644
index 0000000..4eac4f4
--- /dev/null
+++ b/VoiceInk/Services/CloudTranscription/SonioxTranscriptionService.swift
@@ -0,0 +1,201 @@
+import Foundation
+
+/// Transcribes audio through the Soniox asynchronous REST API.
+///
+/// Flow: upload the audio file, create a transcription job for it, poll the
+/// job until it reaches a terminal state, then fetch the transcript text.
+class SonioxTranscriptionService {
+    private let apiBase = "https://api.soniox.com/v1"
+    /// Upper bound on how long a single job is polled before giving up.
+    private let maxWaitSeconds: TimeInterval = 300
+    /// Delay between two consecutive status polls.
+    private let pollIntervalNanoseconds: UInt64 = 1_000_000_000
+
+    /// Uploads `audioURL` to Soniox and returns the finished transcript.
+    ///
+    /// - Parameters:
+    ///   - audioURL: Local audio file to transcribe.
+    ///   - model: Model whose `name` is sent as the Soniox model identifier.
+    /// - Returns: The transcript text; never empty or whitespace-only.
+    /// - Throws: `CloudTranscriptionError` for a missing API key, a non-2xx
+    ///   response, a failed or timed-out job, or an empty transcript. Errors
+    ///   from `URLSession` and `Task.sleep` propagate unchanged.
+    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
+        let config = try getAPIConfig(for: model)
+
+        let fileId = try await uploadFile(audioURL: audioURL, apiKey: config.apiKey)
+        let transcriptionId = try await createTranscription(fileId: fileId, apiKey: config.apiKey, modelName: model.name)
+        try await pollTranscriptionStatus(id: transcriptionId, apiKey: config.apiKey)
+        let transcript = try await fetchTranscript(id: transcriptionId, apiKey: config.apiKey)
+
+        guard !transcript.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
+            throw CloudTranscriptionError.noTranscriptionReturned
+        }
+        return transcript
+    }
+
+    /// Reads the Soniox API key stored under "SonioxAPIKey" in `UserDefaults`.
+    private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig {
+        guard let apiKey = UserDefaults.standard.string(forKey: "SonioxAPIKey"), !apiKey.isEmpty else {
+            throw CloudTranscriptionError.missingAPIKey
+        }
+        return APIConfig(apiKey: apiKey)
+    }
+
+    /// Builds a bearer-authenticated request for `path` relative to `apiBase`.
+    private func makeRequest(path: String, method: String, apiKey: String) throws -> URLRequest {
+        guard let url = URL(string: "\(apiBase)\(path)") else {
+            throw CloudTranscriptionError.dataEncodingError
+        }
+        var request = URLRequest(url: url)
+        request.httpMethod = method
+        request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")
+        return request
+    }
+
+    /// Throws unless `response` is an HTTP response with a 2xx status code.
+    private func validate(_ response: URLResponse, data: Data) throws {
+        guard let httpResponse = response as? HTTPURLResponse else {
+            throw CloudTranscriptionError.networkError(URLError(.badServerResponse))
+        }
+        guard (200...299).contains(httpResponse.statusCode) else {
+            let errorMessage = String(data: data, encoding: .utf8) ?? "No error message"
+            throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage)
+        }
+    }
+
+    /// Uploads the audio file as multipart form data and returns its file id.
+    private func uploadFile(audioURL: URL, apiKey: String) async throws -> String {
+        var request = try makeRequest(path: "/files", method: "POST", apiKey: apiKey)
+        let boundary = "Boundary-\(UUID().uuidString)"
+        request.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type")
+        let body = try createMultipartBody(fileURL: audioURL, boundary: boundary)
+        let (data, response) = try await URLSession.shared.upload(for: request, from: body)
+        try validate(response, data: data)
+        guard let uploadResponse = try? JSONDecoder().decode(FileUploadResponse.self, from: data) else {
+            throw CloudTranscriptionError.noTranscriptionReturned
+        }
+        return uploadResponse.id
+    }
+
+    /// Creates a transcription job for an uploaded file and returns the job id.
+    private func createTranscription(fileId: String, apiKey: String, modelName: String) async throws -> String {
+        var request = try makeRequest(path: "/transcriptions", method: "POST", apiKey: apiKey)
+        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        var payload: [String: Any] = [
+            "file_id": fileId,
+            "model": modelName,
+            // Disable diarization as per app requirement
+            "enable_speaker_diarization": false
+        ]
+        // Attach custom vocabulary terms from the app's dictionary (if any)
+        let dictionaryTerms = getCustomDictionaryTerms()
+        if !dictionaryTerms.isEmpty {
+            payload["context"] = ["terms": dictionaryTerms]
+        }
+        // "auto" (or an unset value) means no language hint is sent.
+        let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto"
+        if selectedLanguage != "auto" && !selectedLanguage.isEmpty {
+            payload["language_hints"] = [selectedLanguage]
+        }
+        request.httpBody = try JSONSerialization.data(withJSONObject: payload)
+        let (data, response) = try await URLSession.shared.data(for: request)
+        try validate(response, data: data)
+        guard let createResponse = try? JSONDecoder().decode(CreateTranscriptionResponse.self, from: data) else {
+            throw CloudTranscriptionError.noTranscriptionReturned
+        }
+        return createResponse.id
+    }
+
+    /// Polls the job until it completes, fails, or `maxWaitSeconds` elapses.
+    private func pollTranscriptionStatus(id: String, apiKey: String) async throws {
+        let request = try makeRequest(path: "/transcriptions/\(id)", method: "GET", apiKey: apiKey)
+        let start = Date()
+        while true {
+            let (data, response) = try await URLSession.shared.data(for: request)
+            try validate(response, data: data)
+            // Decode with `try?` so only an undecodable status body is retried.
+            // The failure throw must stay outside any catch-all: if it were
+            // caught, a failed job would keep being polled until the timeout.
+            if let job = try? JSONDecoder().decode(TranscriptionStatusResponse.self, from: data) {
+                switch job.status.lowercased() {
+                case "completed":
+                    return
+                // NOTE(review): "error" is presumed to be Soniox's terminal failure
+                // status; "failed" is kept too. Confirm against the API reference.
+                case "failed", "error":
+                    throw CloudTranscriptionError.apiRequestFailed(
+                        statusCode: 500,
+                        message: job.errorMessage ?? "Transcription failed"
+                    )
+                default:
+                    break
+                }
+            }
+            if Date().timeIntervalSince(start) > maxWaitSeconds {
+                throw CloudTranscriptionError.apiRequestFailed(statusCode: 504, message: "Transcription timed out")
+            }
+            try await Task.sleep(nanoseconds: pollIntervalNanoseconds)
+        }
+    }
+
+    /// Fetches the transcript text of a completed job.
+    private func fetchTranscript(id: String, apiKey: String) async throws -> String {
+        let request = try makeRequest(path: "/transcriptions/\(id)/transcript", method: "GET", apiKey: apiKey)
+        let (data, response) = try await URLSession.shared.data(for: request)
+        try validate(response, data: data)
+        if let decoded = try? JSONDecoder().decode(TranscriptResponse.self, from: data) {
+            return decoded.text
+        }
+        // Best-effort fallback: return the raw UTF-8 body when it is not the expected JSON.
+        if let asString = String(data: data, encoding: .utf8), !asString.isEmpty {
+            return asString
+        }
+        throw CloudTranscriptionError.noTranscriptionReturned
+    }
+
+    /// Wraps the audio file in a single-part `multipart/form-data` body (field name "file").
+    private func createMultipartBody(fileURL: URL, boundary: String) throws -> Data {
+        guard let audioData = try? Data(contentsOf: fileURL) else {
+            throw CloudTranscriptionError.audioFileNotFound
+        }
+        let crlf = "\r\n"
+        var body = Data()
+        body.append(Data("--\(boundary)\(crlf)".utf8))
+        body.append(Data("Content-Disposition: form-data; name=\"file\"; filename=\"\(fileURL.lastPathComponent)\"\(crlf)".utf8))
+        body.append(Data("Content-Type: audio/wav\(crlf)\(crlf)".utf8))
+        body.append(audioData)
+        body.append(Data(crlf.utf8))
+        body.append(Data("--\(boundary)--\(crlf)".utf8))
+        return body
+    }
+
+    /// Returns the trimmed, non-empty "word" entries stored under
+    /// "CustomDictionaryItems", de-duplicated case-insensitively in original order.
+    private func getCustomDictionaryTerms() -> [String] {
+        // Decode without depending on UI layer types; extract "word" strings
+        guard let data = UserDefaults.standard.data(forKey: "CustomDictionaryItems"),
+              let json = try? JSONSerialization.jsonObject(with: data) as? [[String: Any]] else {
+            return []
+        }
+        var seen = Set<String>()
+        return json
+            .compactMap { $0["word"] as? String }
+            .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
+            .filter { !$0.isEmpty && seen.insert($0.lowercased()).inserted }
+    }
+
+    private struct APIConfig { let apiKey: String }
+    private struct FileUploadResponse: Decodable { let id: String }
+    private struct CreateTranscriptionResponse: Decodable { let id: String }
+    private struct TranscriptionStatusResponse: Decodable {
+        let status: String
+        let errorMessage: String?
+
+        enum CodingKeys: String, CodingKey {
+            case status
+            case errorMessage = "error_message"
+        }
+    }
+    private struct TranscriptResponse: Decodable { let text: String }
+}
diff --git a/VoiceInk/Views/AI Models/APIKeyManagementView.swift b/VoiceInk/Views/AI Models/APIKeyManagementView.swift index e36dd01..45df228 100644 --- a/VoiceInk/Views/AI Models/APIKeyManagementView.swift +++ b/VoiceInk/Views/AI Models/APIKeyManagementView.swift @@ -17,7 +17,7 @@ struct APIKeyManagementView: View { // Provider Selection HStack { Picker("AI Provider", selection: $aiService.selectedProvider) { - 
ForEach(AIProvider.allCases.filter { $0 != .elevenLabs && $0 != .deepgram }, id: \.self) { provider in + ForEach(AIProvider.allCases.filter { $0 != .elevenLabs && $0 != .deepgram && $0 != .soniox }, id: \.self) { provider in Text(provider.rawValue).tag(provider) } } @@ -411,6 +411,8 @@ struct APIKeyManagementView: View { URL(string: "https://elevenlabs.io/speech-synthesis")! case .deepgram: URL(string: "https://console.deepgram.com/api-keys")! + case .soniox: + URL(string: "https://console.soniox.com/")! case .ollama, .custom: URL(string: "")! // This case should never be reached case .openRouter: diff --git a/VoiceInk/Views/AI Models/CloudModelCardRowView.swift b/VoiceInk/Views/AI Models/CloudModelCardRowView.swift index 04fc2f8..d00cc1a 100644 --- a/VoiceInk/Views/AI Models/CloudModelCardRowView.swift +++ b/VoiceInk/Views/AI Models/CloudModelCardRowView.swift @@ -38,6 +38,8 @@ struct CloudModelCardView: View { return "Mistral" case .gemini: return "Gemini" + case .soniox: + return "Soniox" default: return model.provider.rawValue } @@ -281,6 +283,8 @@ struct CloudModelCardView: View { aiService.selectedProvider = .mistral case .gemini: aiService.selectedProvider = .gemini + case .soniox: + aiService.selectedProvider = .soniox default: // This case should ideally not be hit for cloud models in this view print("Warning: verifyAPIKey called for unsupported provider \(model.provider.rawValue)") diff --git a/VoiceInk/Views/AI Models/ModelCardRowView.swift b/VoiceInk/Views/AI Models/ModelCardRowView.swift index 012a750..c63d8c1 100644 --- a/VoiceInk/Views/AI Models/ModelCardRowView.swift +++ b/VoiceInk/Views/AI Models/ModelCardRowView.swift @@ -56,7 +56,7 @@ struct ModelCardRowView: View { setDefaultAction: setDefaultAction ) } - case .groq, .elevenLabs, .deepgram, .mistral, .gemini: + case .groq, .elevenLabs, .deepgram, .mistral, .gemini, .soniox: if let cloudModel = model as? 
CloudModel { CloudModelCardView( model: cloudModel, diff --git a/VoiceInk/Views/AI Models/ModelManagementView.swift b/VoiceInk/Views/AI Models/ModelManagementView.swift index 3e1cbe4..001f80e 100644 --- a/VoiceInk/Views/AI Models/ModelManagementView.swift +++ b/VoiceInk/Views/AI Models/ModelManagementView.swift @@ -224,7 +224,7 @@ struct ModelManagementView: View { case .local: return whisperState.allAvailableModels.filter { $0.provider == .local || $0.provider == .nativeApple || $0.provider == .parakeet } case .cloud: - let cloudProviders: [ModelProvider] = [.groq, .elevenLabs, .deepgram, .mistral, .gemini] + let cloudProviders: [ModelProvider] = [.groq, .elevenLabs, .deepgram, .mistral, .gemini, .soniox] return whisperState.allAvailableModels.filter { cloudProviders.contains($0.provider) } case .custom: return whisperState.allAvailableModels.filter { $0.provider == .custom } diff --git a/VoiceInk/Whisper/WhisperState+ModelQueries.swift b/VoiceInk/Whisper/WhisperState+ModelQueries.swift index 5c76b0e..f634ea0 100644 --- a/VoiceInk/Whisper/WhisperState+ModelQueries.swift +++ b/VoiceInk/Whisper/WhisperState+ModelQueries.swift @@ -29,6 +29,9 @@ extension WhisperState { case .gemini: let key = UserDefaults.standard.string(forKey: "GeminiAPIKey") return key != nil && !key!.isEmpty + case .soniox: + let key = UserDefaults.standard.string(forKey: "SonioxAPIKey") + return key != nil && !key!.isEmpty case .custom: // Custom models are always usable since they contain their own API keys return true