feat: Adding support for soniox v3

This commit is contained in:
Beingpax 2025-10-28 22:31:16 +05:45
parent 357804c03d
commit f6402b1520
10 changed files with 262 additions and 3 deletions

View File

@ -265,6 +265,17 @@ import Foundation
isMultilingual: true, isMultilingual: true,
supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .gemini) supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .gemini)
) )
,
CloudModel(
name: "stt-async-v3",
displayName: "Soniox (stt-async-v3)",
description: "Soniox asynchronous transcription model v3.",
provider: .soniox,
speed: 0.8,
accuracy: 0.96,
isMultilingual: true,
supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .soniox)
)
] ]
static let allLanguages = [ static let allLanguages = [

View File

@ -9,6 +9,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable {
case deepgram = "Deepgram" case deepgram = "Deepgram"
case mistral = "Mistral" case mistral = "Mistral"
case gemini = "Gemini" case gemini = "Gemini"
case soniox = "Soniox"
case custom = "Custom" case custom = "Custom"
case nativeApple = "Native Apple" case nativeApple = "Native Apple"
// Future providers can be added here // Future providers can be added here

View File

@ -11,6 +11,7 @@ enum AIProvider: String, CaseIterable {
case mistral = "Mistral" case mistral = "Mistral"
case elevenLabs = "ElevenLabs" case elevenLabs = "ElevenLabs"
case deepgram = "Deepgram" case deepgram = "Deepgram"
case soniox = "Soniox"
case ollama = "Ollama" case ollama = "Ollama"
case custom = "Custom" case custom = "Custom"
@ -35,6 +36,8 @@ enum AIProvider: String, CaseIterable {
return "https://api.elevenlabs.io/v1/speech-to-text" return "https://api.elevenlabs.io/v1/speech-to-text"
case .deepgram: case .deepgram:
return "https://api.deepgram.com/v1/listen" return "https://api.deepgram.com/v1/listen"
case .soniox:
return "https://api.soniox.com/v1"
case .ollama: case .ollama:
return UserDefaults.standard.string(forKey: "ollamaBaseURL") ?? "http://localhost:11434" return UserDefaults.standard.string(forKey: "ollamaBaseURL") ?? "http://localhost:11434"
case .custom: case .custom:
@ -60,6 +63,8 @@ enum AIProvider: String, CaseIterable {
return "scribe_v1" return "scribe_v1"
case .deepgram: case .deepgram:
return "whisper-1" return "whisper-1"
case .soniox:
return "stt-async-v3"
case .ollama: case .ollama:
return UserDefaults.standard.string(forKey: "ollamaSelectedModel") ?? "mistral" return UserDefaults.standard.string(forKey: "ollamaSelectedModel") ?? "mistral"
case .custom: case .custom:
@ -122,6 +127,8 @@ enum AIProvider: String, CaseIterable {
return ["scribe_v1", "scribe_v1_experimental"] return ["scribe_v1", "scribe_v1_experimental"]
case .deepgram: case .deepgram:
return ["whisper-1"] return ["whisper-1"]
case .soniox:
return ["stt-async-v3"]
case .ollama: case .ollama:
return [] return []
case .custom: case .custom:
@ -308,6 +315,8 @@ class AIService: ObservableObject {
verifyDeepgramAPIKey(key, completion: completion) verifyDeepgramAPIKey(key, completion: completion)
case .mistral: case .mistral:
verifyMistralAPIKey(key, completion: completion) verifyMistralAPIKey(key, completion: completion)
case .soniox:
verifySonioxAPIKey(key, completion: completion)
default: default:
verifyOpenAICompatibleAPIKey(key, completion: completion) verifyOpenAICompatibleAPIKey(key, completion: completion)
} }
@ -462,6 +471,31 @@ class AIService: ObservableObject {
}.resume() }.resume()
} }
/// Verifies a Soniox API key by sending an authenticated `GET` request to the
/// Soniox files endpoint.
///
/// - Parameters:
///   - key: The API key to verify; sent as a `Bearer` token.
///   - completion: Called with `true` only when the response is an HTTP 200.
///     Called with `false` when the URL cannot be built, the request fails,
///     the response is not an HTTP response, or the status code is anything else.
private func verifySonioxAPIKey(_ key: String, completion: @escaping (Bool) -> Void) {
    guard let endpoint = URL(string: "https://api.soniox.com/v1/files") else {
        completion(false)
        return
    }

    var verificationRequest = URLRequest(url: endpoint)
    verificationRequest.httpMethod = "GET"
    verificationRequest.addValue("Bearer \(key)", forHTTPHeaderField: "Authorization")
    verificationRequest.addValue("application/json", forHTTPHeaderField: "Accept")

    let verificationTask = URLSession.shared.dataTask(with: verificationRequest) { _, response, error in
        // A transport-level failure means the key could not be checked at all.
        if let error = error {
            self.logger.error("Soniox API key verification failed: \(error.localizedDescription)")
            completion(false)
            return
        }

        // A non-HTTP response yields nil here, which compares unequal to 200.
        let statusCode = (response as? HTTPURLResponse)?.statusCode
        completion(statusCode == 200)
    }
    verificationTask.resume()
}
func clearAPIKey() { func clearAPIKey() {
guard selectedProvider.requiresAPIKey else { return } guard selectedProvider.requiresAPIKey else { return }

View File

@ -41,6 +41,7 @@ class CloudTranscriptionService: TranscriptionService {
private lazy var mistralService = MistralTranscriptionService() private lazy var mistralService = MistralTranscriptionService()
private lazy var geminiService = GeminiTranscriptionService() private lazy var geminiService = GeminiTranscriptionService()
private lazy var openAICompatibleService = OpenAICompatibleTranscriptionService() private lazy var openAICompatibleService = OpenAICompatibleTranscriptionService()
private lazy var sonioxService = SonioxTranscriptionService()
func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
var text: String var text: String
@ -56,6 +57,8 @@ class CloudTranscriptionService: TranscriptionService {
text = try await mistralService.transcribe(audioURL: audioURL, model: model) text = try await mistralService.transcribe(audioURL: audioURL, model: model)
case .gemini: case .gemini:
text = try await geminiService.transcribe(audioURL: audioURL, model: model) text = try await geminiService.transcribe(audioURL: audioURL, model: model)
case .soniox:
text = try await sonioxService.transcribe(audioURL: audioURL, model: model)
case .custom: case .custom:
guard let customModel = model as? CustomCloudModel else { guard let customModel = model as? CustomCloudModel else {
throw CloudTranscriptionError.unsupportedProvider throw CloudTranscriptionError.unsupportedProvider

View File

@ -0,0 +1,201 @@
import Foundation
/// Transcribes audio files through the Soniox asynchronous REST API.
///
/// A transcription runs in four steps: upload the audio file, create a
/// transcription job for the uploaded file, poll the job until it reaches a
/// terminal state, then download the transcript text.
///
/// NOTE(review): the uploaded file and the transcription job are left on the
/// server afterwards. Confirm whether Soniox expects clients to delete them.
class SonioxTranscriptionService {
    private let apiBase = "https://api.soniox.com/v1"

    /// Uploads `audioURL` to Soniox and returns the resulting transcript.
    ///
    /// - Parameters:
    ///   - audioURL: Local file URL of the audio to transcribe.
    ///   - model: The transcription model; its `name` is sent as the Soniox model identifier.
    /// - Returns: The transcript text, guaranteed to contain non-whitespace characters.
    /// - Throws: `CloudTranscriptionError` for a missing API key, an unreadable audio file,
    ///   a non-2xx API response, a failed or timed-out job, or an empty transcript;
    ///   also rethrows errors raised by `URLSession` and task cancellation.
    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
        let config = try getAPIConfig(for: model)

        let fileId = try await uploadFile(audioURL: audioURL, apiKey: config.apiKey)
        let transcriptionId = try await createTranscription(fileId: fileId, apiKey: config.apiKey, modelName: model.name)
        try await pollTranscriptionStatus(id: transcriptionId, apiKey: config.apiKey)
        let transcript = try await fetchTranscript(id: transcriptionId, apiKey: config.apiKey)

        guard !transcript.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            throw CloudTranscriptionError.noTranscriptionReturned
        }
        return transcript
    }

    /// Reads the Soniox API key from `UserDefaults` (key `"SonioxAPIKey"`).
    ///
    /// - Throws: `CloudTranscriptionError.missingAPIKey` when the key is absent or empty.
    private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig {
        guard let apiKey = UserDefaults.standard.string(forKey: "SonioxAPIKey"), !apiKey.isEmpty else {
            throw CloudTranscriptionError.missingAPIKey
        }
        return APIConfig(apiKey: apiKey)
    }

    /// Checks that `response` is an HTTP response with a 2xx status code.
    ///
    /// - Throws: `CloudTranscriptionError.networkError` when the response is not HTTP, or
    ///   `CloudTranscriptionError.apiRequestFailed` carrying the status code and the
    ///   response body (decoded as UTF-8) when the status is outside 200...299.
    private func validate(_ response: URLResponse, data: Data) throws {
        guard let httpResponse = response as? HTTPURLResponse else {
            throw CloudTranscriptionError.networkError(URLError(.badServerResponse))
        }
        guard (200...299).contains(httpResponse.statusCode) else {
            let errorMessage = String(data: data, encoding: .utf8) ?? "No error message"
            throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage)
        }
    }

    /// Uploads the audio file as `multipart/form-data` and returns the file id assigned by Soniox.
    ///
    /// - Throws: `CloudTranscriptionError.noTranscriptionReturned` when the response body
    ///   cannot be decoded, in addition to the errors thrown by `validate(_:data:)`.
    private func uploadFile(audioURL: URL, apiKey: String) async throws -> String {
        guard let apiURL = URL(string: "\(apiBase)/files") else {
            throw CloudTranscriptionError.dataEncodingError
        }

        var request = URLRequest(url: apiURL)
        request.httpMethod = "POST"
        request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")

        let boundary = "Boundary-\(UUID().uuidString)"
        request.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type")

        let body = try createMultipartBody(fileURL: audioURL, boundary: boundary)
        let (data, response) = try await URLSession.shared.upload(for: request, from: body)
        try validate(response, data: data)

        do {
            return try JSONDecoder().decode(FileUploadResponse.self, from: data).id
        } catch {
            throw CloudTranscriptionError.noTranscriptionReturned
        }
    }

    /// Creates a transcription job for an uploaded file and returns the job id.
    ///
    /// The request disables speaker diarization, attaches the user's custom dictionary
    /// terms as context when there are any, and passes the selected language as a hint
    /// unless it is `"auto"` or empty.
    private func createTranscription(fileId: String, apiKey: String, modelName: String) async throws -> String {
        guard let apiURL = URL(string: "\(apiBase)/transcriptions") else {
            throw CloudTranscriptionError.dataEncodingError
        }

        var request = URLRequest(url: apiURL)
        request.httpMethod = "POST"
        request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")

        var payload: [String: Any] = [
            "file_id": fileId,
            "model": modelName,
            // Disable diarization as per app requirement
            "enable_speaker_diarization": false
        ]

        // Attach custom vocabulary terms from the app's dictionary (if any)
        let dictionaryTerms = getCustomDictionaryTerms()
        if !dictionaryTerms.isEmpty {
            payload["context"] = [
                "terms": dictionaryTerms
            ]
        }

        let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto"
        if selectedLanguage != "auto" && !selectedLanguage.isEmpty {
            payload["language_hints"] = [selectedLanguage]
        }

        request.httpBody = try JSONSerialization.data(withJSONObject: payload)

        let (data, response) = try await URLSession.shared.data(for: request)
        try validate(response, data: data)

        do {
            return try JSONDecoder().decode(CreateTranscriptionResponse.self, from: data).id
        } catch {
            throw CloudTranscriptionError.noTranscriptionReturned
        }
    }

    /// Polls the transcription job once per second until it completes.
    ///
    /// - Throws: `CloudTranscriptionError.apiRequestFailed` with status 500 when the job
    ///   reports a terminal failure, or with status 504 when the job has not completed
    ///   after `maxWaitSeconds`. Both status codes are synthesized locally, not returned
    ///   by the server.
    private func pollTranscriptionStatus(id: String, apiKey: String) async throws {
        guard let baseURL = URL(string: "\(apiBase)/transcriptions/\(id)") else {
            throw CloudTranscriptionError.dataEncodingError
        }

        let start = Date()
        let maxWaitSeconds: TimeInterval = 300

        while true {
            var request = URLRequest(url: baseURL)
            request.httpMethod = "GET"
            request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")

            let (data, response) = try await URLSession.shared.data(for: request)
            try validate(response, data: data)

            // Decode with `try?` so an unparseable body is simply retried, while the
            // failure thrown below is NOT caught. Throwing from inside a do/catch
            // around the decode would swallow it and keep polling until the timeout.
            if let job = try? JSONDecoder().decode(TranscriptionStatusResponse.self, from: data) {
                switch job.status.lowercased() {
                case "completed":
                    return
                case "error", "failed":
                    // NOTE(review): "error" and `error_message` follow the Soniox API
                    // reference as recalled; confirm against the current docs.
                    let serverMessage = job.errorMessage?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
                    throw CloudTranscriptionError.apiRequestFailed(
                        statusCode: 500,
                        message: serverMessage.isEmpty ? "Transcription failed" : serverMessage
                    )
                default:
                    break
                }
            }

            if Date().timeIntervalSince(start) > maxWaitSeconds {
                throw CloudTranscriptionError.apiRequestFailed(statusCode: 504, message: "Transcription timed out")
            }

            // Task.sleep throws on cancellation, which also ends the polling loop.
            try await Task.sleep(nanoseconds: 1_000_000_000)
        }
    }

    /// Downloads the transcript of a completed job.
    ///
    /// Returns the `text` field when the body decodes as JSON with that field; otherwise
    /// falls back to the raw body interpreted as UTF-8 text.
    ///
    /// - Throws: `CloudTranscriptionError.noTranscriptionReturned` when the body is empty
    ///   or not valid UTF-8, in addition to the errors thrown by `validate(_:data:)`.
    private func fetchTranscript(id: String, apiKey: String) async throws -> String {
        guard let apiURL = URL(string: "\(apiBase)/transcriptions/\(id)/transcript") else {
            throw CloudTranscriptionError.dataEncodingError
        }

        var request = URLRequest(url: apiURL)
        request.httpMethod = "GET"
        request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")

        let (data, response) = try await URLSession.shared.data(for: request)
        try validate(response, data: data)

        if let decoded = try? JSONDecoder().decode(TranscriptResponse.self, from: data) {
            return decoded.text
        }
        if let asString = String(data: data, encoding: .utf8), !asString.isEmpty {
            return asString
        }
        throw CloudTranscriptionError.noTranscriptionReturned
    }

    /// Builds a single-part `multipart/form-data` body containing the audio file.
    ///
    /// - Throws: `CloudTranscriptionError.audioFileNotFound` when the file cannot be read.
    private func createMultipartBody(fileURL: URL, boundary: String) throws -> Data {
        guard let audioData = try? Data(contentsOf: fileURL) else {
            throw CloudTranscriptionError.audioFileNotFound
        }

        let crlf = "\r\n"
        // Keep the file name from terminating the quoted header parameter or
        // injecting extra header lines.
        let safeFilename = fileURL.lastPathComponent
            .replacingOccurrences(of: "\"", with: "%22")
            .replacingOccurrences(of: "\r", with: "")
            .replacingOccurrences(of: "\n", with: "")

        // `Data(string.utf8)` cannot fail, unlike `data(using:)!`.
        var body = Data()
        body.append(Data("--\(boundary)\(crlf)".utf8))
        body.append(Data("Content-Disposition: form-data; name=\"file\"; filename=\"\(safeFilename)\"\(crlf)".utf8))
        body.append(Data("Content-Type: audio/wav\(crlf)\(crlf)".utf8))
        body.append(audioData)
        body.append(Data(crlf.utf8))
        body.append(Data("--\(boundary)--\(crlf)".utf8))

        return body
    }

    /// Returns the user's custom dictionary words from `UserDefaults`
    /// (key `"CustomDictionaryItems"`), trimmed, with empty entries removed and
    /// case-insensitive duplicates dropped while keeping first-seen order.
    ///
    /// Returns an empty array when the value is missing or is not a JSON array of objects.
    private func getCustomDictionaryTerms() -> [String] {
        guard let data = UserDefaults.standard.data(forKey: "CustomDictionaryItems") else {
            return []
        }
        // Decode without depending on UI layer types; extract "word" strings
        guard let json = try? JSONSerialization.jsonObject(with: data) as? [[String: Any]] else {
            return []
        }

        let words = json.compactMap { $0["word"] as? String }
            .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
            .filter { !$0.isEmpty }

        // De-duplicate while preserving order; `insert` reports whether the key was new.
        var seen = Set<String>()
        return words.filter { seen.insert($0.lowercased()).inserted }
    }

    private struct APIConfig { let apiKey: String }
    private struct FileUploadResponse: Decodable { let id: String }
    private struct CreateTranscriptionResponse: Decodable { let id: String }
    private struct TranscriptionStatusResponse: Decodable {
        let status: String
        /// Optional failure description; absent or null for jobs that have not failed.
        let errorMessage: String?

        private enum CodingKeys: String, CodingKey {
            case status
            case errorMessage = "error_message"
        }
    }
    private struct TranscriptResponse: Decodable { let text: String }
}

View File

@ -17,7 +17,7 @@ struct APIKeyManagementView: View {
// Provider Selection // Provider Selection
HStack { HStack {
Picker("AI Provider", selection: $aiService.selectedProvider) { Picker("AI Provider", selection: $aiService.selectedProvider) {
ForEach(AIProvider.allCases.filter { $0 != .elevenLabs && $0 != .deepgram }, id: \.self) { provider in ForEach(AIProvider.allCases.filter { $0 != .elevenLabs && $0 != .deepgram && $0 != .soniox }, id: \.self) { provider in
Text(provider.rawValue).tag(provider) Text(provider.rawValue).tag(provider)
} }
} }
@ -411,6 +411,8 @@ struct APIKeyManagementView: View {
URL(string: "https://elevenlabs.io/speech-synthesis")! URL(string: "https://elevenlabs.io/speech-synthesis")!
case .deepgram: case .deepgram:
URL(string: "https://console.deepgram.com/api-keys")! URL(string: "https://console.deepgram.com/api-keys")!
case .soniox:
URL(string: "https://console.soniox.com/")!
case .ollama, .custom: case .ollama, .custom:
URL(string: "")! // This case should never be reached URL(string: "")! // This case should never be reached
case .openRouter: case .openRouter:

View File

@ -38,6 +38,8 @@ struct CloudModelCardView: View {
return "Mistral" return "Mistral"
case .gemini: case .gemini:
return "Gemini" return "Gemini"
case .soniox:
return "Soniox"
default: default:
return model.provider.rawValue return model.provider.rawValue
} }
@ -281,6 +283,8 @@ struct CloudModelCardView: View {
aiService.selectedProvider = .mistral aiService.selectedProvider = .mistral
case .gemini: case .gemini:
aiService.selectedProvider = .gemini aiService.selectedProvider = .gemini
case .soniox:
aiService.selectedProvider = .soniox
default: default:
// This case should ideally not be hit for cloud models in this view // This case should ideally not be hit for cloud models in this view
print("Warning: verifyAPIKey called for unsupported provider \(model.provider.rawValue)") print("Warning: verifyAPIKey called for unsupported provider \(model.provider.rawValue)")

View File

@ -56,7 +56,7 @@ struct ModelCardRowView: View {
setDefaultAction: setDefaultAction setDefaultAction: setDefaultAction
) )
} }
case .groq, .elevenLabs, .deepgram, .mistral, .gemini: case .groq, .elevenLabs, .deepgram, .mistral, .gemini, .soniox:
if let cloudModel = model as? CloudModel { if let cloudModel = model as? CloudModel {
CloudModelCardView( CloudModelCardView(
model: cloudModel, model: cloudModel,

View File

@ -224,7 +224,7 @@ struct ModelManagementView: View {
case .local: case .local:
return whisperState.allAvailableModels.filter { $0.provider == .local || $0.provider == .nativeApple || $0.provider == .parakeet } return whisperState.allAvailableModels.filter { $0.provider == .local || $0.provider == .nativeApple || $0.provider == .parakeet }
case .cloud: case .cloud:
let cloudProviders: [ModelProvider] = [.groq, .elevenLabs, .deepgram, .mistral, .gemini] let cloudProviders: [ModelProvider] = [.groq, .elevenLabs, .deepgram, .mistral, .gemini, .soniox]
return whisperState.allAvailableModels.filter { cloudProviders.contains($0.provider) } return whisperState.allAvailableModels.filter { cloudProviders.contains($0.provider) }
case .custom: case .custom:
return whisperState.allAvailableModels.filter { $0.provider == .custom } return whisperState.allAvailableModels.filter { $0.provider == .custom }

View File

@ -29,6 +29,9 @@ extension WhisperState {
case .gemini: case .gemini:
let key = UserDefaults.standard.string(forKey: "GeminiAPIKey") let key = UserDefaults.standard.string(forKey: "GeminiAPIKey")
return key != nil && !key!.isEmpty return key != nil && !key!.isEmpty
case .soniox:
let key = UserDefaults.standard.string(forKey: "SonioxAPIKey")
return key != nil && !key!.isEmpty
case .custom: case .custom:
// Custom models are always usable since they contain their own API keys // Custom models are always usable since they contain their own API keys
return true return true