feat: Adding support for soniox v3
This commit is contained in:
parent
357804c03d
commit
f6402b1520
@ -265,6 +265,17 @@ import Foundation
|
||||
isMultilingual: true,
|
||||
supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .gemini)
|
||||
)
|
||||
,
|
||||
CloudModel(
|
||||
name: "stt-async-v3",
|
||||
displayName: "Soniox (stt-async-v3)",
|
||||
description: "Soniox asynchronous transcription model v3.",
|
||||
provider: .soniox,
|
||||
speed: 0.8,
|
||||
accuracy: 0.96,
|
||||
isMultilingual: true,
|
||||
supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .soniox)
|
||||
)
|
||||
]
|
||||
|
||||
static let allLanguages = [
|
||||
|
||||
@ -9,6 +9,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable {
|
||||
case deepgram = "Deepgram"
|
||||
case mistral = "Mistral"
|
||||
case gemini = "Gemini"
|
||||
case soniox = "Soniox"
|
||||
case custom = "Custom"
|
||||
case nativeApple = "Native Apple"
|
||||
// Future providers can be added here
|
||||
|
||||
@ -11,6 +11,7 @@ enum AIProvider: String, CaseIterable {
|
||||
case mistral = "Mistral"
|
||||
case elevenLabs = "ElevenLabs"
|
||||
case deepgram = "Deepgram"
|
||||
case soniox = "Soniox"
|
||||
case ollama = "Ollama"
|
||||
case custom = "Custom"
|
||||
|
||||
@ -35,6 +36,8 @@ enum AIProvider: String, CaseIterable {
|
||||
return "https://api.elevenlabs.io/v1/speech-to-text"
|
||||
case .deepgram:
|
||||
return "https://api.deepgram.com/v1/listen"
|
||||
case .soniox:
|
||||
return "https://api.soniox.com/v1"
|
||||
case .ollama:
|
||||
return UserDefaults.standard.string(forKey: "ollamaBaseURL") ?? "http://localhost:11434"
|
||||
case .custom:
|
||||
@ -60,6 +63,8 @@ enum AIProvider: String, CaseIterable {
|
||||
return "scribe_v1"
|
||||
case .deepgram:
|
||||
return "whisper-1"
|
||||
case .soniox:
|
||||
return "stt-async-v3"
|
||||
case .ollama:
|
||||
return UserDefaults.standard.string(forKey: "ollamaSelectedModel") ?? "mistral"
|
||||
case .custom:
|
||||
@ -122,6 +127,8 @@ enum AIProvider: String, CaseIterable {
|
||||
return ["scribe_v1", "scribe_v1_experimental"]
|
||||
case .deepgram:
|
||||
return ["whisper-1"]
|
||||
case .soniox:
|
||||
return ["stt-async-v3"]
|
||||
case .ollama:
|
||||
return []
|
||||
case .custom:
|
||||
@ -308,6 +315,8 @@ class AIService: ObservableObject {
|
||||
verifyDeepgramAPIKey(key, completion: completion)
|
||||
case .mistral:
|
||||
verifyMistralAPIKey(key, completion: completion)
|
||||
case .soniox:
|
||||
verifySonioxAPIKey(key, completion: completion)
|
||||
default:
|
||||
verifyOpenAICompatibleAPIKey(key, completion: completion)
|
||||
}
|
||||
@ -462,6 +471,31 @@ class AIService: ObservableObject {
|
||||
}.resume()
|
||||
}
|
||||
|
||||
/// Checks a Soniox API key by issuing an authenticated `GET` against the
/// Soniox `/v1/files` endpoint.
///
/// - Parameters:
///   - key: The API key to verify; sent as a bearer token.
///   - completion: Called with `true` only when the server answers with
///     HTTP 200; called with `false` for an invalid URL, a transport error,
///     a non-HTTP response, or any other status code.
private func verifySonioxAPIKey(_ key: String, completion: @escaping (Bool) -> Void) {
    guard let endpoint = URL(string: "https://api.soniox.com/v1/files") else {
        completion(false)
        return
    }

    var verificationRequest = URLRequest(url: endpoint)
    verificationRequest.httpMethod = "GET"
    verificationRequest.addValue("Bearer \(key)", forHTTPHeaderField: "Authorization")
    verificationRequest.addValue("application/json", forHTTPHeaderField: "Accept")

    let verificationTask = URLSession.shared.dataTask(with: verificationRequest) { _, response, error in
        // A transport-level failure is logged and reported as an invalid key.
        if let failure = error {
            self.logger.error("Soniox API key verification failed: \(failure.localizedDescription)")
            completion(false)
            return
        }

        // `statusCode` is nil for a non-HTTP response, which compares unequal to 200.
        let statusCode = (response as? HTTPURLResponse)?.statusCode
        completion(statusCode == 200)
    }
    verificationTask.resume()
}
|
||||
|
||||
func clearAPIKey() {
|
||||
guard selectedProvider.requiresAPIKey else { return }
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@ class CloudTranscriptionService: TranscriptionService {
|
||||
private lazy var mistralService = MistralTranscriptionService()
|
||||
private lazy var geminiService = GeminiTranscriptionService()
|
||||
private lazy var openAICompatibleService = OpenAICompatibleTranscriptionService()
|
||||
private lazy var sonioxService = SonioxTranscriptionService()
|
||||
|
||||
func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
|
||||
var text: String
|
||||
@ -56,6 +57,8 @@ class CloudTranscriptionService: TranscriptionService {
|
||||
text = try await mistralService.transcribe(audioURL: audioURL, model: model)
|
||||
case .gemini:
|
||||
text = try await geminiService.transcribe(audioURL: audioURL, model: model)
|
||||
case .soniox:
|
||||
text = try await sonioxService.transcribe(audioURL: audioURL, model: model)
|
||||
case .custom:
|
||||
guard let customModel = model as? CustomCloudModel else {
|
||||
throw CloudTranscriptionError.unsupportedProvider
|
||||
|
||||
@ -0,0 +1,201 @@
|
||||
import Foundation
|
||||
|
||||
/// Transcribes audio through the Soniox asynchronous REST API.
///
/// The flow is: upload the audio file, create a transcription job for the
/// uploaded file, poll the job until it reaches a terminal state, and then
/// download the transcript text.
class SonioxTranscriptionService {
    /// Root of the Soniox REST API; endpoint paths are appended to it.
    private let apiBase = "https://api.soniox.com/v1"

    /// Uploads `audioURL` to Soniox and returns the resulting transcript.
    ///
    /// - Parameters:
    ///   - audioURL: Audio file whose contents are uploaded.
    ///   - model: Selected model; its `name` is sent as the Soniox model identifier.
    /// - Returns: The transcript text returned by the API.
    /// - Throws: `CloudTranscriptionError` for a missing API key, an unreadable
    ///   audio file, HTTP or job failures, a timeout, or an empty transcript.
    ///   Errors thrown by `URLSession` propagate unchanged.
    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
        let config = try getAPIConfig(for: model)

        let fileId = try await uploadFile(audioURL: audioURL, apiKey: config.apiKey)
        let transcriptionId = try await createTranscription(fileId: fileId, apiKey: config.apiKey, modelName: model.name)
        try await pollTranscriptionStatus(id: transcriptionId, apiKey: config.apiKey)
        let transcript = try await fetchTranscript(id: transcriptionId, apiKey: config.apiKey)

        // A whitespace-only transcript is treated the same as no transcript.
        guard !transcript.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            throw CloudTranscriptionError.noTranscriptionReturned
        }
        return transcript
    }

    /// Reads the Soniox API key from `UserDefaults` (key `"SonioxAPIKey"`).
    ///
    /// - Throws: `CloudTranscriptionError.missingAPIKey` when the key is absent or empty.
    private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig {
        guard let apiKey = UserDefaults.standard.string(forKey: "SonioxAPIKey"), !apiKey.isEmpty else {
            throw CloudTranscriptionError.missingAPIKey
        }
        return APIConfig(apiKey: apiKey)
    }

    /// Builds a bearer-authenticated request for `apiBase` + `path`.
    ///
    /// - Throws: `CloudTranscriptionError.dataEncodingError` when the URL cannot be formed.
    private func makeRequest(path: String, method: String, apiKey: String) throws -> URLRequest {
        guard let url = URL(string: "\(apiBase)\(path)") else {
            throw CloudTranscriptionError.dataEncodingError
        }
        var request = URLRequest(url: url)
        request.httpMethod = method
        request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")
        return request
    }

    /// Ensures `response` is an HTTP response with a 2xx status code.
    ///
    /// - Throws: `CloudTranscriptionError.networkError` for a non-HTTP response, or
    ///   `CloudTranscriptionError.apiRequestFailed` carrying the status code and the
    ///   response body (as UTF-8 text) for any non-2xx status.
    private func validate(_ response: URLResponse, data: Data) throws {
        guard let httpResponse = response as? HTTPURLResponse else {
            throw CloudTranscriptionError.networkError(URLError(.badServerResponse))
        }
        guard (200...299).contains(httpResponse.statusCode) else {
            let errorMessage = String(data: data, encoding: .utf8) ?? "No error message"
            throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage)
        }
    }

    /// Uploads the audio file as `multipart/form-data` and returns the file id
    /// from the response.
    private func uploadFile(audioURL: URL, apiKey: String) async throws -> String {
        var request = try makeRequest(path: "/files", method: "POST", apiKey: apiKey)
        let boundary = "Boundary-\(UUID().uuidString)"
        request.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type")
        let body = try createMultipartBody(fileURL: audioURL, boundary: boundary)

        let (data, response) = try await URLSession.shared.upload(for: request, from: body)
        try validate(response, data: data)

        guard let uploadResponse = try? JSONDecoder().decode(FileUploadResponse.self, from: data) else {
            throw CloudTranscriptionError.noTranscriptionReturned
        }
        return uploadResponse.id
    }

    /// Creates a transcription job for an uploaded file and returns the job id.
    ///
    /// The payload carries the model name, disables speaker diarization, and
    /// optionally adds custom vocabulary terms and a language hint read from
    /// `UserDefaults`.
    private func createTranscription(fileId: String, apiKey: String, modelName: String) async throws -> String {
        var request = try makeRequest(path: "/transcriptions", method: "POST", apiKey: apiKey)
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")

        var payload: [String: Any] = [
            "file_id": fileId,
            "model": modelName,
            // Disable diarization as per app requirement
            "enable_speaker_diarization": false
        ]

        // Attach custom vocabulary terms from the app's dictionary (if any)
        let dictionaryTerms = getCustomDictionaryTerms()
        if !dictionaryTerms.isEmpty {
            payload["context"] = ["terms": dictionaryTerms]
        }

        // "auto" (or an unset/empty value) means no language hint is sent.
        let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto"
        if selectedLanguage != "auto" && !selectedLanguage.isEmpty {
            payload["language_hints"] = [selectedLanguage]
        }

        request.httpBody = try JSONSerialization.data(withJSONObject: payload)

        let (data, response) = try await URLSession.shared.data(for: request)
        try validate(response, data: data)

        guard let createResponse = try? JSONDecoder().decode(CreateTranscriptionResponse.self, from: data) else {
            throw CloudTranscriptionError.noTranscriptionReturned
        }
        return createResponse.id
    }

    /// Polls the transcription job once per second until it completes.
    ///
    /// - Throws: `CloudTranscriptionError.apiRequestFailed` with status 500 when
    ///   the job reports a terminal failure, or with status 504 when it has not
    ///   completed after 300 seconds. A non-2xx poll response throws immediately.
    private func pollTranscriptionStatus(id: String, apiKey: String) async throws {
        let request = try makeRequest(path: "/transcriptions/\(id)", method: "GET", apiKey: apiKey)
        let start = Date()
        let maxWaitSeconds: TimeInterval = 300

        while true {
            let (data, response) = try await URLSession.shared.data(for: request)
            try validate(response, data: data)

            // `try?` keeps the best-effort retry for a body that cannot be decoded,
            // while the terminal-failure `throw` below is no longer wrapped in a
            // catch-all that would swallow it and spin until the timeout.
            if let job = try? JSONDecoder().decode(TranscriptionStatusResponse.self, from: data) {
                switch job.status.lowercased() {
                case "completed":
                    return
                // NOTE(review): Soniox's API reference lists `error` as the terminal
                // failure status; `failed` is kept for backward compatibility — confirm.
                case "failed", "error":
                    var message = "Transcription failed"
                    if let detail = job.errorMessage, !detail.isEmpty {
                        message += ": \(detail)"
                    }
                    throw CloudTranscriptionError.apiRequestFailed(statusCode: 500, message: message)
                default:
                    break
                }
            }

            if Date().timeIntervalSince(start) > maxWaitSeconds {
                throw CloudTranscriptionError.apiRequestFailed(statusCode: 504, message: "Transcription timed out")
            }
            // `Task.sleep` throws on cancellation, which ends the loop.
            try await Task.sleep(nanoseconds: 1_000_000_000)
        }
    }

    /// Downloads the transcript of a completed job.
    ///
    /// Returns the `text` field when the body decodes as JSON with that field;
    /// otherwise falls back to the raw body as a UTF-8 string when non-empty.
    private func fetchTranscript(id: String, apiKey: String) async throws -> String {
        let request = try makeRequest(path: "/transcriptions/\(id)/transcript", method: "GET", apiKey: apiKey)

        let (data, response) = try await URLSession.shared.data(for: request)
        try validate(response, data: data)

        if let decoded = try? JSONDecoder().decode(TranscriptResponse.self, from: data) {
            return decoded.text
        }
        if let asString = String(data: data, encoding: .utf8), !asString.isEmpty {
            return asString
        }
        throw CloudTranscriptionError.noTranscriptionReturned
    }

    /// Builds a single-part `multipart/form-data` body containing the file
    /// under the form field `file`, labelled as `audio/wav`.
    ///
    /// - Throws: `CloudTranscriptionError.audioFileNotFound` when the file cannot be read.
    private func createMultipartBody(fileURL: URL, boundary: String) throws -> Data {
        guard let audioData = try? Data(contentsOf: fileURL) else {
            throw CloudTranscriptionError.audioFileNotFound
        }

        let crlf = "\r\n"
        // A double quote in the name would terminate the quoted `filename` parameter early.
        let fileName = fileURL.lastPathComponent.replacingOccurrences(of: "\"", with: "%22")

        var body = Data()
        body.append(Data("--\(boundary)\(crlf)".utf8))
        body.append(Data("Content-Disposition: form-data; name=\"file\"; filename=\"\(fileName)\"\(crlf)".utf8))
        body.append(Data("Content-Type: audio/wav\(crlf)\(crlf)".utf8))
        body.append(audioData)
        body.append(Data(crlf.utf8))
        body.append(Data("--\(boundary)--\(crlf)".utf8))
        return body
    }

    /// Returns the custom vocabulary words stored in `UserDefaults` under
    /// `"CustomDictionaryItems"`: trimmed, non-empty, and de-duplicated
    /// case-insensitively with first-seen order and spelling preserved.
    /// Returns an empty array when nothing is stored or it is not a JSON
    /// array of objects.
    private func getCustomDictionaryTerms() -> [String] {
        guard let data = UserDefaults.standard.data(forKey: "CustomDictionaryItems") else {
            return []
        }
        // Decode without depending on UI layer types; extract "word" strings
        guard let json = try? JSONSerialization.jsonObject(with: data) as? [[String: Any]] else {
            return []
        }
        let words = json.compactMap { $0["word"] as? String }
            .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
            .filter { !$0.isEmpty }

        // De-duplicate while preserving order
        var seen = Set<String>()
        return words.filter { seen.insert($0.lowercased()).inserted }
    }

    /// Credentials needed to call the API.
    private struct APIConfig { let apiKey: String }
    /// Response of the file upload endpoint; only the file id is read.
    private struct FileUploadResponse: Decodable { let id: String }
    /// Response of the create-transcription endpoint; only the job id is read.
    private struct CreateTranscriptionResponse: Decodable { let id: String }
    /// Response of the job status endpoint.
    private struct TranscriptionStatusResponse: Decodable {
        let status: String
        /// Failure detail, when the server provides one (`error_message`).
        let errorMessage: String?

        enum CodingKeys: String, CodingKey {
            case status
            case errorMessage = "error_message"
        }
    }
    /// Response of the transcript endpoint; only the text is read.
    private struct TranscriptResponse: Decodable { let text: String }
}
|
||||
@ -17,7 +17,7 @@ struct APIKeyManagementView: View {
|
||||
// Provider Selection
|
||||
HStack {
|
||||
Picker("AI Provider", selection: $aiService.selectedProvider) {
|
||||
ForEach(AIProvider.allCases.filter { $0 != .elevenLabs && $0 != .deepgram }, id: \.self) { provider in
|
||||
ForEach(AIProvider.allCases.filter { $0 != .elevenLabs && $0 != .deepgram && $0 != .soniox }, id: \.self) { provider in
|
||||
Text(provider.rawValue).tag(provider)
|
||||
}
|
||||
}
|
||||
@ -411,6 +411,8 @@ struct APIKeyManagementView: View {
|
||||
URL(string: "https://elevenlabs.io/speech-synthesis")!
|
||||
case .deepgram:
|
||||
URL(string: "https://console.deepgram.com/api-keys")!
|
||||
case .soniox:
|
||||
URL(string: "https://console.soniox.com/")!
|
||||
case .ollama, .custom:
|
||||
URL(string: "")! // This case should never be reached
|
||||
case .openRouter:
|
||||
|
||||
@ -38,6 +38,8 @@ struct CloudModelCardView: View {
|
||||
return "Mistral"
|
||||
case .gemini:
|
||||
return "Gemini"
|
||||
case .soniox:
|
||||
return "Soniox"
|
||||
default:
|
||||
return model.provider.rawValue
|
||||
}
|
||||
@ -281,6 +283,8 @@ struct CloudModelCardView: View {
|
||||
aiService.selectedProvider = .mistral
|
||||
case .gemini:
|
||||
aiService.selectedProvider = .gemini
|
||||
case .soniox:
|
||||
aiService.selectedProvider = .soniox
|
||||
default:
|
||||
// This case should ideally not be hit for cloud models in this view
|
||||
print("Warning: verifyAPIKey called for unsupported provider \(model.provider.rawValue)")
|
||||
|
||||
@ -56,7 +56,7 @@ struct ModelCardRowView: View {
|
||||
setDefaultAction: setDefaultAction
|
||||
)
|
||||
}
|
||||
case .groq, .elevenLabs, .deepgram, .mistral, .gemini:
|
||||
case .groq, .elevenLabs, .deepgram, .mistral, .gemini, .soniox:
|
||||
if let cloudModel = model as? CloudModel {
|
||||
CloudModelCardView(
|
||||
model: cloudModel,
|
||||
|
||||
@ -224,7 +224,7 @@ struct ModelManagementView: View {
|
||||
case .local:
|
||||
return whisperState.allAvailableModels.filter { $0.provider == .local || $0.provider == .nativeApple || $0.provider == .parakeet }
|
||||
case .cloud:
|
||||
let cloudProviders: [ModelProvider] = [.groq, .elevenLabs, .deepgram, .mistral, .gemini]
|
||||
let cloudProviders: [ModelProvider] = [.groq, .elevenLabs, .deepgram, .mistral, .gemini, .soniox]
|
||||
return whisperState.allAvailableModels.filter { cloudProviders.contains($0.provider) }
|
||||
case .custom:
|
||||
return whisperState.allAvailableModels.filter { $0.provider == .custom }
|
||||
|
||||
@ -29,6 +29,9 @@ extension WhisperState {
|
||||
case .gemini:
|
||||
let key = UserDefaults.standard.string(forKey: "GeminiAPIKey")
|
||||
return key != nil && !key!.isEmpty
|
||||
case .soniox:
|
||||
let key = UserDefaults.standard.string(forKey: "SonioxAPIKey")
|
||||
return key != nil && !key!.isEmpty
|
||||
case .custom:
|
||||
// Custom models are always usable since they contain their own API keys
|
||||
return true
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user