From 308098694de982e6cd36ecd07ce6089aa22ec039 Mon Sep 17 00:00:00 2001 From: Beingpax Date: Mon, 17 Nov 2025 10:14:32 +0545 Subject: [PATCH] Add support for scribe v2 model --- VoiceInk/Models/PredefinedModels.swift | 14 ++- .../ElevenLabsTranscriptionService.swift | 105 +++++++----------- 2 files changed, 54 insertions(+), 65 deletions(-) diff --git a/VoiceInk/Models/PredefinedModels.swift b/VoiceInk/Models/PredefinedModels.swift index 9fb28f0..8e12bcb 100644 --- a/VoiceInk/Models/PredefinedModels.swift +++ b/VoiceInk/Models/PredefinedModels.swift @@ -205,14 +205,24 @@ import Foundation ), CloudModel( name: "scribe_v1", - displayName: "Scribe v1 (ElevenLabs)", - description: "ElevenLabs' Scribe model for fast and accurate transcription.", + displayName: "Scribe v1", + description: "ElevenLabs' Scribe model for fast & accurate transcription.", provider: .elevenLabs, speed: 0.7, accuracy: 0.98, isMultilingual: true, supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .elevenLabs) ), + CloudModel( + name: "scribe_v2_realtime", + displayName: "Scribe v2", + description: "ElevenLabs' Scribe v2 model for the most accurate transcription.", + provider: .elevenLabs, + speed: 0.75, + accuracy: 0.99, + isMultilingual: true, + supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .elevenLabs) + ), CloudModel( name: "nova-2", displayName: "Nova (Deepgram)", diff --git a/VoiceInk/Services/CloudTranscription/ElevenLabsTranscriptionService.swift b/VoiceInk/Services/CloudTranscription/ElevenLabsTranscriptionService.swift index 3cadccb..f2ede32 100644 --- a/VoiceInk/Services/CloudTranscription/ElevenLabsTranscriptionService.swift +++ b/VoiceInk/Services/CloudTranscription/ElevenLabsTranscriptionService.swift @@ -1,107 +1,86 @@ import Foundation +import OSLog class ElevenLabsTranscriptionService { + private let apiURL = URL(string: "https://api.elevenlabs.io/v1/speech-to-text")! + private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "ElevenLabsTranscriptionService") func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String { - let config = try getAPIConfig(for: model) + guard let apiKey = UserDefaults.standard.string(forKey: "ElevenLabsAPIKey"), !apiKey.isEmpty else { + throw CloudTranscriptionError.missingAPIKey + } let boundary = "Boundary-\(UUID().uuidString)" - var request = URLRequest(url: config.url) + var request = URLRequest(url: apiURL) request.httpMethod = "POST" request.setValue("multipart/form-data; boundary=\(boundary)", forHTTPHeaderField: "Content-Type") request.setValue("application/json", forHTTPHeaderField: "Accept") - request.setValue(config.apiKey, forHTTPHeaderField: "xi-api-key") + request.setValue(apiKey, forHTTPHeaderField: "xi-api-key") - let body = try createElevenLabsRequestBody(audioURL: audioURL, modelName: config.modelName, boundary: boundary) + let body = try createRequestBody(audioURL: audioURL, modelName: model.name, boundary: boundary) let (data, response) = try await URLSession.shared.upload(for: request, from: body) + guard let httpResponse = response as? HTTPURLResponse else { throw CloudTranscriptionError.networkError(URLError(.badServerResponse)) } + logger.notice("ElevenLabs API Response Status: \(httpResponse.statusCode)") + if let responseBody = String(data: data, encoding: .utf8) { + logger.notice("ElevenLabs API Response Body: \(responseBody)") + } + if !(200...299).contains(httpResponse.statusCode) { let errorMessage = String(data: data, encoding: .utf8) ?? "No error message" throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage) } do { - let transcriptionResponse = try JSONDecoder().decode(TranscriptionResponse.self, from: data) + let transcriptionResponse = try JSONDecoder().decode(ElevenLabsTranscriptionResponse.self, from: data) return transcriptionResponse.text } catch { throw CloudTranscriptionError.noTranscriptionReturned } } - private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig { - guard let apiKey = UserDefaults.standard.string(forKey: "ElevenLabsAPIKey"), !apiKey.isEmpty else { - throw CloudTranscriptionError.missingAPIKey - } - - guard let apiURL = URL(string: "https://api.elevenlabs.io/v1/speech-to-text") else { - throw NSError(domain: "ElevenLabsTranscriptionService", code: -1, userInfo: [NSLocalizedDescriptionKey: "Invalid API URL"]) - } - return APIConfig(url: apiURL, apiKey: apiKey, modelName: model.name) - } - - private func createElevenLabsRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data { + private func createRequestBody(audioURL: URL, modelName: String, boundary: String) throws -> Data { var body = Data() - let crlf = "\r\n" - guard let audioData = try? Data(contentsOf: audioURL) else { - throw CloudTranscriptionError.audioFileNotFound - } - - // File - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"file\"; filename=\"\(audioURL.lastPathComponent)\"\(crlf)".data(using: .utf8)!) - body.append("Content-Type: audio/wav\(crlf)\(crlf)".data(using: .utf8)!) - body.append(audioData) - body.append(crlf.data(using: .utf8)!) - - // Model ID - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"model_id\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append(modelName.data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) - - // Disable audio event tagging - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"tag_audio_events\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append("false".data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) - - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"temperature\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append("0".data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) + body.append(formField: "file", fileName: audioURL.lastPathComponent, fileData: try Data(contentsOf: audioURL), mimeType: "audio/wav", boundary: boundary) + body.append(formField: "model_id", value: modelName, boundary: boundary) + body.append(formField: "temperature", value: "0.0", boundary: boundary) + body.append(formField: "tag_audio_events", value: "false", boundary: boundary) let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto" if selectedLanguage != "auto", !selectedLanguage.isEmpty { - body.append("--\(boundary)\(crlf)".data(using: .utf8)!) - body.append("Content-Disposition: form-data; name=\"language_code\"\(crlf)\(crlf)".data(using: .utf8)!) - body.append(selectedLanguage.data(using: .utf8)!) - body.append(crlf.data(using: .utf8)!) + body.append(formField: "language_code", value: selectedLanguage, boundary: boundary) } - body.append("--\(boundary)--\(crlf)".data(using: .utf8)!) + body.append("--\(boundary)--\r\n".data(using: .utf8)!) return body } - private struct APIConfig { - let url: URL - let apiKey: String - let modelName: String + private struct ElevenLabsTranscriptionResponse: Decodable { + let text: String + } +} + +private extension Data { + mutating func append(formField: String, value: String, boundary: String) { + let crlf = "\r\n" + append("--\(boundary)\(crlf)".data(using: .utf8)!) + append("Content-Disposition: form-data; name=\"\(formField)\"\(crlf)\(crlf)".data(using: .utf8)!) + append(value.data(using: .utf8)!) + append(crlf.data(using: .utf8)!) } - private struct TranscriptionResponse: Decodable { - let text: String - let language: String? - let duration: Double? - let x_groq: GroqMetadata? - - struct GroqMetadata: Decodable { - let id: String? - } + mutating func append(formField: String, fileName: String, fileData: Data, mimeType: String, boundary: String) { + let crlf = "\r\n" + append("--\(boundary)\(crlf)".data(using: .utf8)!) + append("Content-Disposition: form-data; name=\"\(formField)\"; filename=\"\(fileName)\"\(crlf)".data(using: .utf8)!) + append("Content-Type: \(mimeType)\(crlf)\(crlf)".data(using: .utf8)!) + append(fileData) + append(crlf.data(using: .utf8)!) } } \ No newline at end of file