From 2708cc502a097e2189efbdd694b918443f64dae3 Mon Sep 17 00:00:00 2001
From: Beingpax
Date: Mon, 25 Aug 2025 12:00:15 +0545
Subject: [PATCH] Added support for Gemini models for transcription

---
 VoiceInk/Models/PredefinedModels.swift        |  22 +++
 VoiceInk/Models/TranscriptionModel.swift      |   1 +
 .../CloudTranscriptionService.swift           |   3 +
 .../GeminiTranscriptionService.swift          | 158 ++++++++++++++++++
 .../AI Models/CloudModelCardRowView.swift     |   4 +
 .../Views/AI Models/ModelCardRowView.swift    |   2 +-
 .../Views/AI Models/ModelManagementView.swift |   2 +-
 .../Whisper/WhisperState+ModelQueries.swift   |   3 +
 8 files changed, 193 insertions(+), 2 deletions(-)
 create mode 100644 VoiceInk/Services/CloudTranscription/GeminiTranscriptionService.swift

diff --git a/VoiceInk/Models/PredefinedModels.swift b/VoiceInk/Models/PredefinedModels.swift
index 27e958a..d8d8f70 100644
--- a/VoiceInk/Models/PredefinedModels.swift
+++ b/VoiceInk/Models/PredefinedModels.swift
@@ -232,6 +232,28 @@ import Foundation
             accuracy: 0.97,
             isMultilingual: true,
             supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .mistral)
+        ),
+
+        // Gemini Models
+        CloudModel(
+            name: "gemini-2.5-pro",
+            displayName: "Gemini 2.5 Pro",
+            description: "Google's advanced multimodal model with high-quality transcription capabilities.",
+            provider: .gemini,
+            speed: 0.7,
+            accuracy: 0.96,
+            isMultilingual: true,
+            supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .gemini)
+        ),
+        CloudModel(
+            name: "gemini-2.5-flash",
+            displayName: "Gemini 2.5 Flash",
+            description: "Google's optimized model for low-latency transcription with multimodal support.",
+            provider: .gemini,
+            speed: 0.9,
+            accuracy: 0.94,
+            isMultilingual: true,
+            supportedLanguages: getLanguageDictionary(isMultilingual: true, provider: .gemini)
         )
     ]
 
diff --git a/VoiceInk/Models/TranscriptionModel.swift b/VoiceInk/Models/TranscriptionModel.swift
index d7856db..8d308ee 100644
--- a/VoiceInk/Models/TranscriptionModel.swift
+++ b/VoiceInk/Models/TranscriptionModel.swift
@@ -8,6 +8,7 @@ enum ModelProvider: String, Codable, Hashable, CaseIterable {
     case elevenLabs = "ElevenLabs"
     case deepgram = "Deepgram"
     case mistral = "Mistral"
+    case gemini = "Gemini"
     case custom = "Custom"
     case nativeApple = "Native Apple"
     // Future providers can be added here
diff --git a/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift b/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift
index 9c180b1..a983d9f 100644
--- a/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift
+++ b/VoiceInk/Services/CloudTranscription/CloudTranscriptionService.swift
@@ -39,6 +39,7 @@ class CloudTranscriptionService: TranscriptionService {
     private lazy var elevenLabsService = ElevenLabsTranscriptionService()
     private lazy var deepgramService = DeepgramTranscriptionService()
     private lazy var mistralService = MistralTranscriptionService()
+    private lazy var geminiService = GeminiTranscriptionService()
     private lazy var openAICompatibleService = OpenAICompatibleTranscriptionService()
 
     func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
@@ -53,6 +54,8 @@ class CloudTranscriptionService: TranscriptionService {
             text = try await deepgramService.transcribe(audioURL: audioURL, model: model)
         case .mistral:
             text = try await mistralService.transcribe(audioURL: audioURL, model: model)
+        case .gemini:
+            text = try await geminiService.transcribe(audioURL: audioURL, model: model)
         case .custom:
             guard let customModel = model as? CustomCloudModel else {
                 throw CloudTranscriptionError.unsupportedProvider
diff --git a/VoiceInk/Services/CloudTranscription/GeminiTranscriptionService.swift b/VoiceInk/Services/CloudTranscription/GeminiTranscriptionService.swift
new file mode 100644
index 0000000..2babeef
--- /dev/null
+++ b/VoiceInk/Services/CloudTranscription/GeminiTranscriptionService.swift
@@ -0,0 +1,158 @@
+import Foundation
+import os
+
+class GeminiTranscriptionService {
+    private let logger = Logger(subsystem: "com.voiceink.transcription", category: "GeminiService")
+
+    func transcribe(audioURL: URL, model: any TranscriptionModel) async throws -> String {
+        let config = try getAPIConfig(for: model)
+
+        logger.notice("Starting Gemini transcription with model: \(model.name, privacy: .public)")
+
+        var request = URLRequest(url: config.url)
+        request.httpMethod = "POST"
+        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        request.setValue(config.apiKey, forHTTPHeaderField: "x-goog-api-key")
+
+        guard let audioData = try? Data(contentsOf: audioURL) else {
+            throw CloudTranscriptionError.audioFileNotFound
+        }
+
+        logger.notice("Audio file loaded, size: \(audioData.count) bytes")
+
+        let base64AudioData = audioData.base64EncodedString()
+
+        let requestBody = GeminiRequest(
+            contents: [
+                GeminiContent(
+                    parts: [
+                        .text(GeminiTextPart(text: "Please transcribe this audio file. Provide only the transcribed text.")),
+                        .audio(GeminiAudioPart(
+                            inlineData: GeminiInlineData(
+                                mimeType: "audio/wav",
+                                data: base64AudioData
+                            )
+                        ))
+                    ]
+                )
+            ]
+        )
+
+        do {
+            let jsonData = try JSONEncoder().encode(requestBody)
+            request.httpBody = jsonData
+            logger.notice("Request body encoded, sending to Gemini API")
+        } catch {
+            logger.error("Failed to encode Gemini request: \(error.localizedDescription)")
+            throw CloudTranscriptionError.dataEncodingError
+        }
+
+        let (data, response) = try await URLSession.shared.data(for: request)
+        guard let httpResponse = response as? HTTPURLResponse else {
+            throw CloudTranscriptionError.networkError(URLError(.badServerResponse))
+        }
+
+        if !(200...299).contains(httpResponse.statusCode) {
+            let errorMessage = String(data: data, encoding: .utf8) ?? "No error message"
"No error message" + logger.error("Gemini API request failed with status \(httpResponse.statusCode): \(errorMessage, privacy: .public)") + throw CloudTranscriptionError.apiRequestFailed(statusCode: httpResponse.statusCode, message: errorMessage) + } + + do { + let transcriptionResponse = try JSONDecoder().decode(GeminiResponse.self, from: data) + guard let candidate = transcriptionResponse.candidates.first, + let part = candidate.content.parts.first, + !part.text.isEmpty else { + logger.error("No transcript found in Gemini response") + throw CloudTranscriptionError.noTranscriptionReturned + } + logger.notice("Gemini transcription successful, text length: \(part.text.count)") + return part.text.trimmingCharacters(in: .whitespacesAndNewlines) + } catch { + logger.error("Failed to decode Gemini API response: \(error.localizedDescription)") + throw CloudTranscriptionError.noTranscriptionReturned + } + } + + private func getAPIConfig(for model: any TranscriptionModel) throws -> APIConfig { + guard let apiKey = UserDefaults.standard.string(forKey: "GeminiAPIKey"), !apiKey.isEmpty else { + throw CloudTranscriptionError.missingAPIKey + } + + let urlString = "https://generativelanguage.googleapis.com/v1beta/models/\(model.name):generateContent" + guard let apiURL = URL(string: urlString) else { + throw CloudTranscriptionError.dataEncodingError + } + + return APIConfig(url: apiURL, apiKey: apiKey, modelName: model.name) + } + + private struct APIConfig { + let url: URL + let apiKey: String + let modelName: String + } + + private struct GeminiRequest: Codable { + let contents: [GeminiContent] + } + + private struct GeminiContent: Codable { + let parts: [GeminiPart] + } + + private enum GeminiPart: Codable { + case text(GeminiTextPart) + case audio(GeminiAudioPart) + + func encode(to encoder: Encoder) throws { + var container = encoder.singleValueContainer() + switch self { + case .text(let textPart): + try container.encode(textPart) + case .audio(let audioPart): + try container.encode(audioPart) + } + } + + init(from decoder: Decoder) throws { + let container = try decoder.singleValueContainer() + if let textPart = try? container.decode(GeminiTextPart.self) { + self = .text(textPart) + } else if let audioPart = try? 
+                self = .audio(audioPart)
+            } else {
+                throw DecodingError.dataCorrupted(DecodingError.Context(codingPath: decoder.codingPath, debugDescription: "Invalid part"))
+            }
+        }
+    }
+
+    private struct GeminiTextPart: Codable {
+        let text: String
+    }
+
+    private struct GeminiAudioPart: Codable {
+        let inlineData: GeminiInlineData
+    }
+
+    private struct GeminiInlineData: Codable {
+        let mimeType: String
+        let data: String
+    }
+
+    private struct GeminiResponse: Codable {
+        let candidates: [GeminiCandidate]
+    }
+
+    private struct GeminiCandidate: Codable {
+        let content: GeminiResponseContent
+    }
+
+    private struct GeminiResponseContent: Codable {
+        let parts: [GeminiResponsePart]
+    }
+
+    private struct GeminiResponsePart: Codable {
+        let text: String
+    }
+}
\ No newline at end of file
diff --git a/VoiceInk/Views/AI Models/CloudModelCardRowView.swift b/VoiceInk/Views/AI Models/CloudModelCardRowView.swift
index f01954b..04fc2f8 100644
--- a/VoiceInk/Views/AI Models/CloudModelCardRowView.swift
+++ b/VoiceInk/Views/AI Models/CloudModelCardRowView.swift
@@ -36,6 +36,8 @@ struct CloudModelCardView: View {
             return "Deepgram"
         case .mistral:
             return "Mistral"
+        case .gemini:
+            return "Gemini"
         default:
             return model.provider.rawValue
         }
@@ -277,6 +279,8 @@ struct CloudModelCardView: View {
             aiService.selectedProvider = .deepgram
         case .mistral:
             aiService.selectedProvider = .mistral
+        case .gemini:
+            aiService.selectedProvider = .gemini
         default:
             // This case should ideally not be hit for cloud models in this view
             print("Warning: verifyAPIKey called for unsupported provider \(model.provider.rawValue)")
diff --git a/VoiceInk/Views/AI Models/ModelCardRowView.swift b/VoiceInk/Views/AI Models/ModelCardRowView.swift
index 78cd3d9..e33fc71 100644
--- a/VoiceInk/Views/AI Models/ModelCardRowView.swift
+++ b/VoiceInk/Views/AI Models/ModelCardRowView.swift
@@ -55,7 +55,7 @@ struct ModelCardRowView: View {
                     setDefaultAction: setDefaultAction
                 )
             }
-        case .groq, .elevenLabs, .deepgram, .mistral:
+        case .groq, .elevenLabs, .deepgram, .mistral, .gemini:
             if let cloudModel = model as? CloudModel {
                 CloudModelCardView(
                     model: cloudModel,
diff --git a/VoiceInk/Views/AI Models/ModelManagementView.swift b/VoiceInk/Views/AI Models/ModelManagementView.swift
index 3204d2a..65b2964 100644
--- a/VoiceInk/Views/AI Models/ModelManagementView.swift
+++ b/VoiceInk/Views/AI Models/ModelManagementView.swift
@@ -218,7 +218,7 @@ struct ModelManagementView: View {
         case .local:
             return whisperState.allAvailableModels.filter { $0.provider == .local || $0.provider == .nativeApple || $0.provider == .parakeet }
         case .cloud:
-            let cloudProviders: [ModelProvider] = [.groq, .elevenLabs, .deepgram, .mistral]
+            let cloudProviders: [ModelProvider] = [.groq, .elevenLabs, .deepgram, .mistral, .gemini]
             return whisperState.allAvailableModels.filter { cloudProviders.contains($0.provider) }
         case .custom:
             return whisperState.allAvailableModels.filter { $0.provider == .custom }
diff --git a/VoiceInk/Whisper/WhisperState+ModelQueries.swift b/VoiceInk/Whisper/WhisperState+ModelQueries.swift
index f38d39a..8ecb840 100644
--- a/VoiceInk/Whisper/WhisperState+ModelQueries.swift
+++ b/VoiceInk/Whisper/WhisperState+ModelQueries.swift
@@ -26,6 +26,9 @@ extension WhisperState {
         case .mistral:
             let key = UserDefaults.standard.string(forKey: "MistralAPIKey")
             return key != nil && !key!.isEmpty
+        case .gemini:
+            let key = UserDefaults.standard.string(forKey: "GeminiAPIKey")
+            return key != nil && !key!.isEmpty
         case .custom:
             // Custom models are always usable since they contain their own API keys
             return true
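
Usage sketch (not part of the patch): a minimal example of how the new Gemini path might be exercised end to end. The "GeminiAPIKey" UserDefaults key, the "gemini-2.5-flash" model name, and the transcribe(audioURL:model:) signature come from the diff above; the PredefinedModels.models lookup and the parameterless CloudTranscriptionService() initializer are assumptions about surrounding VoiceInk code not shown here.

import Foundation

// Hypothetical caller; names not present in the diff are assumptions.
func transcribeRecordingWithGemini(at audioURL: URL) async throws -> String {
    // GeminiTranscriptionService reads the key from UserDefaults under "GeminiAPIKey",
    // so it must be stored before transcription starts (here taken from the environment).
    if let key = ProcessInfo.processInfo.environment["GEMINI_API_KEY"] {
        UserDefaults.standard.set(key, forKey: "GeminiAPIKey")
    }

    // Pick one of the Gemini entries added to the predefined model list
    // (lookup by name is an assumption about how PredefinedModels is exposed).
    guard let model = PredefinedModels.models.first(where: { $0.name == "gemini-2.5-flash" }) else {
        throw CloudTranscriptionError.unsupportedProvider
    }

    // CloudTranscriptionService routes provider == .gemini to GeminiTranscriptionService,
    // which uploads the audio as base64 inlineData to the generateContent endpoint.
    let service = CloudTranscriptionService()
    return try await service.transcribe(audioURL: audioURL, model: model)
}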