Retranscription Ability

This commit is contained in:
Beingpax 2025-03-16 17:15:52 +05:45
parent 6aafc992e2
commit 8b1e27e1cd
4 changed files with 371 additions and 17 deletions

View File

@ -0,0 +1,201 @@
import Foundation
import SwiftUI
import AVFoundation
import SwiftData
import os
@MainActor
class AudioTranscriptionService: ObservableObject {
    // UI-facing state: views observe these to drive progress / error display.
    @Published var isTranscribing = false
    @Published var messageLog = ""
    @Published var currentError: TranscriptionError?

    private var whisperContext: WhisperContext?
    private let modelContext: ModelContext
    private let enhancementService: AIEnhancementService?
    private let whisperState: WhisperState
    private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "AudioTranscriptionService")

    enum TranscriptionError: Error {
        case noAudioFile
        case transcriptionFailed
        case modelNotLoaded
        case invalidAudioFormat
    }

    init(modelContext: ModelContext, whisperState: WhisperState) {
        self.modelContext = modelContext
        self.whisperState = whisperState
        // Reuse the enhancement service configured on WhisperState so live
        // recording and retranscription share the same AI settings.
        self.enhancementService = whisperState.enhancementService
    }

    /// Re-runs Whisper transcription on an existing audio file.
    ///
    /// Copies the audio into the app's Recordings directory, transcribes it,
    /// optionally enhances the text with AI, persists a `Transcription` via
    /// SwiftData, and returns it.
    /// - Parameters:
    ///   - url: Location of the source audio (expected 16-bit PCM WAV).
    ///   - whisperModel: Model whose weights are loaded on first use and cached.
    /// - Throws: `TranscriptionError.noAudioFile`, `.modelNotLoaded`,
    ///   `.invalidAudioFormat`, or file-system / transcription errors.
    func retranscribeAudio(from url: URL, using whisperModel: WhisperModel) async throws -> Transcription {
        guard FileManager.default.fileExists(atPath: url.path) else {
            throw TranscriptionError.noAudioFile
        }

        isTranscribing = true
        messageLog = "Loading model...\n"
        // FIX: the original left `isTranscribing == true` on several throw
        // paths; `defer` guarantees the flag clears on every exit.
        defer { isTranscribing = false }

        // Load the model once; later calls reuse the cached context.
        if whisperContext == nil {
            do {
                whisperContext = try await WhisperContext.createContext(path: whisperModel.url.path)
                messageLog += "Model loaded successfully.\n"
            } catch {
                logger.error("❌ Failed to load model: \(error.localizedDescription)")
                messageLog += "Failed to load model: \(error.localizedDescription)\n"
                currentError = .modelNotLoaded
                throw TranscriptionError.modelNotLoaded
            }
        }
        guard let whisperContext = whisperContext else {
            currentError = .modelNotLoaded
            throw TranscriptionError.modelNotLoaded
        }

        let duration = try await audioDuration(of: url)
        let permanentURL = try makePermanentCopy(of: url)

        messageLog += "Transcribing audio...\n"
        do {
            let samples = try readAudioSamples(permanentURL)

            // Use the same prompt as live recording so output style matches.
            messageLog += "Setting prompt: \(whisperState.whisperPrompt.transcriptionPrompt)\n"
            await whisperContext.setPrompt(whisperState.whisperPrompt.transcriptionPrompt)
            try await whisperContext.fullTranscribe(samples: samples)
            let text = await whisperContext.getTranscription()
                .trimmingCharacters(in: .whitespacesAndNewlines)
            logger.notice("✅ Retranscription completed successfully, length: \(text.count) characters")

            // Best-effort enhancement: a failure falls back to the raw text.
            var enhancedText: String? = nil
            if let enhancementService = enhancementService,
               enhancementService.isEnhancementEnabled,
               enhancementService.isConfigured {
                do {
                    messageLog += "Enhancing transcription with AI...\n"
                    enhancedText = try await enhancementService.enhance(text)
                    messageLog += "Enhancement completed.\n"
                } catch {
                    messageLog += "Enhancement failed: \(error.localizedDescription). Using original transcription.\n"
                }
            }

            let transcription = persistTranscription(
                text: text,
                enhancedText: enhancedText,
                duration: duration,
                audioFileURL: permanentURL.absoluteString
            )
            messageLog += "Done: \(enhancedText ?? text)\n"
            return transcription
        } catch {
            logger.error("❌ Transcription failed: \(error.localizedDescription)")
            messageLog += "Transcription failed: \(error.localizedDescription)\n"
            currentError = .transcriptionFailed
            throw error
        }
    }

    // MARK: - Private helpers

    /// Duration of the audio in seconds, using the async loader on macOS 13+.
    private func audioDuration(of url: URL) async throws -> TimeInterval {
        let asset = AVURLAsset(url: url)
        if #available(macOS 13.0, *) {
            return CMTimeGetSeconds(try await asset.load(.duration))
        } else {
            return CMTimeGetSeconds(asset.duration)
        }
    }

    /// Copies the source audio into the app's Recordings directory.
    /// FIX: the original never created the directory, so the copy failed on a
    /// fresh install; `createDirectory` is a no-op when it already exists.
    private func makePermanentCopy(of url: URL) throws -> URL {
        let recordingsDirectory = FileManager.default
            .urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
            .appendingPathComponent("com.prakashjoshipax.VoiceInk")
            .appendingPathComponent("Recordings")
        do {
            try FileManager.default.createDirectory(at: recordingsDirectory, withIntermediateDirectories: true)
            let destination = recordingsDirectory
                .appendingPathComponent("retranscribed_\(UUID().uuidString).wav")
            try FileManager.default.copyItem(at: url, to: destination)
            return destination
        } catch {
            logger.error("❌ Failed to create permanent copy of audio: \(error.localizedDescription)")
            messageLog += "Failed to create permanent copy of audio: \(error.localizedDescription)\n"
            throw error
        }
    }

    /// Inserts and saves a `Transcription`, deduplicating the three identical
    /// insert/save blocks in the original. Save failures are logged, not
    /// thrown, preserving the original best-effort behavior.
    private func persistTranscription(
        text: String,
        enhancedText: String?,
        duration: TimeInterval,
        audioFileURL: String
    ) -> Transcription {
        let transcription: Transcription
        if let enhancedText = enhancedText {
            transcription = Transcription(
                text: text,
                duration: duration,
                enhancedText: enhancedText,
                audioFileURL: audioFileURL
            )
        } else {
            transcription = Transcription(
                text: text,
                duration: duration,
                audioFileURL: audioFileURL
            )
        }
        modelContext.insert(transcription)
        do {
            try modelContext.save()
        } catch {
            logger.error("❌ Failed to save transcription: \(error.localizedDescription)")
            messageLog += "Failed to save transcription: \(error.localizedDescription)\n"
        }
        return transcription
    }

    private func readAudioSamples(_ url: URL) throws -> [Float] {
        try decodeWaveFile(url)
    }

    /// Decodes 16-bit little-endian PCM samples from a WAV file into
    /// normalized floats in [-1, 1].
    /// NOTE(review): assumes the canonical 44-byte header layout; files with
    /// extra chunks before `data` would decode incorrectly — confirm the
    /// app's recordings always use the canonical layout.
    private func decodeWaveFile(_ url: URL) throws -> [Float] {
        let data = try Data(contentsOf: url)
        // FIX: guard against truncated files — the original sliced past the
        // end of the buffer for files shorter than 46 bytes or with an odd
        // trailing byte (`$0..<$0 + 2` could exceed `data.count`).
        guard data.count > 44 else {
            throw TranscriptionError.invalidAudioFormat
        }
        // Stop two bytes before the end so the final 16-bit read stays in bounds.
        return stride(from: 44, to: data.count - 1, by: 2).map { offset in
            data[offset..<offset + 2].withUnsafeBytes { raw -> Float in
                let sample = Int16(littleEndian: raw.load(as: Int16.self))
                return max(-1.0, min(Float(sample) / 32767.0, 1.0))
            }
        }
    }
}

View File

@ -237,6 +237,22 @@ struct AudioPlayerView: View {
@StateObject private var playerManager = AudioPlayerManager()
@State private var isHovering = false
@State private var showingTooltip = false
@State private var isRetranscribing = false
@State private var showRetranscribeSuccess = false
@State private var showRetranscribeError = false
@State private var errorMessage = ""
// Add environment objects for retranscription
@EnvironmentObject private var whisperState: WhisperState
@Environment(\.modelContext) private var modelContext
    // NOTE(review): despite the original "lazily" wording, a computed property
    // constructs a NEW AudioTranscriptionService on every access, so any model
    // context cached inside the service is rebuilt for each retranscription.
    // Consider a @StateObject if repeated model loads prove costly — confirm
    // with the service's caching behavior.
    private var transcriptionService: AudioTranscriptionService {
    AudioTranscriptionService(
    modelContext: modelContext,
    whisperState: whisperState
    )
    }
var body: some View {
VStack(spacing: 16) {
@ -298,6 +314,34 @@ struct AudioPlayerView: View {
}
}
// Add Retranscribe button
Button(action: {
retranscribeAudio()
}) {
Circle()
.fill(Color.green.opacity(0.1))
.frame(width: 44, height: 44)
.overlay(
Group {
if isRetranscribing {
ProgressView()
.controlSize(.small)
} else if showRetranscribeSuccess {
Image(systemName: "checkmark")
.font(.system(size: 18, weight: .semibold))
.foregroundStyle(Color.green)
} else {
Image(systemName: "arrow.clockwise")
.font(.system(size: 18, weight: .semibold))
.foregroundStyle(Color.green)
}
}
)
}
.buttonStyle(.plain)
.disabled(isRetranscribing)
.help("Retranscribe this audio")
// Time
Text(formatTime(playerManager.currentTime))
.font(.system(size: 14, weight: .medium))
@ -311,6 +355,55 @@ struct AudioPlayerView: View {
.onAppear {
playerManager.loadAudio(from: url)
}
.overlay(
// Success notification
VStack {
if showRetranscribeSuccess {
HStack(spacing: 8) {
Image(systemName: "checkmark.circle.fill")
.foregroundColor(.green)
Text("Retranscription successful")
.font(.system(size: 14, weight: .medium))
}
.padding(.horizontal, 16)
.padding(.vertical, 10)
.background(
RoundedRectangle(cornerRadius: 8)
.fill(Color.green.opacity(0.1))
.overlay(
RoundedRectangle(cornerRadius: 8)
.stroke(Color.green.opacity(0.2), lineWidth: 1)
)
)
.transition(.move(edge: .top).combined(with: .opacity))
}
if showRetranscribeError {
HStack(spacing: 8) {
Image(systemName: "exclamationmark.circle.fill")
.foregroundColor(.red)
Text(errorMessage.isEmpty ? "Retranscription failed" : errorMessage)
.font(.system(size: 14, weight: .medium))
}
.padding(.horizontal, 16)
.padding(.vertical, 10)
.background(
RoundedRectangle(cornerRadius: 8)
.fill(Color.red.opacity(0.1))
.overlay(
RoundedRectangle(cornerRadius: 8)
.stroke(Color.red.opacity(0.2), lineWidth: 1)
)
)
.transition(.move(edge: .top).combined(with: .opacity))
}
Spacer()
}
.padding(.top, 16)
.animation(.spring(response: 0.3, dampingFraction: 0.7), value: showRetranscribeSuccess)
.animation(.spring(response: 0.3, dampingFraction: 0.7), value: showRetranscribeError)
)
}
private func formatTime(_ time: TimeInterval) -> String {
@ -318,4 +411,57 @@ struct AudioPlayerView: View {
let seconds = Int(time) % 60
return String(format: "%d:%02d", minutes, seconds)
}
private func retranscribeAudio() {
guard let currentModel = whisperState.currentModel else {
errorMessage = "No transcription model selected"
showRetranscribeError = true
// Hide error after 3 seconds
DispatchQueue.main.asyncAfter(deadline: .now() + 3) {
withAnimation {
showRetranscribeError = false
}
}
return
}
isRetranscribing = true
Task {
do {
// Use the AudioTranscriptionService to retranscribe the audio
let _ = try await transcriptionService.retranscribeAudio(
from: url,
using: currentModel
)
// Show success notification
await MainActor.run {
isRetranscribing = false
showRetranscribeSuccess = true
// Hide success after 3 seconds
DispatchQueue.main.asyncAfter(deadline: .now() + 3) {
withAnimation {
showRetranscribeSuccess = false
}
}
}
} catch {
await MainActor.run {
isRetranscribing = false
errorMessage = error.localizedDescription
showRetranscribeError = true
// Hide error after 3 seconds
DispatchQueue.main.asyncAfter(deadline: .now() + 3) {
withAnimation {
showRetranscribeError = false
}
}
}
}
}
}
}

View File

@ -17,8 +17,8 @@ struct TranscriptionHistoryView: View {
@State private var lastTimestamp: Date?
private let pageSize = 20
// Query for latest transcriptions (used only for monitoring new additions)
@Query(sort: \Transcription.timestamp, order: .reverse)
// Query for latest transcriptions (used for real-time updates)
@Query(sort: \Transcription.timestamp, order: .reverse, animation: .default)
private var latestTranscriptions: [Transcription]
// Cursor-based query descriptor
@ -130,9 +130,27 @@ struct TranscriptionHistoryView: View {
await loadInitialContent()
}
}
// Monitor for new transcriptions
.onChange(of: latestTranscriptions) { _, newTranscriptions in
handleNewTranscriptions(newTranscriptions)
// Improved change detection for new transcriptions
.onChange(of: latestTranscriptions) { oldValue, newValue in
// Check if a new transcription was added
if !newValue.isEmpty && (oldValue.isEmpty || newValue[0].id != oldValue[0].id) {
// Only refresh if we're on the first page (no pagination cursor set)
if lastTimestamp == nil {
Task {
await loadInitialContent()
}
} else {
// If we're on a paginated view, show a notification or indicator that new content is available
// This could be a banner or button to "Show new transcriptions"
withAnimation {
// Reset pagination to show the latest content
Task {
await resetPagination()
await loadInitialContent()
}
}
}
}
}
}
@ -305,17 +323,6 @@ struct TranscriptionHistoryView: View {
}
}
    // Refreshes the visible list when the @Query publishes new transcriptions.
    private func handleNewTranscriptions(_ newTranscriptions: [Transcription]) {
    // Only update if we're on the first page and not searching
    // Only check the first item since we only care about the newest one
    // Guarding on lastTimestamp == nil avoids clobbering a paginated scroll
    // position, and searchText.isEmpty avoids clobbering search results.
    if lastTimestamp == nil && searchText.isEmpty && !newTranscriptions.isEmpty {
    Task {
    await loadInitialContent()
    }
    }
    }
// Modified function to select all transcriptions in the database
private func selectAllTranscriptions() async {
do {

View File

@ -60,7 +60,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
let modelsDirectory: URL
let recordingsDirectory: URL
private let enhancementService: AIEnhancementService?
let enhancementService: AIEnhancementService?
private let licenseViewModel: LicenseViewModel
private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "WhisperState")
private var transcriptionStartTime: Date?