Retranscription Ability

This commit is contained in:
Beingpax 2025-03-16 17:15:52 +05:45
parent 6aafc992e2
commit 8b1e27e1cd
4 changed files with 371 additions and 17 deletions

View File

@ -0,0 +1,201 @@
import Foundation
import SwiftUI
import AVFoundation
import SwiftData
import os
@MainActor
class AudioTranscriptionService: ObservableObject {
    // UI-facing state: views observe these to drive progress / error display.
    @Published var isTranscribing = false
    @Published var messageLog = ""
    @Published var currentError: TranscriptionError?

    private var whisperContext: WhisperContext?
    private let modelContext: ModelContext
    private let enhancementService: AIEnhancementService?
    private let whisperState: WhisperState
    private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "AudioTranscriptionService")

    enum TranscriptionError: Error {
        case noAudioFile
        case transcriptionFailed
        case modelNotLoaded
        case invalidAudioFormat
    }

    init(modelContext: ModelContext, whisperState: WhisperState) {
        self.modelContext = modelContext
        self.whisperState = whisperState
        // Reuse the enhancement service configured on WhisperState so live
        // recording and retranscription share the same AI settings.
        self.enhancementService = whisperState.enhancementService
    }

    /// Re-runs Whisper transcription on an existing audio file.
    ///
    /// Copies the audio into the app's Recordings directory, transcribes it,
    /// optionally enhances the text with AI, persists a `Transcription` via
    /// SwiftData, and returns it.
    /// - Parameters:
    ///   - url: Location of the source audio (expected 16-bit PCM WAV).
    ///   - whisperModel: Model whose weights are loaded on first use and cached.
    /// - Throws: `TranscriptionError.noAudioFile`, `.modelNotLoaded`,
    ///   `.invalidAudioFormat`, or file-system / transcription errors.
    func retranscribeAudio(from url: URL, using whisperModel: WhisperModel) async throws -> Transcription {
        guard FileManager.default.fileExists(atPath: url.path) else {
            throw TranscriptionError.noAudioFile
        }

        isTranscribing = true
        messageLog = "Loading model...\n"
        // FIX: the original left `isTranscribing == true` on several throw
        // paths; `defer` guarantees the flag clears on every exit.
        defer { isTranscribing = false }

        // Load the model once; later calls reuse the cached context.
        if whisperContext == nil {
            do {
                whisperContext = try await WhisperContext.createContext(path: whisperModel.url.path)
                messageLog += "Model loaded successfully.\n"
            } catch {
                logger.error("❌ Failed to load model: \(error.localizedDescription)")
                messageLog += "Failed to load model: \(error.localizedDescription)\n"
                currentError = .modelNotLoaded
                throw TranscriptionError.modelNotLoaded
            }
        }
        guard let whisperContext = whisperContext else {
            currentError = .modelNotLoaded
            throw TranscriptionError.modelNotLoaded
        }

        let duration = try await audioDuration(of: url)
        let permanentURL = try makePermanentCopy(of: url)

        messageLog += "Transcribing audio...\n"
        do {
            let samples = try readAudioSamples(permanentURL)

            // Use the same prompt as live recording so output style matches.
            messageLog += "Setting prompt: \(whisperState.whisperPrompt.transcriptionPrompt)\n"
            await whisperContext.setPrompt(whisperState.whisperPrompt.transcriptionPrompt)
            try await whisperContext.fullTranscribe(samples: samples)
            let text = await whisperContext.getTranscription()
                .trimmingCharacters(in: .whitespacesAndNewlines)
            logger.notice("✅ Retranscription completed successfully, length: \(text.count) characters")

            // Best-effort enhancement: a failure falls back to the raw text.
            var enhancedText: String? = nil
            if let enhancementService = enhancementService,
               enhancementService.isEnhancementEnabled,
               enhancementService.isConfigured {
                do {
                    messageLog += "Enhancing transcription with AI...\n"
                    enhancedText = try await enhancementService.enhance(text)
                    messageLog += "Enhancement completed.\n"
                } catch {
                    messageLog += "Enhancement failed: \(error.localizedDescription). Using original transcription.\n"
                }
            }

            let transcription = persistTranscription(
                text: text,
                enhancedText: enhancedText,
                duration: duration,
                audioFileURL: permanentURL.absoluteString
            )
            messageLog += "Done: \(enhancedText ?? text)\n"
            return transcription
        } catch {
            logger.error("❌ Transcription failed: \(error.localizedDescription)")
            messageLog += "Transcription failed: \(error.localizedDescription)\n"
            currentError = .transcriptionFailed
            throw error
        }
    }

    // MARK: - Private helpers

    /// Duration of the audio in seconds, using the async loader on macOS 13+.
    private func audioDuration(of url: URL) async throws -> TimeInterval {
        let asset = AVURLAsset(url: url)
        if #available(macOS 13.0, *) {
            return CMTimeGetSeconds(try await asset.load(.duration))
        } else {
            return CMTimeGetSeconds(asset.duration)
        }
    }

    /// Copies the source audio into the app's Recordings directory.
    /// FIX: the original never created the directory, so the copy failed on a
    /// fresh install; `createDirectory` is a no-op when it already exists.
    private func makePermanentCopy(of url: URL) throws -> URL {
        let recordingsDirectory = FileManager.default
            .urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
            .appendingPathComponent("com.prakashjoshipax.VoiceInk")
            .appendingPathComponent("Recordings")
        do {
            try FileManager.default.createDirectory(at: recordingsDirectory, withIntermediateDirectories: true)
            let destination = recordingsDirectory
                .appendingPathComponent("retranscribed_\(UUID().uuidString).wav")
            try FileManager.default.copyItem(at: url, to: destination)
            return destination
        } catch {
            logger.error("❌ Failed to create permanent copy of audio: \(error.localizedDescription)")
            messageLog += "Failed to create permanent copy of audio: \(error.localizedDescription)\n"
            throw error
        }
    }

    /// Inserts and saves a `Transcription`, deduplicating the three identical
    /// insert/save blocks in the original. Save failures are logged, not
    /// thrown, preserving the original best-effort behavior.
    private func persistTranscription(
        text: String,
        enhancedText: String?,
        duration: TimeInterval,
        audioFileURL: String
    ) -> Transcription {
        let transcription: Transcription
        if let enhancedText = enhancedText {
            transcription = Transcription(
                text: text,
                duration: duration,
                enhancedText: enhancedText,
                audioFileURL: audioFileURL
            )
        } else {
            transcription = Transcription(
                text: text,
                duration: duration,
                audioFileURL: audioFileURL
            )
        }
        modelContext.insert(transcription)
        do {
            try modelContext.save()
        } catch {
            logger.error("❌ Failed to save transcription: \(error.localizedDescription)")
            messageLog += "Failed to save transcription: \(error.localizedDescription)\n"
        }
        return transcription
    }

    private func readAudioSamples(_ url: URL) throws -> [Float] {
        try decodeWaveFile(url)
    }

    /// Decodes 16-bit little-endian PCM samples from a WAV file into
    /// normalized floats in [-1, 1].
    /// NOTE(review): assumes the canonical 44-byte header layout; files with
    /// extra chunks before `data` would decode incorrectly — confirm the
    /// app's recordings always use the canonical layout.
    private func decodeWaveFile(_ url: URL) throws -> [Float] {
        let data = try Data(contentsOf: url)
        // FIX: guard against truncated files — the original sliced past the
        // end of the buffer for files shorter than 46 bytes or with an odd
        // trailing byte (`$0..<$0 + 2` could exceed `data.count`).
        guard data.count > 44 else {
            throw TranscriptionError.invalidAudioFormat
        }
        // Stop two bytes before the end so the final 16-bit read stays in bounds.
        return stride(from: 44, to: data.count - 1, by: 2).map { offset in
            data[offset..<offset + 2].withUnsafeBytes { raw -> Float in
                let sample = Int16(littleEndian: raw.load(as: Int16.self))
                return max(-1.0, min(Float(sample) / 32767.0, 1.0))
            }
        }
    }
}

View File

@ -237,6 +237,22 @@ struct AudioPlayerView: View {
@StateObject private var playerManager = AudioPlayerManager()
@State private var isHovering = false
@State private var showingTooltip = false
@State private var isRetranscribing = false
@State private var showRetranscribeSuccess = false
@State private var showRetranscribeError = false
@State private var errorMessage = ""
// Add environment objects for retranscription
@EnvironmentObject private var whisperState: WhisperState
@Environment(\.modelContext) private var modelContext
    // NOTE(review): despite the original "lazily" wording, a computed property
    // constructs a NEW AudioTranscriptionService on every access, so any model
    // context cached inside the service is rebuilt for each retranscription.
    // Consider a @StateObject if repeated model loads prove costly — confirm
    // with the service's caching behavior.
    private var transcriptionService: AudioTranscriptionService {
    AudioTranscriptionService(
    modelContext: modelContext,
    whisperState: whisperState
    )
    }
var body: some View {
VStack(spacing: 16) {
@ -298,6 +314,34 @@ struct AudioPlayerView: View {
}
}
// Add Retranscribe button
Button(action: {
retranscribeAudio()
}) {
Circle()
.fill(Color.green.opacity(0.1))
.frame(width: 44, height: 44)
.overlay(
Group {
if isRetranscribing {
ProgressView()
.controlSize(.small)
} else if showRetranscribeSuccess {
Image(systemName: "checkmark")
.font(.system(size: 18, weight: .semibold))
.foregroundStyle(Color.green)
} else {
Image(systemName: "arrow.clockwise")
.font(.system(size: 18, weight: .semibold))
.foregroundStyle(Color.green)
}
}
)
}
.buttonStyle(.plain)
.disabled(isRetranscribing)
.help("Retranscribe this audio")
// Time
Text(formatTime(playerManager.currentTime))
.font(.system(size: 14, weight: .medium))
@ -311,6 +355,55 @@ struct AudioPlayerView: View {
.onAppear {
playerManager.loadAudio(from: url)
}
.overlay(
// Success notification
VStack {
if showRetranscribeSuccess {
HStack(spacing: 8) {
Image(systemName: "checkmark.circle.fill")
.foregroundColor(.green)
Text("Retranscription successful")
.font(.system(size: 14, weight: .medium))
}
.padding(.horizontal, 16)
.padding(.vertical, 10)
.background(
RoundedRectangle(cornerRadius: 8)
.fill(Color.green.opacity(0.1))
.overlay(
RoundedRectangle(cornerRadius: 8)
.stroke(Color.green.opacity(0.2), lineWidth: 1)
)
)
.transition(.move(edge: .top).combined(with: .opacity))
}
if showRetranscribeError {
HStack(spacing: 8) {
Image(systemName: "exclamationmark.circle.fill")
.foregroundColor(.red)
Text(errorMessage.isEmpty ? "Retranscription failed" : errorMessage)
.font(.system(size: 14, weight: .medium))
}
.padding(.horizontal, 16)
.padding(.vertical, 10)
.background(
RoundedRectangle(cornerRadius: 8)
.fill(Color.red.opacity(0.1))
.overlay(
RoundedRectangle(cornerRadius: 8)
.stroke(Color.red.opacity(0.2), lineWidth: 1)
)
)
.transition(.move(edge: .top).combined(with: .opacity))
}
Spacer()
}
.padding(.top, 16)
.animation(.spring(response: 0.3, dampingFraction: 0.7), value: showRetranscribeSuccess)
.animation(.spring(response: 0.3, dampingFraction: 0.7), value: showRetranscribeError)
)
}
private func formatTime(_ time: TimeInterval) -> String {
@ -318,4 +411,57 @@ struct AudioPlayerView: View {
let seconds = Int(time) % 60
return String(format: "%d:%02d", minutes, seconds)
}
private func retranscribeAudio() {
guard let currentModel = whisperState.currentModel else {
errorMessage = "No transcription model selected"
showRetranscribeError = true
// Hide error after 3 seconds
DispatchQueue.main.asyncAfter(deadline: .now() + 3) {
withAnimation {
showRetranscribeError = false
}
}
return
}
isRetranscribing = true
Task {
do {
// Use the AudioTranscriptionService to retranscribe the audio
let _ = try await transcriptionService.retranscribeAudio(
from: url,
using: currentModel
)
// Show success notification
await MainActor.run {
isRetranscribing = false
showRetranscribeSuccess = true
// Hide success after 3 seconds
DispatchQueue.main.asyncAfter(deadline: .now() + 3) {
withAnimation {
showRetranscribeSuccess = false
}
}
}
} catch {
await MainActor.run {
isRetranscribing = false
errorMessage = error.localizedDescription
showRetranscribeError = true
// Hide error after 3 seconds
DispatchQueue.main.asyncAfter(deadline: .now() + 3) {
withAnimation {
showRetranscribeError = false
}
}
}
}
}
}
}

View File

@ -17,8 +17,8 @@ struct TranscriptionHistoryView: View {
@State private var lastTimestamp: Date?
private let pageSize = 20
// Query for latest transcriptions (used only for monitoring new additions)
@Query(sort: \Transcription.timestamp, order: .reverse)
// Query for latest transcriptions (used for real-time updates)
@Query(sort: \Transcription.timestamp, order: .reverse, animation: .default)
private var latestTranscriptions: [Transcription]
// Cursor-based query descriptor
@ -130,9 +130,27 @@ struct TranscriptionHistoryView: View {
await loadInitialContent()
}
}
// Monitor for new transcriptions
.onChange(of: latestTranscriptions) { _, newTranscriptions in
handleNewTranscriptions(newTranscriptions)
// Improved change detection for new transcriptions
.onChange(of: latestTranscriptions) { oldValue, newValue in
// Check if a new transcription was added
if !newValue.isEmpty && (oldValue.isEmpty || newValue[0].id != oldValue[0].id) {
// Only refresh if we're on the first page (no pagination cursor set)
if lastTimestamp == nil {
Task {
await loadInitialContent()
}
} else {
// If we're on a paginated view, show a notification or indicator that new content is available
// This could be a banner or button to "Show new transcriptions"
withAnimation {
// Reset pagination to show the latest content
Task {
await resetPagination()
await loadInitialContent()
}
}
}
}
}
}
@ -305,17 +323,6 @@ struct TranscriptionHistoryView: View {
}
}
    // Refreshes the visible list when the @Query publishes new transcriptions.
    private func handleNewTranscriptions(_ newTranscriptions: [Transcription]) {
    // Only update if we're on the first page and not searching
    // Only check the first item since we only care about the newest one
    // Guarding on lastTimestamp == nil avoids clobbering a paginated scroll
    // position, and searchText.isEmpty avoids clobbering search results.
    if lastTimestamp == nil && searchText.isEmpty && !newTranscriptions.isEmpty {
    Task {
    await loadInitialContent()
    }
    }
    }
// Modified function to select all transcriptions in the database
private func selectAllTranscriptions() async {
do {

View File

@ -60,7 +60,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
let modelsDirectory: URL
let recordingsDirectory: URL
private let enhancementService: AIEnhancementService?
let enhancementService: AIEnhancementService?
private let licenseViewModel: LicenseViewModel
private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "WhisperState")
private var transcriptionStartTime: Date?