Transcribe Audio Files

This commit is contained in:
Beingpax 2025-03-17 17:04:29 +05:45
parent 8b1e27e1cd
commit 4ceccb0990
8 changed files with 961 additions and 358 deletions

View File

@ -468,7 +468,7 @@
"CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development";
CODE_SIGN_STYLE = Automatic;
COMBINE_HIDPI_IMAGES = YES;
CURRENT_PROJECT_VERSION = 0.99;
CURRENT_PROJECT_VERSION = 112;
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
DEVELOPMENT_TEAM = V6J6A3VWY2;
ENABLE_HARDENED_RUNTIME = YES;
@ -483,7 +483,7 @@
"@executable_path/../Frameworks",
);
MACOSX_DEPLOYMENT_TARGET = 14.0;
MARKETING_VERSION = 0.99;
MARKETING_VERSION = 1.12;
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
@ -501,7 +501,7 @@
"CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development";
CODE_SIGN_STYLE = Automatic;
COMBINE_HIDPI_IMAGES = YES;
CURRENT_PROJECT_VERSION = 0.99;
CURRENT_PROJECT_VERSION = 112;
DEVELOPMENT_ASSET_PATHS = "\"VoiceInk/Preview Content\"";
DEVELOPMENT_TEAM = V6J6A3VWY2;
ENABLE_HARDENED_RUNTIME = YES;
@ -516,7 +516,7 @@
"@executable_path/../Frameworks",
);
MACOSX_DEPLOYMENT_TARGET = 14.0;
MARKETING_VERSION = 0.99;
MARKETING_VERSION = 1.12;
PRODUCT_BUNDLE_IDENTIFIER = com.prakashjoshipax.VoiceInk;
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;

View File

@ -0,0 +1,176 @@
import Foundation
import AVFoundation
import os
/// Converts arbitrary audio files into the sample format Whisper expects:
/// 16 kHz, mono, Float32 samples peak-normalized to [-1, 1].
class AudioProcessor {
    private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "AudioProcessor")

    /// Target audio format required by the Whisper transcription engine.
    struct AudioFormat {
        static let targetSampleRate: Double = 16000.0
        static let targetChannels: UInt32 = 1
        static let targetBitDepth: UInt32 = 16
    }

    /// Errors surfaced to callers; `errorDescription` provides user-facing text.
    enum AudioProcessingError: LocalizedError {
        case invalidAudioFile
        case conversionFailed
        case exportFailed
        case unsupportedFormat
        case sampleExtractionFailed

        var errorDescription: String? {
            switch self {
            case .invalidAudioFile:
                return "The audio file is invalid or corrupted"
            case .conversionFailed:
                return "Failed to convert the audio format"
            case .exportFailed:
                return "Failed to export the processed audio"
            case .unsupportedFormat:
                return "The audio format is not supported"
            case .sampleExtractionFailed:
                return "Failed to extract audio samples"
            }
        }
    }

    /// Process audio file and return samples ready for Whisper.
    /// - Parameter url: URL of the input audio file
    /// - Returns: Array of normalized float samples (16 kHz mono, range [-1, 1])
    /// - Throws: `AudioProcessingError` when the file cannot be read or converted
    func processAudioToSamples(_ url: URL) async throws -> [Float] {
        logger.notice("🎵 Processing audio file to samples: \(url.lastPathComponent)")

        // Create AVAudioFile from input
        guard let audioFile = try? AVAudioFile(forReading: url) else {
            logger.error("❌ Failed to create AVAudioFile from input")
            throw AudioProcessingError.invalidAudioFile
        }

        // Get format information
        let format = audioFile.processingFormat
        let sampleRate = format.sampleRate
        let channels = format.channelCount
        logger.notice("📊 Input format - Sample Rate: \(sampleRate), Channels: \(channels)")

        // Create output format (always 16kHz mono float)
        let outputFormat = AVAudioFormat(
            commonFormat: .pcmFormatFloat32,
            sampleRate: AudioFormat.targetSampleRate,
            channels: AudioFormat.targetChannels,
            interleaved: false
        )
        guard let outputFormat = outputFormat else {
            logger.error("❌ Failed to create output format")
            throw AudioProcessingError.unsupportedFormat
        }

        // Read the whole input file into a single buffer.
        // NOTE(review): this loads the entire file into memory; very long
        // recordings may warrant chunked reads — confirm expected input sizes.
        let inputBuffer = AVAudioPCMBuffer(
            pcmFormat: format,
            frameCapacity: AVAudioFrameCount(audioFile.length)
        )
        guard let inputBuffer = inputBuffer else {
            logger.error("❌ Failed to create input buffer")
            throw AudioProcessingError.conversionFailed
        }
        try audioFile.read(into: inputBuffer)

        // If format matches our target, just convert to samples
        if sampleRate == AudioFormat.targetSampleRate && channels == AudioFormat.targetChannels {
            logger.notice("✅ Audio format already matches requirements")
            return convertToWhisperFormat(inputBuffer)
        }

        // Create converter for format conversion
        guard let converter = AVAudioConverter(from: format, to: outputFormat) else {
            logger.error("❌ Failed to create audio converter")
            throw AudioProcessingError.conversionFailed
        }

        // Size the output buffer for the resampling ratio, rounding up so the
        // final partial frame is not truncated by the integer conversion.
        let ratio = AudioFormat.targetSampleRate / sampleRate
        let outputBuffer = AVAudioPCMBuffer(
            pcmFormat: outputFormat,
            frameCapacity: AVAudioFrameCount((Double(inputBuffer.frameLength) * ratio).rounded(.up))
        )
        guard let outputBuffer = outputBuffer else {
            logger.error("❌ Failed to create output buffer")
            throw AudioProcessingError.conversionFailed
        }

        // Perform conversion. The input block must hand the converter the input
        // buffer exactly once and then report .endOfStream; unconditionally
        // returning .haveData with the same buffer lets the converter pull the
        // same audio again if it asks for more input.
        var inputConsumed = false
        var error: NSError?
        let status = converter.convert(
            to: outputBuffer,
            error: &error,
            withInputFrom: { _, outStatus in
                if inputConsumed {
                    outStatus.pointee = .endOfStream
                    return nil
                }
                inputConsumed = true
                outStatus.pointee = .haveData
                return inputBuffer
            }
        )
        if let error = error {
            logger.error("❌ Conversion failed: \(error.localizedDescription)")
            throw AudioProcessingError.conversionFailed
        }
        if status == .error {
            logger.error("❌ Conversion failed with status: error")
            throw AudioProcessingError.conversionFailed
        }

        logger.notice("✅ Successfully converted audio format")
        return convertToWhisperFormat(outputBuffer)
    }

    /// Convert an audio buffer to Whisper-compatible samples: downmix to mono
    /// by averaging channels, then peak-normalize to [-1, 1].
    private func convertToWhisperFormat(_ buffer: AVAudioPCMBuffer) -> [Float] {
        guard let channelData = buffer.floatChannelData else {
            logger.error("❌ No channel data available in buffer")
            return []
        }
        let channelCount = Int(buffer.format.channelCount)
        let frameLength = Int(buffer.frameLength)
        var samples = Array(repeating: Float(0), count: frameLength)
        logger.notice("📊 Converting buffer - Channels: \(channelCount), Frames: \(frameLength)")

        if channelCount == 1 {
            // Mono: copy the channel's samples directly.
            samples = Array(UnsafeBufferPointer(start: channelData[0], count: frameLength))
            logger.notice("✅ Copied mono samples directly")
        } else {
            // Stereo or more: average all channels per frame.
            logger.notice("🔄 Converting \(channelCount) channels to mono")
            for frame in 0..<frameLength {
                var sum: Float = 0
                for channel in 0..<channelCount {
                    sum += channelData[channel][frame]
                }
                samples[frame] = sum / Float(channelCount)
            }
        }

        // Peak-normalize to [-1, 1]; silent buffers (max 0) are left untouched.
        let maxSample = samples.map(abs).max() ?? 1
        if maxSample > 0 {
            logger.notice("📈 Normalizing samples with max amplitude: \(maxSample)")
            samples = samples.map { $0 / maxSample }
        }
        // Log sample statistics
        if let min = samples.min(), let max = samples.max() {
            logger.notice("📊 Final sample range: [\(min), \(max)]")
        }
        logger.notice("✅ Successfully converted \(samples.count) samples")
        return samples
    }
}

View File

@ -0,0 +1,189 @@
import Foundation
import SwiftUI
import AVFoundation
import SwiftData
import os
/// Drives the transcribe-an-audio-file flow: loads the Whisper model,
/// converts the file to samples, transcribes, optionally enhances the text,
/// and persists a `Transcription` record. All published state is main-actor
/// bound so SwiftUI views can observe it directly.
@MainActor
class AudioTranscriptionManager: ObservableObject {
    static let shared = AudioTranscriptionManager()

    @Published var isProcessing = false
    @Published var processingPhase: ProcessingPhase = .idle
    @Published var currentTranscription: Transcription?
    @Published var messageLog: String = ""
    @Published var errorMessage: String?

    private var currentTask: Task<Void, Error>?
    private var whisperContext: WhisperContext?
    private let audioProcessor = AudioProcessor()
    private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "AudioTranscriptionManager")

    /// UI-visible stage of the pipeline; `message` is shown in the progress view.
    enum ProcessingPhase {
        case idle
        case loading
        case processingAudio
        case transcribing
        case enhancing
        case completed

        var message: String {
            switch self {
            case .idle:
                return ""
            case .loading:
                return "Loading transcription model..."
            case .processingAudio:
                return "Processing audio file for transcription..."
            case .transcribing:
                return "Transcribing audio..."
            case .enhancing:
                return "Enhancing transcription with AI..."
            case .completed:
                return "Transcription completed!"
            }
        }
    }

    private init() {}

    /// Starts the full transcription pipeline for `url`. Any in-flight run is
    /// cancelled first, so at most one transcription is active at a time.
    func startProcessing(url: URL, modelContext: ModelContext, whisperState: WhisperState) {
        // Cancel any existing processing before resetting state for this run.
        cancelProcessing()
        isProcessing = true
        processingPhase = .loading
        messageLog = ""
        errorMessage = nil

        currentTask = Task {
            do {
                guard let currentModel = whisperState.currentModel else {
                    throw TranscriptionError.noModelSelected
                }

                // Load Whisper model
                whisperContext = try await WhisperContext.createContext(path: currentModel.url.path)
                try Task.checkCancellation()

                // Process audio file into 16 kHz mono samples
                processingPhase = .processingAudio
                let samples = try await audioProcessor.processAudioToSamples(url)
                try Task.checkCancellation()

                // Get audio duration (async loading API on macOS 13+)
                let audioAsset = AVURLAsset(url: url)
                var duration: TimeInterval = 0
                if #available(macOS 13.0, *) {
                    let durationValue = try await audioAsset.load(.duration)
                    duration = CMTimeGetSeconds(durationValue)
                } else {
                    duration = CMTimeGetSeconds(audioAsset.duration)
                }

                // Create a permanent copy of the audio file so the history view
                // can replay it after the original (possibly temporary) URL is gone.
                let recordingsDirectory = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
                    .appendingPathComponent("com.prakashjoshipax.VoiceInk")
                    .appendingPathComponent("Recordings")
                let fileName = "transcribed_\(UUID().uuidString).wav"
                let permanentURL = recordingsDirectory.appendingPathComponent(fileName)
                try FileManager.default.createDirectory(at: recordingsDirectory, withIntermediateDirectories: true)
                try FileManager.default.copyItem(at: url, to: permanentURL)

                // Transcribe
                processingPhase = .transcribing
                await whisperContext?.setPrompt(whisperState.whisperPrompt.transcriptionPrompt)
                try await whisperContext?.fullTranscribe(samples: samples)
                var text = await whisperContext?.getTranscription() ?? ""
                text = text.trimmingCharacters(in: .whitespacesAndNewlines)
                try Task.checkCancellation()

                // Optionally enhance; on enhancement failure fall back to the
                // raw transcription rather than failing the whole run.
                if let enhancementService = whisperState.enhancementService,
                   enhancementService.isEnhancementEnabled,
                   enhancementService.isConfigured {
                    processingPhase = .enhancing
                    do {
                        let enhancedText = try await enhancementService.enhance(text)
                        try saveTranscription(
                            text: text,
                            duration: duration,
                            enhancedText: enhancedText,
                            audioFileURL: permanentURL.absoluteString,
                            modelContext: modelContext
                        )
                    } catch {
                        logger.error("Enhancement failed: \(error.localizedDescription)")
                        messageLog += "Enhancement failed: \(error.localizedDescription). Using original transcription.\n"
                        try saveTranscription(
                            text: text,
                            duration: duration,
                            audioFileURL: permanentURL.absoluteString,
                            modelContext: modelContext
                        )
                    }
                } else {
                    try saveTranscription(
                        text: text,
                        duration: duration,
                        audioFileURL: permanentURL.absoluteString,
                        modelContext: modelContext
                    )
                }

                processingPhase = .completed
                // Let the "completed" message stay visible briefly before resetting.
                try? await Task.sleep(nanoseconds: 1_500_000_000)
                await finishProcessing()
            } catch {
                await handleError(error)
            }
        }
    }

    /// Cancels any in-flight transcription and resets the published state so
    /// the UI does not stay stuck in a processing phase.
    func cancelProcessing() {
        currentTask?.cancel()
        currentTask = nil
        isProcessing = false
        processingPhase = .idle
        cleanupResources()
    }

    /// Persists a new `Transcription`, saves the context, and publishes it.
    private func saveTranscription(
        text: String,
        duration: TimeInterval,
        enhancedText: String? = nil,
        audioFileURL: String,
        modelContext: ModelContext
    ) throws {
        let transcription: Transcription
        if let enhancedText = enhancedText {
            transcription = Transcription(
                text: text,
                duration: duration,
                enhancedText: enhancedText,
                audioFileURL: audioFileURL
            )
        } else {
            transcription = Transcription(
                text: text,
                duration: duration,
                audioFileURL: audioFileURL
            )
        }
        modelContext.insert(transcription)
        try modelContext.save()
        currentTranscription = transcription
    }

    /// Resets state after a successful run.
    private func finishProcessing() {
        isProcessing = false
        processingPhase = .idle
        currentTask = nil
        cleanupResources()
    }

    /// Resets state after a failure. Cancellation is not surfaced as an error
    /// to the user — it is an expected outcome of `cancelProcessing()`.
    private func handleError(_ error: Error) {
        if error is CancellationError {
            logger.notice("Transcription task cancelled")
        } else {
            logger.error("Transcription error: \(error.localizedDescription)")
            errorMessage = error.localizedDescription
            messageLog += "Error: \(error.localizedDescription)\n"
        }
        isProcessing = false
        processingPhase = .idle
        currentTask = nil
        cleanupResources()
    }

    /// Releases the Whisper context (the only heavyweight resource held here).
    private func cleanupResources() {
        whisperContext = nil
    }
}
/// Failures specific to the audio-file transcription flow.
/// Conforms to `LocalizedError` so SwiftUI alerts show a readable message.
enum TranscriptionError: Error, LocalizedError {
    case noModelSelected
    case transcriptionCancelled

    var errorDescription: String? {
        let description: String
        switch self {
        case .noModelSelected:
            description = "No transcription model selected"
        case .transcriptionCancelled:
            description = "Transcription was cancelled"
        }
        return description
    }
}

View File

@ -2,44 +2,36 @@ import SwiftUI
import AVFoundation
class WaveformGenerator {
static func generateWaveformSamples(from url: URL, sampleCount: Int = 200) -> [Float] {
static func generateWaveformSamples(from url: URL, sampleCount: Int = 200) async -> [Float] {
guard let audioFile = try? AVAudioFile(forReading: url) else { return [] }
let format = audioFile.processingFormat
// Calculate frame count and read size
let frameCount = UInt32(audioFile.length)
let samplesPerFrame = frameCount / UInt32(sampleCount)
var samples = [Float](repeating: 0.0, count: sampleCount)
let stride = max(1, Int(frameCount) / sampleCount)
let bufferSize = min(UInt32(4096), frameCount)
guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else { return [] }
guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: bufferSize) else { return [] }
do {
try audioFile.read(into: buffer)
var maxValues = [Float](repeating: 0.0, count: sampleCount)
var sampleIndex = 0
var framePosition: AVAudioFramePosition = 0
// Get the raw audio data
guard let channelData = buffer.floatChannelData?[0] else { return [] }
// Process the samples
for i in 0..<sampleCount {
let startFrame = UInt32(i) * samplesPerFrame
let endFrame = min(startFrame + samplesPerFrame, frameCount)
var maxAmplitude: Float = 0.0
while sampleIndex < sampleCount && framePosition < AVAudioFramePosition(frameCount) {
audioFile.framePosition = framePosition
try audioFile.read(into: buffer)
// Find the highest amplitude in this segment
for frame in startFrame..<endFrame {
let amplitude = abs(channelData[Int(frame)])
maxAmplitude = max(maxAmplitude, amplitude)
if let channelData = buffer.floatChannelData?[0], buffer.frameLength > 0 {
maxValues[sampleIndex] = abs(channelData[0])
sampleIndex += 1
}
samples[i] = maxAmplitude
framePosition += AVAudioFramePosition(stride)
}
// Normalize the samples
if let maxSample = samples.max(), maxSample > 0 {
samples = samples.map { $0 / maxSample }
if let maxSample = maxValues.max(), maxSample > 0 {
return maxValues.map { $0 / maxSample }
}
return samples
return maxValues
} catch {
print("Error reading audio file: \(error)")
return []
@ -49,19 +41,27 @@ class WaveformGenerator {
class AudioPlayerManager: ObservableObject {
private var audioPlayer: AVAudioPlayer?
private var timer: Timer?
@Published var isPlaying = false
@Published var currentTime: TimeInterval = 0
@Published var duration: TimeInterval = 0
@Published var waveformSamples: [Float] = []
private var timer: Timer?
@Published var isLoadingWaveform = false
func loadAudio(from url: URL) {
do {
audioPlayer = try AVAudioPlayer(contentsOf: url)
audioPlayer?.prepareToPlay()
duration = audioPlayer?.duration ?? 0
// Generate waveform data
waveformSamples = WaveformGenerator.generateWaveformSamples(from: url)
isLoadingWaveform = true
Task {
let samples = await WaveformGenerator.generateWaveformSamples(from: url)
await MainActor.run {
self.waveformSamples = samples
self.isLoadingWaveform = false
}
}
} catch {
print("Error loading audio: \(error.localizedDescription)")
}
@ -109,6 +109,7 @@ struct WaveformView: View {
let samples: [Float]
let currentTime: TimeInterval
let duration: TimeInterval
let isLoading: Bool
var onSeek: (Double) -> Void
@State private var isHovering = false
@State private var hoverLocation: CGFloat = 0
@ -116,70 +117,72 @@ struct WaveformView: View {
var body: some View {
GeometryReader { geometry in
ZStack(alignment: .leading) {
// Removed the glass-morphic background and its overlays
// Waveform container
HStack(spacing: 1) {
ForEach(0..<samples.count, id: \.self) { index in
WaveformBar(
sample: samples[index],
isPlayed: CGFloat(index) / CGFloat(samples.count) <= CGFloat(currentTime / duration),
totalBars: samples.count,
geometryWidth: geometry.size.width,
isHovering: isHovering,
hoverProgress: hoverLocation / geometry.size.width
)
if isLoading {
VStack {
ProgressView()
.controlSize(.small)
Text("Generating waveform...")
.font(.system(size: 12))
.foregroundColor(.secondary)
}
}
.frame(maxHeight: .infinity)
.padding(.horizontal, 2)
// Hover time indicator
if isHovering {
// Time bubble
Text(formatTime(duration * Double(hoverLocation / geometry.size.width)))
.font(.system(size: 12, weight: .medium))
.monospacedDigit()
.foregroundColor(.white)
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(
Capsule()
.fill(Color.accentColor)
.shadow(color: Color.black.opacity(0.1), radius: 3, x: 0, y: 2)
)
.offset(x: max(0, min(hoverLocation - 30, geometry.size.width - 60)))
.offset(y: -30)
.frame(maxWidth: .infinity, maxHeight: .infinity)
} else {
HStack(spacing: 1) {
ForEach(0..<samples.count, id: \.self) { index in
WaveformBar(
sample: samples[index],
isPlayed: CGFloat(index) / CGFloat(samples.count) <= CGFloat(currentTime / duration),
totalBars: samples.count,
geometryWidth: geometry.size.width,
isHovering: isHovering,
hoverProgress: hoverLocation / geometry.size.width
)
}
}
.frame(maxHeight: .infinity)
.padding(.horizontal, 2)
// Progress line
Rectangle()
.fill(Color.accentColor)
.frame(width: 2)
.frame(maxHeight: .infinity)
.offset(x: hoverLocation)
.transition(.opacity)
if isHovering {
Text(formatTime(duration * Double(hoverLocation / geometry.size.width)))
.font(.system(size: 12, weight: .medium))
.monospacedDigit()
.foregroundColor(.white)
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(Capsule().fill(Color.accentColor))
.offset(x: max(0, min(hoverLocation - 30, geometry.size.width - 60)))
.offset(y: -30)
Rectangle()
.fill(Color.accentColor)
.frame(width: 2)
.frame(maxHeight: .infinity)
.offset(x: hoverLocation)
}
}
}
.contentShape(Rectangle())
.gesture(
DragGesture(minimumDistance: 0)
.onChanged { value in
hoverLocation = value.location.x
let progress = max(0, min(value.location.x / geometry.size.width, 1))
onSeek(Double(progress) * duration)
if !isLoading {
hoverLocation = value.location.x
onSeek(Double(value.location.x / geometry.size.width) * duration)
}
}
)
.onHover { hovering in
withAnimation(.easeInOut(duration: 0.2)) {
isHovering = hovering
if !isLoading {
withAnimation(.easeInOut(duration: 0.2)) {
isHovering = hovering
}
}
}
.onContinuousHover { phase in
switch phase {
case .active(let location):
hoverLocation = location.x
case .ended:
break
if !isLoading {
if case .active(let location) = phase {
hoverLocation = location.x
}
}
}
}
@ -201,12 +204,8 @@ struct WaveformBar: View {
let isHovering: Bool
let hoverProgress: CGFloat
private var barProgress: CGFloat {
CGFloat(sample)
}
private var isNearHover: Bool {
let barPosition = CGFloat(geometryWidth) / CGFloat(totalBars)
let barPosition = geometryWidth / CGFloat(totalBars)
let hoverPosition = hoverProgress * geometryWidth
return abs(barPosition - hoverPosition) < 20
}
@ -215,17 +214,17 @@ struct WaveformBar: View {
Capsule()
.fill(
LinearGradient(
gradient: Gradient(colors: [
colors: [
isPlayed ? Color.accentColor : Color.accentColor.opacity(0.3),
isPlayed ? Color.accentColor.opacity(0.8) : Color.accentColor.opacity(0.2)
]),
],
startPoint: .bottom,
endPoint: .top
)
)
.frame(
width: max((geometryWidth / CGFloat(totalBars)) - 1, 1),
height: max(barProgress * 40, 3)
height: max(CGFloat(sample) * 40, 3)
)
.scaleEffect(y: isHovering && isNearHover ? 1.2 : 1.0)
.animation(.interpolatingSpring(stiffness: 300, damping: 15), value: isHovering && isNearHover)
@ -236,27 +235,19 @@ struct AudioPlayerView: View {
let url: URL
@StateObject private var playerManager = AudioPlayerManager()
@State private var isHovering = false
@State private var showingTooltip = false
@State private var isRetranscribing = false
@State private var showRetranscribeSuccess = false
@State private var showRetranscribeError = false
@State private var errorMessage = ""
// Add environment objects for retranscription
@EnvironmentObject private var whisperState: WhisperState
@Environment(\.modelContext) private var modelContext
// Create the audio transcription service lazily
private var transcriptionService: AudioTranscriptionService {
AudioTranscriptionService(
modelContext: modelContext,
whisperState: whisperState
)
AudioTranscriptionService(modelContext: modelContext, whisperState: whisperState)
}
var body: some View {
VStack(spacing: 16) {
// Title and duration
HStack {
HStack(spacing: 6) {
Image(systemName: "waveform")
@ -274,21 +265,16 @@ struct AudioPlayerView: View {
.foregroundColor(.secondary)
}
// Waveform and controls container
VStack(spacing: 16) {
// Waveform
WaveformView(
samples: playerManager.waveformSamples,
currentTime: playerManager.currentTime,
duration: playerManager.duration,
onSeek: { time in
playerManager.seek(to: time)
}
isLoading: playerManager.isLoadingWaveform,
onSeek: { playerManager.seek(to: $0) }
)
// Controls
HStack(spacing: 20) {
// Play/Pause button
Button(action: {
if playerManager.isPlaying {
playerManager.pause()
@ -314,10 +300,7 @@ struct AudioPlayerView: View {
}
}
// Add Retranscribe button
Button(action: {
retranscribeAudio()
}) {
Button(action: retranscribeAudio) {
Circle()
.fill(Color.green.opacity(0.1))
.frame(width: 44, height: 44)
@ -342,7 +325,6 @@ struct AudioPlayerView: View {
.disabled(isRetranscribing)
.help("Retranscribe this audio")
// Time
Text(formatTime(playerManager.currentTime))
.font(.system(size: 14, weight: .medium))
.monospacedDigit()
@ -356,7 +338,6 @@ struct AudioPlayerView: View {
playerManager.loadAudio(from: url)
}
.overlay(
// Success notification
VStack {
if showRetranscribeSuccess {
HStack(spacing: 8) {
@ -370,10 +351,7 @@ struct AudioPlayerView: View {
.background(
RoundedRectangle(cornerRadius: 8)
.fill(Color.green.opacity(0.1))
.overlay(
RoundedRectangle(cornerRadius: 8)
.stroke(Color.green.opacity(0.2), lineWidth: 1)
)
.stroke(Color.green.opacity(0.2), lineWidth: 1)
)
.transition(.move(edge: .top).combined(with: .opacity))
}
@ -390,10 +368,7 @@ struct AudioPlayerView: View {
.background(
RoundedRectangle(cornerRadius: 8)
.fill(Color.red.opacity(0.1))
.overlay(
RoundedRectangle(cornerRadius: 8)
.stroke(Color.red.opacity(0.2), lineWidth: 1)
)
.stroke(Color.red.opacity(0.2), lineWidth: 1)
)
.transition(.move(edge: .top).combined(with: .opacity))
}
@ -416,12 +391,8 @@ struct AudioPlayerView: View {
guard let currentModel = whisperState.currentModel else {
errorMessage = "No transcription model selected"
showRetranscribeError = true
// Hide error after 3 seconds
DispatchQueue.main.asyncAfter(deadline: .now() + 3) {
withAnimation {
showRetranscribeError = false
}
withAnimation { showRetranscribeError = false }
}
return
}
@ -430,22 +401,12 @@ struct AudioPlayerView: View {
Task {
do {
// Use the AudioTranscriptionService to retranscribe the audio
let _ = try await transcriptionService.retranscribeAudio(
from: url,
using: currentModel
)
// Show success notification
let _ = try await transcriptionService.retranscribeAudio(from: url, using: currentModel)
await MainActor.run {
isRetranscribing = false
showRetranscribeSuccess = true
// Hide success after 3 seconds
DispatchQueue.main.asyncAfter(deadline: .now() + 3) {
withAnimation {
showRetranscribeSuccess = false
}
withAnimation { showRetranscribeSuccess = false }
}
}
} catch {
@ -453,12 +414,8 @@ struct AudioPlayerView: View {
isRetranscribing = false
errorMessage = error.localizedDescription
showRetranscribeError = true
// Hide error after 3 seconds
DispatchQueue.main.asyncAfter(deadline: .now() + 3) {
withAnimation {
showRetranscribeError = false
}
withAnimation { showRetranscribeError = false }
}
}
}

View File

@ -0,0 +1,283 @@
import SwiftUI
import SwiftData
import UniformTypeIdentifiers
import AVFoundation
/// Screen for transcribing an existing audio file: the user drops or picks a
/// file, optionally enables AI enhancement with a prompt, and starts the
/// pipeline driven by `AudioTranscriptionManager`. The result (enhanced and/or
/// original text plus duration) is shown below a divider.
struct AudioTranscribeView: View {
    @Environment(\.modelContext) private var modelContext
    @EnvironmentObject private var whisperState: WhisperState
    // Shared singleton: processing state survives navigation away from this view.
    @StateObject private var transcriptionManager = AudioTranscriptionManager.shared
    @State private var isDropTargeted = false
    @State private var selectedAudioURL: URL?
    @State private var isAudioFileSelected = false
    // Local mirrors of the enhancement service's settings (synced in onAppear).
    @State private var isEnhancementEnabled = false
    @State private var selectedPromptId: UUID?

    var body: some View {
        VStack(spacing: 0) {
            // Swap between the progress UI and the file-selection UI.
            if transcriptionManager.isProcessing {
                processingView
            } else {
                dropZoneView
            }
            Divider()
                .padding(.vertical)
            // Show current transcription result
            if let transcription = transcriptionManager.currentTranscription {
                ScrollView {
                    VStack(alignment: .leading, spacing: 16) {
                        Text("Transcription Result")
                            .font(.headline)
                        // When enhancement ran, show both versions; otherwise just the raw text.
                        if let enhancedText = transcription.enhancedText {
                            VStack(alignment: .leading, spacing: 8) {
                                Text("Enhanced")
                                    .font(.subheadline)
                                    .foregroundColor(.secondary)
                                Text(enhancedText)
                                    .textSelection(.enabled)
                            }
                            Divider()
                            VStack(alignment: .leading, spacing: 8) {
                                Text("Original")
                                    .font(.subheadline)
                                    .foregroundColor(.secondary)
                                Text(transcription.text)
                                    .textSelection(.enabled)
                            }
                        } else {
                            Text(transcription.text)
                                .textSelection(.enabled)
                        }
                        HStack {
                            Text("Duration: \(formatDuration(transcription.duration))")
                                .font(.caption)
                                .foregroundColor(.secondary)
                            Spacer()
                        }
                    }
                    .padding()
                }
            }
        }
        // NOTE(review): `.constant(...)` makes a non-settable binding; dismissal
        // relies solely on the OK button clearing `errorMessage`. A computed
        // Binding would let the system dismiss the alert too — confirm intent.
        .alert("Error", isPresented: .constant(transcriptionManager.errorMessage != nil)) {
            Button("OK", role: .cancel) {
                transcriptionManager.errorMessage = nil
            }
        } message: {
            if let errorMessage = transcriptionManager.errorMessage {
                Text(errorMessage)
            }
        }
    }

    /// File-selection UI: either the selected-file panel with enhancement
    /// options and action buttons, or the dashed drop target.
    private var dropZoneView: some View {
        VStack(spacing: 16) {
            if isAudioFileSelected {
                VStack(spacing: 16) {
                    Text("Audio file selected: \(selectedAudioURL?.lastPathComponent ?? "")")
                        .font(.headline)
                    // AI Enhancement Settings
                    if let enhancementService = whisperState.getEnhancementService() {
                        VStack(spacing: 16) {
                            // AI Enhancement and Prompt in the same row
                            HStack(spacing: 16) {
                                Toggle("AI Enhancement", isOn: $isEnhancementEnabled)
                                    .toggleStyle(.switch)
                                    // Push the local toggle state back to the service.
                                    // NOTE(review): single-parameter onChange — presumably
                                    // targeting the pre-macOS-14 overload; confirm deployment target.
                                    .onChange(of: isEnhancementEnabled) { newValue in
                                        enhancementService.isEnhancementEnabled = newValue
                                    }
                                if isEnhancementEnabled {
                                    Divider()
                                        .frame(height: 20)
                                    // Prompt Selection
                                    HStack(spacing: 8) {
                                        Text("Prompt:")
                                            .font(.subheadline)
                                        Menu {
                                            ForEach(enhancementService.allPrompts) { prompt in
                                                Button {
                                                    enhancementService.setActivePrompt(prompt)
                                                    selectedPromptId = prompt.id
                                                } label: {
                                                    HStack {
                                                        Image(systemName: prompt.icon.rawValue)
                                                            .foregroundColor(.accentColor)
                                                        Text(prompt.title)
                                                        // Checkmark marks the active prompt.
                                                        if selectedPromptId == prompt.id {
                                                            Spacer()
                                                            Image(systemName: "checkmark")
                                                        }
                                                    }
                                                }
                                            }
                                        } label: {
                                            HStack {
                                                Text(enhancementService.allPrompts.first(where: { $0.id == selectedPromptId })?.title ?? "Select Prompt")
                                                    .foregroundColor(.primary)
                                                Image(systemName: "chevron.down")
                                                    .font(.caption)
                                            }
                                            .padding(.horizontal, 8)
                                            .padding(.vertical, 4)
                                            .background(
                                                RoundedRectangle(cornerRadius: 6)
                                                    .fill(Color(.controlBackgroundColor))
                                            )
                                        }
                                        .fixedSize()
                                        .disabled(!isEnhancementEnabled)
                                    }
                                }
                            }
                        }
                        .padding(.horizontal, 12)
                        .padding(.vertical, 8)
                        .background(
                            RoundedRectangle(cornerRadius: 8)
                                .fill(Color(.windowBackgroundColor).opacity(0.4))
                        )
                    }
                    .frame(maxWidth: .infinity, alignment: .center)
                    .onAppear {
                        // Initialize local state from enhancement service
                        isEnhancementEnabled = enhancementService.isEnhancementEnabled
                        selectedPromptId = enhancementService.selectedPromptId
                    }
                }
                // Action Buttons in a row
                HStack(spacing: 12) {
                    Button("Start Transcription") {
                        if let url = selectedAudioURL {
                            transcriptionManager.startProcessing(
                                url: url,
                                modelContext: modelContext,
                                whisperState: whisperState
                            )
                        }
                    }
                    .buttonStyle(.borderedProminent)
                    Button("Choose Different File") {
                        selectedAudioURL = nil
                        isAudioFileSelected = false
                    }
                    .buttonStyle(.bordered)
                }
            }
            .padding()
        } else {
            ZStack {
                // Dashed drop target; highlights blue while a drag hovers over it.
                RoundedRectangle(cornerRadius: 12)
                    .fill(Color(.windowBackgroundColor).opacity(0.4))
                    .overlay(
                        RoundedRectangle(cornerRadius: 12)
                            .strokeBorder(
                                style: StrokeStyle(
                                    lineWidth: 2,
                                    dash: [8]
                                )
                            )
                            .foregroundColor(isDropTargeted ? .blue : .gray.opacity(0.5))
                    )
                VStack(spacing: 16) {
                    Image(systemName: "arrow.down.doc")
                        .font(.system(size: 32))
                        .foregroundColor(isDropTargeted ? .blue : .gray)
                    Text("Drop audio file here")
                        .font(.headline)
                    Text("or")
                        .foregroundColor(.secondary)
                    Button("Choose File") {
                        selectFile()
                    }
                    .buttonStyle(.bordered)
                }
                .padding(32)
            }
            .frame(height: 200)
            .padding(.horizontal)
        }
        Text("Supported formats: WAV, MP3, M4A, AIFF")
            .font(.caption)
            .foregroundColor(.secondary)
    }
    .padding()
    // NOTE(review): returns true unconditionally, accepting the drop even when
    // handleDroppedFile later rejects a non-audio provider — confirm intent.
    .onDrop(of: [.audio, .fileURL], isTargeted: $isDropTargeted) { providers in
        Task {
            await handleDroppedFile(providers)
        }
        return true
    }
    }

    /// Spinner plus the manager's current phase message and accumulated log.
    private var processingView: some View {
        VStack(spacing: 16) {
            ProgressView()
                .scaleEffect(0.8)
            Text(transcriptionManager.processingPhase.message)
                .font(.headline)
            Text(transcriptionManager.messageLog)
                .font(.caption)
                .foregroundColor(.secondary)
                .multilineTextAlignment(.center)
        }
        .padding()
    }

    /// Presents a modal open panel restricted to supported audio types and
    /// records the chosen file.
    private func selectFile() {
        let panel = NSOpenPanel()
        panel.allowsMultipleSelection = false
        panel.canChooseDirectories = false
        panel.canChooseFiles = true
        panel.allowedContentTypes = [
            .audio,
            .wav,
            .mp3,
            .mpeg4Audio,
            .aiff
        ]
        if panel.runModal() == .OK {
            if let url = panel.url {
                selectedAudioURL = url
                isAudioFileSelected = true
            }
        }
    }

    /// Extracts a file URL from the first dropped provider, if it is audio.
    /// NOTE(review): this mixes `await` with the completion-handler form of
    /// `loadItem`; the async overload takes no closure — verify this compiles
    /// against the intended overload and that the completion actually fires.
    private func handleDroppedFile(_ providers: [NSItemProvider]) async {
        guard let provider = providers.first else { return }
        if provider.hasItemConformingToTypeIdentifier(UTType.audio.identifier) {
            try? await provider.loadItem(forTypeIdentifier: UTType.audio.identifier) { item, error in
                if let url = item as? URL {
                    // Hop to the main actor before touching view state.
                    Task { @MainActor in
                        selectedAudioURL = url
                        isAudioFileSelected = true
                    }
                }
            }
        }
    }

    /// Formats a duration as "m:ss" for display.
    private func formatDuration(_ duration: TimeInterval) -> String {
        let minutes = Int(duration) / 60
        let seconds = Int(duration) % 60
        return String(format: "%d:%02d", minutes, seconds)
    }
}

View File

@ -6,6 +6,7 @@ import KeyboardShortcuts
enum ViewType: String, CaseIterable {
case metrics = "Dashboard"
case record = "Record Audio"
case transcribeAudio = "Transcribe Audio"
case history = "History"
case models = "AI Models"
case enhancement = "Enhancement"
@ -21,6 +22,7 @@ enum ViewType: String, CaseIterable {
switch self {
case .metrics: return "gauge.medium"
case .record: return "mic.circle.fill"
case .transcribeAudio: return "waveform.circle.fill"
case .history: return "doc.text.fill"
case .models: return "brain.head.profile"
case .enhancement: return "wand.and.stars"
@ -243,6 +245,8 @@ struct ContentView: View {
EnhancementSettingsView()
case .record:
RecordView()
case .transcribeAudio:
AudioTranscribeView()
case .history:
TranscriptionHistoryView()
case .audioInput:

View File

@ -0,0 +1,189 @@
import SwiftUI
import SwiftData
struct TranscriptionCard: View {
let transcription: Transcription
let isExpanded: Bool
let isSelected: Bool
let onDelete: () -> Void
let onToggleSelection: () -> Void
@State private var showOriginalCopiedAlert = false
@State private var showEnhancedCopiedAlert = false
var body: some View {
HStack(spacing: 12) {
// Selection checkbox in macOS style
Toggle("", isOn: Binding(
get: { isSelected },
set: { _ in onToggleSelection() }
))
.toggleStyle(CircularCheckboxStyle())
.labelsHidden()
VStack(alignment: .leading, spacing: 8) {
// Header with date and duration
HStack {
Text(transcription.timestamp, style: .date)
.font(.system(size: 14, weight: .medium, design: .default))
.foregroundColor(.secondary)
Spacer()
Text(formatDuration(transcription.duration))
.font(.system(size: 14, weight: .medium, design: .default))
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(Color.blue.opacity(0.1))
.foregroundColor(.blue)
.cornerRadius(6)
}
// Original text section
VStack(alignment: .leading, spacing: 8) {
if isExpanded {
HStack {
Text("Original")
.font(.system(size: 14, weight: .medium))
.foregroundColor(.secondary)
Spacer()
Button {
copyToClipboard(transcription.text)
showOriginalCopiedAlert = true
} label: {
HStack(spacing: 4) {
Image(systemName: showOriginalCopiedAlert ? "checkmark" : "doc.on.doc")
Text(showOriginalCopiedAlert ? "Copied" : "Copy")
}
.foregroundColor(showOriginalCopiedAlert ? .green : .blue)
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(Color.blue.opacity(0.1))
.cornerRadius(6)
}
.buttonStyle(.plain)
}
}
Text(transcription.text)
.font(.system(size: 15, weight: .regular, design: .default))
.lineLimit(isExpanded ? nil : 2)
.lineSpacing(2)
}
// Enhanced text section (only when expanded)
if isExpanded, let enhancedText = transcription.enhancedText {
Divider()
.padding(.vertical, 8)
VStack(alignment: .leading, spacing: 8) {
HStack {
HStack(spacing: 4) {
Image(systemName: "sparkles")
.foregroundColor(.blue)
Text("Enhanced")
.font(.system(size: 14, weight: .medium))
.foregroundColor(.blue)
}
Spacer()
Button {
copyToClipboard(enhancedText)
showEnhancedCopiedAlert = true
} label: {
HStack(spacing: 4) {
Image(systemName: showEnhancedCopiedAlert ? "checkmark" : "doc.on.doc")
Text(showEnhancedCopiedAlert ? "Copied" : "Copy")
}
.foregroundColor(showEnhancedCopiedAlert ? .green : .blue)
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(Color.blue.opacity(0.1))
.cornerRadius(6)
}
.buttonStyle(.plain)
}
Text(enhancedText)
.font(.system(size: 15, weight: .regular, design: .default))
.lineSpacing(2)
}
}
// Audio player (if available)
if isExpanded, let urlString = transcription.audioFileURL,
let url = URL(string: urlString),
FileManager.default.fileExists(atPath: url.path) {
Divider()
.padding(.vertical, 8)
AudioPlayerView(url: url)
}
// Timestamp (only when expanded)
if isExpanded {
HStack {
Text(transcription.timestamp, style: .time)
.font(.system(size: 14, weight: .regular, design: .default))
.foregroundColor(.secondary)
Spacer()
}
.padding(.top, 4)
}
}
}
.padding(16)
.background(
RoundedRectangle(cornerRadius: 12)
.fill(Color(.windowBackgroundColor).opacity(0.4))
)
.cornerRadius(12)
.shadow(color: Color.black.opacity(0.05), radius: 3, x: 0, y: 2)
.contextMenu {
if let enhancedText = transcription.enhancedText {
Button {
copyToClipboard(enhancedText)
showEnhancedCopiedAlert = true
} label: {
Label("Copy Enhanced", systemImage: "doc.on.doc")
}
}
Button {
copyToClipboard(transcription.text)
showOriginalCopiedAlert = true
} label: {
Label("Copy Original", systemImage: "doc.on.doc")
}
Button(role: .destructive) {
onDelete()
} label: {
Label("Delete", systemImage: "trash")
}
}
.onChange(of: showOriginalCopiedAlert) { _, isShowing in
if isShowing {
DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) {
showOriginalCopiedAlert = false
}
}
}
.onChange(of: showEnhancedCopiedAlert) { _, isShowing in
if isShowing {
DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) {
showEnhancedCopiedAlert = false
}
}
}
}
/// Copies `text` to the system clipboard via `ClipboardManager`,
/// logging a console message when the copy does not succeed.
private func copyToClipboard(_ text: String) {
    guard ClipboardManager.copyToClipboard(text) else {
        print("Failed to copy text to clipboard")
        return
    }
}
/// Formats a duration in seconds as "m:ss" (e.g. 75 → "1:15").
/// Fractional seconds are truncated; minutes are not capped at 60.
private func formatDuration(_ duration: TimeInterval) -> String {
    let totalSeconds = Int(duration)
    return String(format: "%d:%02d", totalSeconds / 60, totalSeconds % 60)
}
}

View File

@ -18,7 +18,7 @@ struct TranscriptionHistoryView: View {
private let pageSize = 20
// Query for latest transcriptions (used for real-time updates)
@Query(sort: \Transcription.timestamp, order: .reverse, animation: .default)
@Query(sort: \Transcription.timestamp, order: .reverse)
private var latestTranscriptions: [Transcription]
// Cursor-based query descriptor
@ -69,13 +69,7 @@ struct TranscriptionHistoryView: View {
onToggleSelection: { toggleSelection(transcription) }
)
.onTapGesture {
withAnimation {
if expandedTranscription == transcription {
expandedTranscription = nil
} else {
expandedTranscription = transcription
}
}
expandedTranscription = expandedTranscription == transcription ? nil : transcription
}
}
@ -140,14 +134,10 @@ struct TranscriptionHistoryView: View {
await loadInitialContent()
}
} else {
// If we're on a paginated view, show a notification or indicator that new content is available
// This could be a banner or button to "Show new transcriptions"
withAnimation {
// Reset pagination to show the latest content
Task {
await resetPagination()
await loadInitialContent()
}
// Reset pagination to show the latest content
Task {
await resetPagination()
await loadInitialContent()
}
}
}
@ -186,14 +176,22 @@ struct TranscriptionHistoryView: View {
}
private var selectionToolbar: some View {
HStack {
HStack(spacing: 12) {
Text("\(selectedTranscriptions.count) selected")
.foregroundColor(.secondary)
.font(.system(size: 14))
Spacer()
Button(action: {
showDeleteConfirmation = true
}) {
Image(systemName: "trash")
HStack(spacing: 4) {
Image(systemName: "trash")
Text("Delete")
}
}
.buttonStyle(.bordered)
.buttonStyle(.borderless)
if selectedTranscriptions.count < displayedTranscriptions.count {
Button("Select All") {
@ -201,16 +199,16 @@ struct TranscriptionHistoryView: View {
await selectAllTranscriptions()
}
}
.buttonStyle(.bordered)
.buttonStyle(.borderless)
} else {
Button("Deselect All") {
selectedTranscriptions.removeAll()
}
.buttonStyle(.bordered)
.buttonStyle(.borderless)
}
}
.padding(24)
.background(Color(.windowBackgroundColor).opacity(0.4))
.padding(16)
.background(Color(.windowBackgroundColor))
}
private func loadInitialContent() async {
@ -377,196 +375,3 @@ struct CircularCheckboxStyle: ToggleStyle {
.buttonStyle(.plain)
}
}
/// A history-list card for a single `Transcription`.
///
/// Shows the capture date and formatted duration, the original transcript
/// (collapsed to two lines unless `isExpanded`), and — when expanded — an
/// optional AI-enhanced transcript, an audio player for the recorded file
/// (when it still exists on disk), and the capture time. Copy buttons and a
/// context menu write text to the clipboard; a destructive menu item invokes
/// `onDelete`.
struct TranscriptionCard: View {
// The transcription record rendered by this card.
let transcription: Transcription
// Whether the card shows its expanded sections (copy buttons, enhanced
// text, audio player, time row) or just the two-line preview.
let isExpanded: Bool
// Whether this card is currently selected; mirrored by the leading checkbox.
let isSelected: Bool
// Invoked by the "Delete" context-menu action.
let onDelete: () -> Void
// Invoked when the selection checkbox is toggled.
let onToggleSelection: () -> Void
// Transient "Copied!" feedback flags; each resets to false 1.5 s after it
// becomes true (see the .onChange handlers at the bottom of `body`).
@State private var showOriginalCopiedAlert = false
@State private var showEnhancedCopiedAlert = false
var body: some View {
HStack(spacing: 12) {
// Selection checkbox in macOS style
Toggle("", isOn: Binding(
get: { isSelected },
set: { _ in onToggleSelection() }
))
.toggleStyle(CircularCheckboxStyle())
.labelsHidden()
VStack(alignment: .leading, spacing: 8) {
// Header with date and duration
HStack {
Text(transcription.timestamp, style: .date)
.font(.system(size: 14, weight: .medium, design: .default))
.foregroundColor(.secondary)
Spacer()
Text(formatDuration(transcription.duration))
.font(.system(size: 14, weight: .medium, design: .default))
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(Color.blue.opacity(0.1))
.foregroundColor(.blue)
.cornerRadius(6)
}
// Original text section
VStack(alignment: .leading, spacing: 8) {
// Header row with copy button only appears when expanded.
if isExpanded {
HStack {
Text("Original")
.font(.system(size: 14, weight: .medium))
.foregroundColor(.secondary)
Spacer()
Button {
copyToClipboard(transcription.text)
showOriginalCopiedAlert = true
} label: {
// Button swaps icon/label/colors while the copied flag is set.
HStack(spacing: 4) {
Image(systemName: showOriginalCopiedAlert ? "checkmark" : "doc.on.doc")
Text(showOriginalCopiedAlert ? "Copied!" : "Copy")
}
.foregroundColor(showOriginalCopiedAlert ? .green : .blue)
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(
RoundedRectangle(cornerRadius: 6)
.fill(showOriginalCopiedAlert ? Color.green.opacity(0.1) : Color.blue.opacity(0.1))
)
}
.buttonStyle(.plain)
.animation(.easeInOut(duration: 0.2), value: showOriginalCopiedAlert)
}
}
// Collapsed cards preview the transcript in at most two lines.
Text(transcription.text)
.font(.system(size: 15, weight: .regular, design: .default))
.lineLimit(isExpanded ? nil : 2)
.lineSpacing(2)
}
// Enhanced text section (only when expanded)
if isExpanded, let enhancedText = transcription.enhancedText {
Divider()
.padding(.vertical, 8)
VStack(alignment: .leading, spacing: 8) {
HStack {
HStack(spacing: 4) {
Image(systemName: "sparkles")
.foregroundColor(.blue)
Text("Enhanced")
.font(.system(size: 14, weight: .medium))
.foregroundColor(.blue)
}
Spacer()
Button {
copyToClipboard(enhancedText)
showEnhancedCopiedAlert = true
} label: {
HStack(spacing: 4) {
Image(systemName: showEnhancedCopiedAlert ? "checkmark" : "doc.on.doc")
Text(showEnhancedCopiedAlert ? "Copied!" : "Copy")
}
.foregroundColor(showEnhancedCopiedAlert ? .green : .blue)
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(
RoundedRectangle(cornerRadius: 6)
.fill(showEnhancedCopiedAlert ? Color.green.opacity(0.1) : Color.blue.opacity(0.1))
)
}
.buttonStyle(.plain)
.animation(.easeInOut(duration: 0.2), value: showEnhancedCopiedAlert)
}
Text(enhancedText)
.font(.system(size: 15, weight: .regular, design: .default))
.lineSpacing(2)
}
}
// Audio player (if available)
// Shown only when the stored URL string parses and the file still exists.
if isExpanded, let urlString = transcription.audioFileURL,
let url = URL(string: urlString),
FileManager.default.fileExists(atPath: url.path) {
Divider()
.padding(.vertical, 8)
AudioPlayerView(url: url)
}
// Timestamp (only when expanded)
if isExpanded {
HStack {
Text(transcription.timestamp, style: .time)
.font(.system(size: 14, weight: .regular, design: .default))
.foregroundColor(.secondary)
Spacer()
}
.padding(.top, 4)
}
}
}
.padding(16)
.background(
RoundedRectangle(cornerRadius: 12)
.fill(Color(.windowBackgroundColor).opacity(0.4))
)
.cornerRadius(12)
.shadow(color: Color.black.opacity(0.05), radius: 3, x: 0, y: 2)
// Right-click actions: copy either text variant, or delete the record.
.contextMenu {
if let enhancedText = transcription.enhancedText {
Button {
copyToClipboard(enhancedText)
showEnhancedCopiedAlert = true
} label: {
Label("Copy Enhanced", systemImage: "doc.on.doc")
}
}
Button {
copyToClipboard(transcription.text)
showOriginalCopiedAlert = true
} label: {
Label("Copy Original", systemImage: "doc.on.doc")
}
Button(role: .destructive) {
onDelete()
} label: {
Label("Delete", systemImage: "trash")
}
}
// Auto-dismiss the "Copied!" state 1.5 s after either copy action.
.onChange(of: showOriginalCopiedAlert) { _, isShowing in
if isShowing {
DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) {
showOriginalCopiedAlert = false
}
}
}
.onChange(of: showEnhancedCopiedAlert) { _, isShowing in
if isShowing {
DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) {
showEnhancedCopiedAlert = false
}
}
}
}
/// Copies `text` to the system clipboard via `ClipboardManager`; logs on failure.
private func copyToClipboard(_ text: String) {
let success = ClipboardManager.copyToClipboard(text)
if !success {
print("Failed to copy text to clipboard")
}
}
/// Formats `duration` (seconds) as "m:ss", truncating fractional seconds.
private func formatDuration(_ duration: TimeInterval) -> String {
let minutes = Int(duration) / 60
let seconds = Int(duration) % 60
return String(format: "%d:%02d", minutes, seconds)
}
}