feat: Make LibWhisper more robust with refined parameters

This commit is contained in:
Beingpax 2025-07-08 12:32:01 +05:45
parent fe19b60747
commit 2c34225c88

View File

@@ -18,9 +18,7 @@ actor WhisperContext {
private var promptCString: [CChar]?
private let logger = Logger(subsystem: "com.prakashjoshipax.voiceink", category: "WhisperContext")
private init() {
// Private initializer without context
}
private init() {}
init(context: OpaquePointer) {
self.context = context
@@ -35,22 +33,18 @@ actor WhisperContext {
func fullTranscribe(samples: [Float]) async {
guard let context = context else { return }
// Leave 2 processors free (i.e. the high-efficiency cores).
let maxThreads = max(1, min(8, cpuCount() - 2))
var params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY)
// Read language directly from UserDefaults
let selectedLanguage = UserDefaults.standard.string(forKey: "SelectedLanguage") ?? "auto"
if selectedLanguage != "auto" {
languageCString = Array(selectedLanguage.utf8CString)
params.language = languageCString?.withUnsafeBufferPointer { ptr in
ptr.baseAddress
}
logger.notice("🌐 Using language: \(selectedLanguage)")
} else {
languageCString = nil
params.language = nil
logger.notice("🌐 Using auto language detection")
}
if prompt != nil {
@@ -58,53 +52,46 @@ actor WhisperContext {
params.initial_prompt = promptCString?.withUnsafeBufferPointer { ptr in
ptr.baseAddress
}
logger.notice("💬 Using prompt for transcription in language: \(selectedLanguage)")
} else {
promptCString = nil
params.initial_prompt = nil
}
params.print_realtime = true
params.print_progress = false
params.print_realtime = true
params.print_progress = false
params.print_timestamps = true
params.print_special = false
params.translate = false
params.n_threads = Int32(maxThreads)
params.offset_ms = 0
params.no_context = true
params.single_segment = false
params.print_special = false
params.translate = false
params.n_threads = Int32(maxThreads)
params.offset_ms = 0
params.no_context = true
params.single_segment = false
params.suppress_nst = true
params.entropy_thold = 2.0
params.logprob_thold = -0.8
params.no_speech_thold = 0.6
whisper_reset_timings(context)
logger.notice("⚙️ Starting whisper transcription with VAD: \(params.vad ? "ENABLED" : "DISABLED")")
if let vadModelPath = await VADModelManager.shared.getModelPath() {
logger.notice("🎤 VAD is ENABLED - Successfully retrieved VAD model path: \(vadModelPath)")
params.vad = true
params.vad_model_path = (vadModelPath as NSString).utf8String
var vadParams = whisper_vad_default_params()
vadParams.min_speech_duration_ms = 500
vadParams.min_silence_duration_ms = 500
vadParams.threshold = 0.50
vadParams.min_speech_duration_ms = 250
vadParams.min_silence_duration_ms = 100
vadParams.max_speech_duration_s = Float.greatestFiniteMagnitude
vadParams.speech_pad_ms = 30
vadParams.samples_overlap = 0.1
params.vad_params = vadParams
logger.notice("🎤 VAD configured with parameters: min_speech=500ms, min_silence=500ms, overlap=10%")
logger.notice("🎤 VAD will be used for voice activity detection during transcription")
} else {
logger.notice("🎤 VAD is DISABLED - VAD model path not found, proceeding without VAD")
params.vad = false
logger.notice("🎤 Transcription will process entire audio without voice activity detection")
}
samples.withUnsafeBufferPointer { samplesBuffer in
if whisper_full(context, params, samplesBuffer.baseAddress, Int32(samplesBuffer.count)) != 0 {
self.logger.error("Failed to run whisper_full")
} else {
if params.vad {
self.logger.notice("✅ Whisper transcription completed successfully with VAD processing")
} else {
self.logger.notice("✅ Whisper transcription completed successfully without VAD")
}
}
}
@@ -118,19 +105,13 @@ actor WhisperContext {
for i in 0..<whisper_full_n_segments(context) {
transcription += String(cString: whisper_full_get_segment_text(context, i))
}
// Apply hallucination filtering
let filteredTranscription = WhisperHallucinationFilter.filter(transcription)
return filteredTranscription
}
/// Creates a `WhisperContext` and loads the whisper model at `path` inside
/// the actor's isolation, so model setup never races with other actor calls.
/// - Parameter path: Filesystem path to the whisper model file.
/// - Returns: A `WhisperContext` whose model has been initialized.
/// - Throws: Whatever `initializeModel(path:)` throws (e.g. on model load
///   failure — NOTE(review): exact error type not visible in this chunk).
static func createContext(path: String) async throws -> WhisperContext {
// Create empty context first
let whisperContext = WhisperContext()
// Initialize the context within the actor's isolated context
try await whisperContext.initializeModel(path: path)
return whisperContext
}
@@ -138,7 +119,6 @@ actor WhisperContext {
var params = whisper_context_default_params()
#if targetEnvironment(simulator)
params.use_gpu = false
logger.notice("🖥️ Running on simulator, using CPU")
#endif
let context = whisper_init_from_file_with_params(path, params)
@@ -160,7 +140,6 @@ actor WhisperContext {
/// Stores an optional initial prompt that `fullTranscribe(samples:)` later
/// converts to a C string and passes as `params.initial_prompt`.
/// - Parameter prompt: Prompt text to bias transcription, or `nil` to clear it.
func setPrompt(_ prompt: String?) {
self.prompt = prompt
// Log the new value; "none" stands in for nil so the log line is always readable.
logger.notice("💬 Prompt set: \(prompt ?? "none")")
}
}