Feat: Add paragraph formatting to audio file transcription; remove formatting from LibWhisper

2025-05-11 17:08:33 +05:45 · 2025-05-11 17:08:33 +05:45 · d7200d7058
commit d7200d7058
parent f9af8dd8ed
3 changed files with 114 additions and 50 deletions
--- a/VoiceInk/Services/AudioTranscriptionManager.swift
+++ b/VoiceInk/Services/AudioTranscriptionManager.swift
@ -90,6 +90,7 @@ class AudioTranscriptionManager: ObservableObject {
                try await whisperContext?.fullTranscribe(samples: samples)
                var text = await whisperContext?.getTranscription() ?? ""
                text = text.trimmingCharacters(in: .whitespacesAndNewlines)
+                text = WhisperTextFormatter.format(text)
                
                // Apply word replacements if enabled
                if UserDefaults.standard.bool(forKey: "IsWordReplacementEnabled") {
--- a/VoiceInk/Whisper/LibWhisper.swift
+++ b/VoiceInk/Whisper/LibWhisper.swift
@ -85,7 +85,7 @@ actor WhisperContext {
                let langId = whisper_full_lang_id(context)
                let detectedLang = String(cString: whisper_lang_str(langId))
                logger.notice("✅ Transcription completed - Language: \(detectedLang)")
-                whisper_print_timings(context)
+                
            }
        }
        
@ -102,8 +102,7 @@ actor WhisperContext {
        // Apply hallucination filtering
        let filteredTranscription = WhisperHallucinationFilter.filter(transcription)

-        // Always apply text formatting
-        return WhisperTextFormatter.format(filteredTranscription)
+        return filteredTranscription
    }

    static func createContext(path: String) async throws -> WhisperContext {
--- a/VoiceInk/Whisper/WhisperTextFormatter.swift
+++ b/VoiceInk/Whisper/WhisperTextFormatter.swift
@ -1,59 +1,123 @@
 import Foundation
+import NaturalLanguage

 struct WhisperTextFormatter {
    static func format(_ text: String) -> String {
-        var formattedText = text
+        let TARGET_WORD_COUNT = 30
+        let MAX_SENTENCES_PER_CHUNK = 4
+        let MIN_WORDS_FOR_SIGNIFICANT_SENTENCE = 3
+
+        var finalFormattedText = ""
        
-        // Handle single-word variants
-        let singleWordPatterns = [
-            (pattern: "\\b(newline)\\b", replacement: "new line"),
-            (pattern: "\\b(newparagraph)\\b", replacement: "new paragraph")
-        ]
+        // Attempt to detect the language of the input text
+        let detectedLanguage = NLLanguageRecognizer.dominantLanguage(for: text)
+        let tokenizerLanguage = detectedLanguage ?? .english // Fallback to English if detection fails
        
-        for (pattern, replacement) in singleWordPatterns {
-            formattedText = formattedText.replacingOccurrences(
-                of: pattern,
-                with: replacement,
-                options: [.regularExpression, .caseInsensitive]
-            )
+        let sentenceTokenizer = NLTokenizer(unit: .sentence)
+        sentenceTokenizer.string = text
+        sentenceTokenizer.setLanguage(tokenizerLanguage)
+
+        var allSentencesFromInput = [String]()
+        sentenceTokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { sentenceRange, _ in
+            let rawSentence = String(text[sentenceRange])
+            allSentencesFromInput.append(rawSentence.trimmingCharacters(in: .whitespacesAndNewlines))
+            return true
        }

-        // Insert a period before 'new line' or 'new paragraph' if not preceded by punctuation
-        let punctuationInsertPatterns = [
-            (pattern: "(?<![.!?,\n\r])\\s*(new\\s+line)", replacement: ". new line"),
-            (pattern: "(?<![.!?,\n\r])\\s*(new\\s+paragraph)", replacement: ". new paragraph")
-        ]
-        for (pattern, replacement) in punctuationInsertPatterns {
-            formattedText = formattedText.replacingOccurrences(
-                of: pattern,
-                with: replacement,
-                options: [.regularExpression, .caseInsensitive]
-            )
+        guard !allSentencesFromInput.isEmpty else {
+            return ""
+        }
+
+        var processedSentenceGlobalIndex = 0
+
+        while processedSentenceGlobalIndex < allSentencesFromInput.count {
+            var currentChunkTentativeSentences = [String]()
+            var currentChunkWordCount = 0
+            var currentChunkSignificantSentenceCount = 0
+
+            // Build a tentative chunk based on TARGET_WORD_COUNT
+            for i in processedSentenceGlobalIndex..<allSentencesFromInput.count {
+                let sentence = allSentencesFromInput[i]
+                
+                let wordTokenizer = NLTokenizer(unit: .word)
+                wordTokenizer.string = sentence
+                wordTokenizer.setLanguage(tokenizerLanguage)
+                var wordsInSentence = 0
+                wordTokenizer.enumerateTokens(in: sentence.startIndex..<sentence.endIndex) { _, _ in
+                    wordsInSentence += 1
+                    return true
+                }
+                
+                currentChunkTentativeSentences.append(sentence)
+                currentChunkWordCount += wordsInSentence
+                
+                if wordsInSentence >= MIN_WORDS_FOR_SIGNIFICANT_SENTENCE {
+                    currentChunkSignificantSentenceCount += 1
+                }
+                
+                if currentChunkWordCount >= TARGET_WORD_COUNT {
+                    break // Word target met for this tentative chunk
+                }
+            }
+            
+            // Now, apply MAX_SENTENCES_PER_CHUNK rule based on significant sentences
+            var sentencesForThisFinalChunk = [String]()
+            if currentChunkSignificantSentenceCount > MAX_SENTENCES_PER_CHUNK {
+                var significantSentencesCountedInTrim = 0
+                for sentenceInTentativeChunk in currentChunkTentativeSentences {
+                    sentencesForThisFinalChunk.append(sentenceInTentativeChunk)
+                    
+                    // Re-check if this sentence was significant to count towards the cap
+                    let wordTokenizerForTrimCheck = NLTokenizer(unit: .word)
+                    wordTokenizerForTrimCheck.string = sentenceInTentativeChunk
+                    wordTokenizerForTrimCheck.setLanguage(tokenizerLanguage)
+                    var wordsInCurrentSentenceForTrim = 0
+                    wordTokenizerForTrimCheck.enumerateTokens(in: sentenceInTentativeChunk.startIndex..<sentenceInTentativeChunk.endIndex) { _, _ in
+                        wordsInCurrentSentenceForTrim += 1
+                        return true
+                    }
+
+                    if wordsInCurrentSentenceForTrim >= MIN_WORDS_FOR_SIGNIFICANT_SENTENCE {
+                        significantSentencesCountedInTrim += 1
+                        if significantSentencesCountedInTrim >= MAX_SENTENCES_PER_CHUNK {
+                            break // Reached the cap of significant sentences for this chunk
+                        }
+                    }
+                }
+            } else {
+                sentencesForThisFinalChunk = currentChunkTentativeSentences
+            }
+
+            if !sentencesForThisFinalChunk.isEmpty {
+                let segmentStringToAppend = sentencesForThisFinalChunk.joined(separator: " ")
+                
+                if !finalFormattedText.isEmpty {
+                    finalFormattedText += "\n\n"
+                }
+                finalFormattedText += segmentStringToAppend
+                
+                processedSentenceGlobalIndex += sentencesForThisFinalChunk.count
+            } else {
+                // Safeguard: this branch is not expected to run, because the loop above always adds
+                // at least one sentence to the tentative chunk and trimming always keeps at least one.
+                // It only guarantees that the outer loop terminates if that invariant is ever broken.
+                if processedSentenceGlobalIndex >= allSentencesFromInput.count && currentChunkTentativeSentences.isEmpty {
+                     break // All input processed
+                } else if sentencesForThisFinalChunk.isEmpty && !currentChunkTentativeSentences.isEmpty {
+                    // The tentative chunk had sentences, but trimming left the final chunk empty.
+                    // The current logic should not produce this; as a safety net, skip past everything that was considered.
+                    processedSentenceGlobalIndex += currentChunkTentativeSentences.count 
+                } else if sentencesForThisFinalChunk.isEmpty && currentChunkTentativeSentences.isEmpty && processedSentenceGlobalIndex < allSentencesFromInput.count {
+                     // The tentative chunk is empty although input remains, so the loop above did not run; jump to the end to avoid looping forever.
+                    processedSentenceGlobalIndex = allSentencesFromInput.count // Mark as processed to exit
+                    break;
+                }
+                 else if sentencesForThisFinalChunk.isEmpty { // General catch-all if empty for other reasons
+                    break
+                }
+            }
        }
        
-        // Then handle the new line/paragraph commands with any combination of spaces and punctuation
-        let patterns = [
-            // Handle "new paragraph" with any combination of spaces and punctuation
-            (pattern: "\\s*new\\s+paragraph\\s*[,.!?]?\\s*", replacement: "\n\n"),
-            // Handle "new line" with any combination of spaces and punctuation
-            (pattern: "\\s*new\\s+line\\s*[,.!?]?\\s*", replacement: "\n")
-        ]
-        
-        for (pattern, replacement) in patterns {
-            formattedText = formattedText.replacingOccurrences(
-                of: pattern,
-                with: replacement,
-                options: [.regularExpression, .caseInsensitive]
-            )
-        }
-        
-        // Clean up any multiple consecutive newlines (more than 2)
-        formattedText = formattedText.replacingOccurrences(
-            of: "\n{3,}",
-            with: "\n\n",
-            options: .regularExpression
-        )
-        
-        return formattedText
+        return finalFormattedText.trimmingCharacters(in: .whitespacesAndNewlines)
    }
 }