Feat: Add paragraph formatting in Audio file transcription, remove from LibWhisper

This commit is contained in:
Beingpax 2025-05-11 17:08:33 +05:45
parent f9af8dd8ed
commit d7200d7058
3 changed files with 114 additions and 50 deletions

View File

@ -90,6 +90,7 @@ class AudioTranscriptionManager: ObservableObject {
try await whisperContext?.fullTranscribe(samples: samples)
var text = await whisperContext?.getTranscription() ?? ""
text = text.trimmingCharacters(in: .whitespacesAndNewlines)
text = WhisperTextFormatter.format(text)
// Apply word replacements if enabled
if UserDefaults.standard.bool(forKey: "IsWordReplacementEnabled") {

View File

@ -85,7 +85,7 @@ actor WhisperContext {
let langId = whisper_full_lang_id(context)
let detectedLang = String(cString: whisper_lang_str(langId))
logger.notice("✅ Transcription completed - Language: \(detectedLang)")
whisper_print_timings(context)
}
}
@ -102,8 +102,7 @@ actor WhisperContext {
// Apply hallucination filtering
let filteredTranscription = WhisperHallucinationFilter.filter(transcription)
// Always apply text formatting
return WhisperTextFormatter.format(filteredTranscription)
return filteredTranscription
}
static func createContext(path: String) async throws -> WhisperContext {

View File

@ -1,59 +1,123 @@
import Foundation
import NaturalLanguage
/// Splits a transcription into readable paragraphs.
///
/// The text is tokenized into sentences with `NLTokenizer`, and consecutive
/// sentences are grouped into chunks that are joined with a blank line
/// (`"\n\n"`). Sentences inside a chunk are joined with a single space.
struct WhisperTextFormatter {
    /// A chunk stops growing once it holds at least this many words.
    private static let targetWordCount = 30
    /// Upper bound on "significant" sentences in a chunk, applied only when
    /// the word-count pass collected more than this many.
    private static let maxSignificantSentencesPerChunk = 4
    /// A sentence with at least this many words counts as "significant".
    private static let minWordsForSignificantSentence = 3

    /// Formats `text` into paragraphs separated by blank lines.
    ///
    /// - Parameter text: The raw transcription text.
    /// - Returns: The paragraph-formatted text, trimmed of surrounding
    ///   whitespace and newlines; an empty string when no sentence is found.
    static func format(_ text: String) -> String {
        // Fall back to English when the dominant language cannot be detected.
        let language = NLLanguageRecognizer.dominantLanguage(for: text) ?? .english

        let sentences = splitIntoSentences(text, language: language)
        guard !sentences.isEmpty else {
            return ""
        }

        // Count the words of every sentence exactly once; both the word
        // target and the significant-sentence cap are derived from these.
        let wordTokenizer = NLTokenizer(unit: .word)
        let wordCounts = sentences.map { sentence in
            countWords(in: sentence, using: wordTokenizer, language: language)
        }

        var paragraphs: [String] = []
        var chunkStart = 0
        while chunkStart < sentences.count {
            // `chunkEnd` always returns an index greater than `chunkStart`,
            // so every iteration consumes at least one sentence.
            let end = chunkEnd(startingAt: chunkStart, wordCounts: wordCounts)
            paragraphs.append(sentences[chunkStart..<end].joined(separator: " "))
            chunkStart = end
        }

        return paragraphs
            .joined(separator: "\n\n")
            .trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Returns the sentences of `text`, each trimmed of surrounding
    /// whitespace and newlines. Sentences that are empty after trimming are
    /// dropped so they cannot introduce doubled spaces when joined.
    private static func splitIntoSentences(_ text: String, language: NLLanguage) -> [String] {
        let tokenizer = NLTokenizer(unit: .sentence)
        tokenizer.string = text
        tokenizer.setLanguage(language)

        var sentences: [String] = []
        tokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { range, _ in
            let sentence = text[range].trimmingCharacters(in: .whitespacesAndNewlines)
            if !sentence.isEmpty {
                sentences.append(sentence)
            }
            return true
        }
        return sentences
    }

    /// Returns the number of word tokens `tokenizer` finds in `sentence`.
    private static func countWords(
        in sentence: String,
        using tokenizer: NLTokenizer,
        language: NLLanguage
    ) -> Int {
        // Assign the string first and the language second, on every call.
        tokenizer.string = sentence
        tokenizer.setLanguage(language)
        return tokenizer.tokens(for: sentence.startIndex..<sentence.endIndex).count
    }

    /// Returns the exclusive end index of the chunk that begins at `start`.
    ///
    /// Sentences are accumulated until the running word count reaches
    /// `targetWordCount` or the input ends. If that pass collected more than
    /// `maxSignificantSentencesPerChunk` significant sentences, the chunk is
    /// cut right after the significant sentence that reaches the cap.
    ///
    /// - Precondition: `start < wordCounts.count`.
    private static func chunkEnd(startingAt start: Int, wordCounts: [Int]) -> Int {
        var accumulatedWords = 0
        var significantIndices: [Int] = []
        var end = start

        while end < wordCounts.count {
            let words = wordCounts[end]
            accumulatedWords += words
            if words >= minWordsForSignificantSentence {
                significantIndices.append(end)
            }
            end += 1
            if accumulatedWords >= targetWordCount {
                break // Word target met for this chunk.
            }
        }

        // Trim only when the cap is exceeded; a chunk holding exactly the
        // cap keeps any short sentences that follow the last significant one.
        if significantIndices.count > maxSignificantSentencesPerChunk {
            return significantIndices[maxSignificantSentencesPerChunk - 1] + 1
        }
        return end
    }
}