Feat: Add paragraph formatting in Audio file transcription, remove from LibWhisper
This commit is contained in:
parent
f9af8dd8ed
commit
d7200d7058
@ -90,6 +90,7 @@ class AudioTranscriptionManager: ObservableObject {
|
||||
try await whisperContext?.fullTranscribe(samples: samples)
|
||||
var text = await whisperContext?.getTranscription() ?? ""
|
||||
text = text.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
text = WhisperTextFormatter.format(text)
|
||||
|
||||
// Apply word replacements if enabled
|
||||
if UserDefaults.standard.bool(forKey: "IsWordReplacementEnabled") {
|
||||
|
||||
@ -85,7 +85,7 @@ actor WhisperContext {
|
||||
let langId = whisper_full_lang_id(context)
|
||||
let detectedLang = String(cString: whisper_lang_str(langId))
|
||||
logger.notice("✅ Transcription completed - Language: \(detectedLang)")
|
||||
whisper_print_timings(context)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -102,8 +102,7 @@ actor WhisperContext {
|
||||
// Apply hallucination filtering
|
||||
let filteredTranscription = WhisperHallucinationFilter.filter(transcription)
|
||||
|
||||
// Always apply text formatting
|
||||
return WhisperTextFormatter.format(filteredTranscription)
|
||||
return filteredTranscription
|
||||
}
|
||||
|
||||
static func createContext(path: String) async throws -> WhisperContext {
|
||||
|
||||
@ -1,59 +1,123 @@
|
||||
import Foundation
|
||||
import NaturalLanguage
|
||||
|
||||
/// Groups the sentences of a transcription into paragraphs separated by a blank line.
struct WhisperTextFormatter {
    /// A paragraph is closed as soon as it holds at least this many words.
    private static let targetWordCount = 30
    /// Maximum number of significant sentences allowed in one paragraph.
    private static let maxSentencesPerChunk = 4
    /// Sentences with fewer words than this do not count toward `maxSentencesPerChunk`.
    private static let minWordsForSignificantSentence = 3

    /// Splits `text` into sentences and joins them into paragraphs.
    ///
    /// Sentences are accumulated until the paragraph reaches `targetWordCount` words.
    /// If that paragraph would hold more than `maxSentencesPerChunk` significant
    /// sentences, it is cut right after the last allowed significant sentence and the
    /// remainder starts the next paragraph.
    ///
    /// - Parameter text: The raw transcription.
    /// - Returns: The sentences joined by single spaces inside a paragraph and by
    ///   `"\n\n"` between paragraphs; an empty string when no sentence is found.
    static func format(_ text: String) -> String {
        // Fall back to English when the dominant language cannot be determined.
        let language = NLLanguageRecognizer.dominantLanguage(for: text) ?? .english

        let sentences = splitIntoSentences(text, language: language)
        guard !sentences.isEmpty else {
            return ""
        }

        // Count the words of every sentence exactly once; both the word-target pass
        // and the sentence-cap pass below work from these cached counts.
        let wordTokenizer = NLTokenizer(unit: .word)
        let wordCounts = sentences.map { countWords(in: $0, using: wordTokenizer, language: language) }

        var paragraphs: [String] = []
        var start = 0
        while start < sentences.count {
            // `chunkEnd` always returns an index greater than `start`, so the loop terminates.
            let end = chunkEnd(startingAt: start, wordCounts: wordCounts)
            paragraphs.append(sentences[start..<end].joined(separator: " "))
            start = end
        }

        return paragraphs
            .joined(separator: "\n\n")
            .trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Returns the trimmed sentences of `text` in order of appearance.
    private static func splitIntoSentences(_ text: String, language: NLLanguage) -> [String] {
        let tokenizer = NLTokenizer(unit: .sentence)
        tokenizer.string = text
        tokenizer.setLanguage(language)

        var sentences: [String] = []
        tokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { range, _ in
            let sentence = text[range].trimmingCharacters(in: .whitespacesAndNewlines)
            // Defensive: skip a token that is empty after trimming so it cannot
            // introduce doubled spaces when the paragraph is joined.
            if !sentence.isEmpty {
                sentences.append(sentence)
            }
            return true
        }
        return sentences
    }

    /// Returns the number of word tokens in `sentence`.
    private static func countWords(in sentence: String, using tokenizer: NLTokenizer, language: NLLanguage) -> Int {
        tokenizer.string = sentence
        // The language is set after the string is assigned, as the tokenizer is reused.
        tokenizer.setLanguage(language)

        var count = 0
        tokenizer.enumerateTokens(in: sentence.startIndex..<sentence.endIndex) { _, _ in
            count += 1
            return true
        }
        return count
    }

    /// Returns the exclusive end index of the paragraph that begins at `start`.
    ///
    /// - Precondition: `start < wordCounts.count`.
    private static func chunkEnd(startingAt start: Int, wordCounts: [Int]) -> Int {
        // Pass 1: extend the paragraph until the word target is met or the input ends.
        var end = start
        var words = 0
        var significantSentences = 0
        while end < wordCounts.count {
            let count = wordCounts[end]
            end += 1
            words += count
            if count >= minWordsForSignificantSentence {
                significantSentences += 1
            }
            if words >= targetWordCount {
                break
            }
        }

        guard significantSentences > maxSentencesPerChunk else {
            return end
        }

        // Pass 2: too many significant sentences; cut right after the last allowed one.
        var seen = 0
        for index in start..<end where wordCounts[index] >= minWordsForSignificantSentence {
            seen += 1
            if seen == maxSentencesPerChunk {
                return index + 1
            }
        }
        return end
    }
}
|
||||
Loading…
x
Reference in New Issue
Block a user