vOOice/VoiceInk/Whisper/WhisperTextFormatter.swift

123 lines
6.1 KiB
Swift

import Foundation
import NaturalLanguage
/// Re-flows a block of transcribed text into paragraphs separated by blank lines.
///
/// The text is split into sentences with `NLTokenizer`. Sentences are then grouped
/// greedily into chunks: a chunk keeps absorbing sentences until it reaches
/// `targetWordCount` words, and is afterwards cut back if it contains more than
/// `maxSentencesPerChunk` "significant" sentences.
struct WhisperTextFormatter {
    /// Word count at which a chunk stops absorbing further sentences.
    private static let targetWordCount = 30
    /// Largest number of significant sentences a chunk may keep once the cap applies.
    private static let maxSentencesPerChunk = 4
    /// A sentence is "significant" (counts toward the cap) when it has at least this many words.
    private static let minWordsForSignificantSentence = 3

    /// Formats `text` into paragraphs.
    ///
    /// - Parameter text: The raw text to re-flow.
    /// - Returns: The sentences of `text`, each trimmed of surrounding whitespace, joined
    ///   with single spaces inside a paragraph and with `"\n\n"` between paragraphs.
    ///   Returns `""` when the tokenizer finds no non-empty sentence.
    static func format(_ text: String) -> String {
        // Fall back to English when the dominant language cannot be determined.
        let language = NLLanguageRecognizer.dominantLanguage(for: text) ?? .english

        let sentences = trimmedSentences(in: text, language: language)
        guard !sentences.isEmpty else {
            return ""
        }

        // Tokenize every sentence exactly once; both chunking passes reuse these counts.
        let wordTokenizer = NLTokenizer(unit: .word)
        let wordCounts = sentences.map { wordCount(of: $0, using: wordTokenizer, language: language) }

        var paragraphs: [String] = []
        var chunkStart = 0
        while chunkStart < sentences.count {
            // `chunkEnd` always returns an index > chunkStart, so this loop terminates.
            let chunkStop = chunkEnd(startingAt: chunkStart, wordCounts: wordCounts)
            paragraphs.append(sentences[chunkStart..<chunkStop].joined(separator: " "))
            chunkStart = chunkStop
        }

        return paragraphs.joined(separator: "\n\n")
    }

    /// Splits `text` into sentences and trims each one.
    ///
    /// Tokens that are empty after trimming are dropped so they cannot introduce
    /// doubled spaces when a chunk is joined.
    private static func trimmedSentences(in text: String, language: NLLanguage) -> [String] {
        let tokenizer = NLTokenizer(unit: .sentence)
        tokenizer.string = text
        tokenizer.setLanguage(language)

        var sentences: [String] = []
        tokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { range, _ in
            let sentence = String(text[range]).trimmingCharacters(in: .whitespacesAndNewlines)
            if !sentence.isEmpty {
                sentences.append(sentence)
            }
            return true
        }
        return sentences
    }

    /// Counts the word tokens in `sentence`, reusing the caller's tokenizer.
    private static func wordCount(of sentence: String, using tokenizer: NLTokenizer, language: NLLanguage) -> Int {
        // Same order as the sentence tokenizer: assign the string, then the language.
        tokenizer.string = sentence
        tokenizer.setLanguage(language)

        var count = 0
        tokenizer.enumerateTokens(in: sentence.startIndex..<sentence.endIndex) { _, _ in
            count += 1
            return true
        }
        return count
    }

    /// Returns the exclusive end index of the chunk that begins at `start`.
    ///
    /// - Parameters:
    ///   - start: Index of the first sentence of the chunk; must be a valid index of `wordCounts`.
    ///   - wordCounts: Word count of every sentence, in document order.
    /// - Returns: An index strictly greater than `start`.
    private static func chunkEnd(startingAt start: Int, wordCounts: [Int]) -> Int {
        // Pass 1: extend the chunk until the word target is met or the input runs out.
        var totalWords = 0
        var significantCount = 0
        var tentativeEnd = start
        while tentativeEnd < wordCounts.count {
            let words = wordCounts[tentativeEnd]
            totalWords += words
            if words >= minWordsForSignificantSentence {
                significantCount += 1
            }
            tentativeEnd += 1
            if totalWords >= targetWordCount {
                break
            }
        }

        // The cap only applies when the tentative chunk *exceeds* it. A chunk with exactly
        // `maxSentencesPerChunk` significant sentences keeps any short trailing sentences.
        guard significantCount > maxSentencesPerChunk else {
            return tentativeEnd
        }

        // Pass 2: cut right after the `maxSentencesPerChunk`-th significant sentence.
        var significantSeen = 0
        for index in start..<tentativeEnd where wordCounts[index] >= minWordsForSignificantSentence {
            significantSeen += 1
            if significantSeen >= maxSentencesPerChunk {
                return index + 1
            }
        }
        return tentativeEnd
    }
}