From 3b043f4da99a31ff1eb619d387052f7d96b0e0f3 Mon Sep 17 00:00:00 2001 From: Nico Duldhardt Date: Sun, 7 Dec 2025 00:23:04 +0100 Subject: [PATCH 1/3] Fix OCR capturing VoiceInk status overlay instead of frontmost app window The screen capture service was selecting the first layer-0 window, which during recording was VoiceInk's own status indicator overlay. This caused OCR to always return 'No text detected' since the overlay has no readable content. Changes: - Filter out windows owned by VoiceInk's process - Prioritize windows belonging to NSWorkspace.frontmostApplication - Filter out tiny windows (<120x120) to avoid tooltips/overlays - Move CGWindowListCopyWindowInfo off main thread for better UI responsiveness - Refactor WindowCandidate struct to class scope --- VoiceInk/Services/ScreenCaptureService.swift | 73 ++++++++++++++++---- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/VoiceInk/Services/ScreenCaptureService.swift b/VoiceInk/Services/ScreenCaptureService.swift index 1f2ad6a..1e9776f 100644 --- a/VoiceInk/Services/ScreenCaptureService.swift +++ b/VoiceInk/Services/ScreenCaptureService.swift @@ -14,27 +14,74 @@ class ScreenCaptureService: ObservableObject { category: "aienhancement" ) - private func getActiveWindowInfo() -> (title: String, ownerName: String, windowID: CGWindowID)? { - let windowListInfo = CGWindowListCopyWindowInfo([.optionOnScreenOnly], kCGNullWindowID) as? [[String: Any]] ?? [] + private struct WindowCandidate { + let title: String + let ownerName: String + let windowID: CGWindowID + let ownerPID: pid_t + let layer: Int32 + let bounds: CGRect + } - if let frontWindow = windowListInfo.first(where: { info in - let layer = info[kCGWindowLayer as String] as? Int32 ?? 0 - return layer == 0 - }) { - guard let windowID = frontWindow[kCGWindowNumber as String] as? CGWindowID, - let ownerName = frontWindow[kCGWindowOwnerName as String] as? String, - let title = frontWindow[kCGWindowName as String] as? String else { - return nil + private func getActiveWindowInfo() async -> (title: String, ownerName: String, windowID: CGWindowID)? { + let currentPID = ProcessInfo.processInfo.processIdentifier + let frontmostPID = await NSWorkspace.shared.frontmostApplication?.processIdentifier + + // Move expensive window list retrieval off the main thread + let candidates = await Task.detached(priority: .userInitiated) { () -> [WindowCandidate] in + let windowListInfo = CGWindowListCopyWindowInfo([.optionOnScreenOnly], kCGNullWindowID) as? [[String: Any]] ?? [] + + return windowListInfo.compactMap { info -> WindowCandidate? in + guard let windowID = info[kCGWindowNumber as String] as? CGWindowID, + let ownerName = info[kCGWindowOwnerName as String] as? String, + let ownerPIDNumber = info[kCGWindowOwnerPID as String] as? NSNumber, + let layer = info[kCGWindowLayer as String] as? Int32, + let boundsDict = info[kCGWindowBounds as String] as? [String: Any], + let width = boundsDict["Width"] as? CGFloat, + let height = boundsDict["Height"] as? CGFloat else { + return nil + } + + let rawTitle = (info[kCGWindowName as String] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) + let resolvedTitle = rawTitle?.isEmpty == false ? rawTitle! : ownerName + let bounds = CGRect( + x: boundsDict["X"] as? CGFloat ?? 0, + y: boundsDict["Y"] as? CGFloat ?? 0, + width: width, + height: height + ) + + return WindowCandidate( + title: resolvedTitle, + ownerName: ownerName, + windowID: windowID, + ownerPID: ownerPIDNumber.int32Value, + layer: layer, + bounds: bounds + ) } + }.value - return (title: title, ownerName: ownerName, windowID: windowID) + func isEligible(_ candidate: WindowCandidate) -> Bool { + guard candidate.layer == 0 else { return false } + guard candidate.ownerPID != currentPID else { return false } + return candidate.bounds.width >= 120 && candidate.bounds.height >= 120 + } + + if let frontmostPID = frontmostPID, + let focusedWindow = candidates.first(where: { isEligible($0) && $0.ownerPID == frontmostPID }) { + return (title: focusedWindow.title, ownerName: focusedWindow.ownerName, windowID: focusedWindow.windowID) + } + + if let fallbackWindow = candidates.first(where: isEligible) { + return (title: fallbackWindow.title, ownerName: fallbackWindow.ownerName, windowID: fallbackWindow.windowID) } return nil } func captureActiveWindow() async -> NSImage? { - guard let windowInfo = getActiveWindowInfo() else { + guard let windowInfo = await getActiveWindowInfo() else { return nil } @@ -111,7 +158,7 @@ class ScreenCaptureService: ObservableObject { } } - guard let windowInfo = getActiveWindowInfo() else { + guard let windowInfo = await getActiveWindowInfo() else { logger.notice("📸 No active window found") return nil } From df2a8d99019e61120a0a2be70b99b40af199c193 Mon Sep 17 00:00:00 2001 From: Nico Duldhardt Date: Sun, 7 Dec 2025 00:52:52 +0100 Subject: [PATCH 2/3] remove check for windows < 120x120 --- VoiceInk/Services/ScreenCaptureService.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VoiceInk/Services/ScreenCaptureService.swift b/VoiceInk/Services/ScreenCaptureService.swift index 1e9776f..1166187 100644 --- a/VoiceInk/Services/ScreenCaptureService.swift +++ b/VoiceInk/Services/ScreenCaptureService.swift @@ -65,7 +65,7 @@ class ScreenCaptureService: ObservableObject { func isEligible(_ candidate: WindowCandidate) -> Bool { guard candidate.layer == 0 else { return false } guard candidate.ownerPID != currentPID else { return false } - return candidate.bounds.width >= 120 && candidate.bounds.height >= 120 + return true } if let frontmostPID = frontmostPID, From d25ae523336c4111020d6a0697c95d23a0d4d8ff Mon Sep 17 00:00:00 2001 From: Nico Duldhardt Date: Sun, 7 Dec 2025 00:52:52 +0100 Subject: [PATCH 3/3] Fix OCR capturing VoiceInk status overlay instead of frontmost app window The screen capture service was selecting the first layer-0 window, which during recording was VoiceInk's own status indicator overlay. This caused OCR to always return 'No text detected' since the overlay has no readable content. Changes: - Filter out windows owned by VoiceInk's process - Prioritize windows belonging to NSWorkspace.frontmostApplication - Refactor WindowCandidate struct to class scope --- VoiceInk/Services/ScreenCaptureService.swift | 61 ++++++++------------ 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/VoiceInk/Services/ScreenCaptureService.swift b/VoiceInk/Services/ScreenCaptureService.swift index 1166187..153d376 100644 --- a/VoiceInk/Services/ScreenCaptureService.swift +++ b/VoiceInk/Services/ScreenCaptureService.swift @@ -20,47 +20,32 @@ class ScreenCaptureService: ObservableObject { let windowID: CGWindowID let ownerPID: pid_t let layer: Int32 - let bounds: CGRect } - private func getActiveWindowInfo() async -> (title: String, ownerName: String, windowID: CGWindowID)? { + private func getActiveWindowInfo() -> (title: String, ownerName: String, windowID: CGWindowID)? { let currentPID = ProcessInfo.processInfo.processIdentifier - let frontmostPID = await NSWorkspace.shared.frontmostApplication?.processIdentifier + let frontmostPID = NSWorkspace.shared.frontmostApplication?.processIdentifier + let windowListInfo = CGWindowListCopyWindowInfo([.optionOnScreenOnly], kCGNullWindowID) as? [[String: Any]] ?? [] - // Move expensive window list retrieval off the main thread - let candidates = await Task.detached(priority: .userInitiated) { () -> [WindowCandidate] in - let windowListInfo = CGWindowListCopyWindowInfo([.optionOnScreenOnly], kCGNullWindowID) as? [[String: Any]] ?? [] - - return windowListInfo.compactMap { info -> WindowCandidate? in - guard let windowID = info[kCGWindowNumber as String] as? CGWindowID, - let ownerName = info[kCGWindowOwnerName as String] as? String, - let ownerPIDNumber = info[kCGWindowOwnerPID as String] as? NSNumber, - let layer = info[kCGWindowLayer as String] as? Int32, - let boundsDict = info[kCGWindowBounds as String] as? [String: Any], - let width = boundsDict["Width"] as? CGFloat, - let height = boundsDict["Height"] as? CGFloat else { - return nil - } - - let rawTitle = (info[kCGWindowName as String] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) - let resolvedTitle = rawTitle?.isEmpty == false ? rawTitle! : ownerName - let bounds = CGRect( - x: boundsDict["X"] as? CGFloat ?? 0, - y: boundsDict["Y"] as? CGFloat ?? 0, - width: width, - height: height - ) - - return WindowCandidate( - title: resolvedTitle, - ownerName: ownerName, - windowID: windowID, - ownerPID: ownerPIDNumber.int32Value, - layer: layer, - bounds: bounds - ) + let candidates = windowListInfo.compactMap { info -> WindowCandidate? in + guard let windowID = info[kCGWindowNumber as String] as? CGWindowID, + let ownerName = info[kCGWindowOwnerName as String] as? String, + let ownerPIDNumber = info[kCGWindowOwnerPID as String] as? NSNumber, + let layer = info[kCGWindowLayer as String] as? Int32 else { + return nil } - }.value + + let rawTitle = (info[kCGWindowName as String] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) + let resolvedTitle = rawTitle?.isEmpty == false ? rawTitle! : ownerName + + return WindowCandidate( + title: resolvedTitle, + ownerName: ownerName, + windowID: windowID, + ownerPID: ownerPIDNumber.int32Value, + layer: layer + ) + } func isEligible(_ candidate: WindowCandidate) -> Bool { guard candidate.layer == 0 else { return false } @@ -81,7 +66,7 @@ class ScreenCaptureService: ObservableObject { } func captureActiveWindow() async -> NSImage? { - guard let windowInfo = await getActiveWindowInfo() else { + guard let windowInfo = getActiveWindowInfo() else { return nil } @@ -158,7 +143,7 @@ class ScreenCaptureService: ObservableObject { } } - guard let windowInfo = await getActiveWindowInfo() else { + guard let windowInfo = getActiveWindowInfo() else { logger.notice("📸 No active window found") return nil }