From 3b043f4da99a31ff1eb619d387052f7d96b0e0f3 Mon Sep 17 00:00:00 2001
From: Nico Duldhardt
Date: Sun, 7 Dec 2025 00:23:04 +0100
Subject: [PATCH] Fix OCR capturing VoiceInk status overlay instead of frontmost app window

The screen capture service was selecting the first layer-0 window, which
during recording was VoiceInk's own status indicator overlay. This caused
OCR to always return 'No text detected' since the overlay has no readable
content.

Changes:
- Filter out windows owned by VoiceInk's process
- Prioritize windows belonging to NSWorkspace.frontmostApplication
- Filter out tiny windows (<120x120) to avoid tooltips/overlays
- Move CGWindowListCopyWindowInfo off main thread for better UI responsiveness
- Refactor WindowCandidate struct to class scope
---
 VoiceInk/Services/ScreenCaptureService.swift | 80 ++++++++++++++++----
 1 file changed, 67 insertions(+), 13 deletions(-)

diff --git a/VoiceInk/Services/ScreenCaptureService.swift b/VoiceInk/Services/ScreenCaptureService.swift
index 1f2ad6a..1e9776f 100644
--- a/VoiceInk/Services/ScreenCaptureService.swift
+++ b/VoiceInk/Services/ScreenCaptureService.swift
@@ -14,27 +14,81 @@ class ScreenCaptureService: ObservableObject {
         category: "aienhancement"
     )
 
-    private func getActiveWindowInfo() -> (title: String, ownerName: String, windowID: CGWindowID)? {
-        let windowListInfo = CGWindowListCopyWindowInfo([.optionOnScreenOnly], kCGNullWindowID) as? [[String: Any]] ?? []
+    /// Fields of one window-list entry that are needed to pick a capture target.
+    private struct WindowCandidate {
+        let title: String
+        let ownerName: String
+        let windowID: CGWindowID
+        let ownerPID: pid_t
+        let layer: Int32
+        let bounds: CGRect
+    }
 
-        if let frontWindow = windowListInfo.first(where: { info in
-            let layer = info[kCGWindowLayer as String] as? Int32 ?? 0
-            return layer == 0
-        }) {
-            guard let windowID = frontWindow[kCGWindowNumber as String] as? CGWindowID,
-                  let ownerName = frontWindow[kCGWindowOwnerName as String] as? String,
-                  let title = frontWindow[kCGWindowName as String] as? String else {
-                return nil
+    /// Picks the window whose contents should be captured.
+    ///
+    /// A window is eligible when it is on layer 0, is not owned by this process, and is
+    /// at least 120 wide and 120 high. An eligible window owned by the frontmost
+    /// application is preferred; otherwise the first eligible window in the list is used.
+    ///
+    /// - Returns: The title (the owner name when the window's own title is missing or
+    ///   blank), owner name, and window ID of the chosen window, or `nil` if none is eligible.
+    private func getActiveWindowInfo() async -> (title: String, ownerName: String, windowID: CGWindowID)? {
+        let currentPID = ProcessInfo.processInfo.processIdentifier
+        let frontmostPID = await NSWorkspace.shared.frontmostApplication?.processIdentifier
+
+        // Move expensive window list retrieval off the main thread
+        let candidates = await Task.detached(priority: .userInitiated) { () -> [WindowCandidate] in
+            let windowListInfo = CGWindowListCopyWindowInfo([.optionOnScreenOnly], kCGNullWindowID) as? [[String: Any]] ?? []
+
+            return windowListInfo.compactMap { info -> WindowCandidate? in
+                // Entries that lack any of these fields are skipped.
+                guard let windowID = info[kCGWindowNumber as String] as? CGWindowID,
+                      let ownerName = info[kCGWindowOwnerName as String] as? String,
+                      let ownerPIDNumber = info[kCGWindowOwnerPID as String] as? NSNumber,
+                      let layer = info[kCGWindowLayer as String] as? Int32,
+                      let boundsDict = info[kCGWindowBounds as String] as? [String: Any],
+                      let bounds = CGRect(dictionaryRepresentation: boundsDict as CFDictionary) else {
+                    return nil
+                }
+
+                // A missing or blank window title falls back to the owner name.
+                let rawTitle = (info[kCGWindowName as String] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
+                let resolvedTitle = rawTitle.flatMap { $0.isEmpty ? nil : $0 } ?? ownerName
+
+                return WindowCandidate(
+                    title: resolvedTitle,
+                    ownerName: ownerName,
+                    windowID: windowID,
+                    ownerPID: ownerPIDNumber.int32Value,
+                    layer: layer,
+                    bounds: bounds
+                )
             }
+        }.value
 
-            return (title: title, ownerName: ownerName, windowID: windowID)
+        // Smaller windows are assumed to be tooltips or overlays.
+        let minimumDimension: CGFloat = 120
+
+        func isEligible(_ candidate: WindowCandidate) -> Bool {
+            guard candidate.layer == 0 else { return false }
+            guard candidate.ownerPID != currentPID else { return false }
+            return candidate.bounds.width >= minimumDimension && candidate.bounds.height >= minimumDimension
+        }
+
+        if let frontmostPID = frontmostPID,
+           let focusedWindow = candidates.first(where: { isEligible($0) && $0.ownerPID == frontmostPID }) {
+            return (title: focusedWindow.title, ownerName: focusedWindow.ownerName, windowID: focusedWindow.windowID)
+        }
+
+        if let fallbackWindow = candidates.first(where: isEligible) {
+            return (title: fallbackWindow.title, ownerName: fallbackWindow.ownerName, windowID: fallbackWindow.windowID)
         }
 
         return nil
     }
 
     func captureActiveWindow() async -> NSImage? {
-        guard let windowInfo = getActiveWindowInfo() else {
+        guard let windowInfo = await getActiveWindowInfo() else {
             return nil
         }
 
@@ -111,7 +165,7 @@ class ScreenCaptureService: ObservableObject {
             }
         }
 
-        guard let windowInfo = getActiveWindowInfo() else {
+        guard let windowInfo = await getActiveWindowInfo() else {
             logger.notice("📸 No active window found")
             return nil
         }