// Members of `ScreenCaptureService`, reconstructed from the post-patch side of the diff to
// VoiceInk/Services/ScreenCaptureService.swift (829a6be..0f41fc7).
// NOTE(review): the class header and the `logger`, `isCapturing` and `lastCapturedText`
// declarations sit above the first hunk and are not reproduced here. Two lines elided between
// hunks were inferred and are marked with NOTE(review) below — confirm against the real file.

    // MARK: - Retry configuration

    /// Maximum number of capture attempts made by `captureAndExtractText()`.
    private let maxCaptureRetries = 3
    /// Delay between capture attempts, in seconds.
    private let captureRetryDelay: TimeInterval = 0.5

    // MARK: - Window discovery

    /// Finds the frontmost normal-layer window that does not belong to VoiceInk, the Dock or the
    /// menu bar.
    ///
    /// Progressively broader window-list options are tried. Within each list every window is
    /// examined, so a frontmost window that lacks a title or window number no longer causes the
    /// whole list to be abandoned. If nothing qualifies, a relaxed fallback accepts any on-screen,
    /// non-transparent window regardless of layer or title.
    ///
    /// - Returns: The window's title, owning application name and window ID, or `nil` when no
    ///   suitable window exists.
    private func getActiveWindowInfo() -> (title: String, ownerName: String, windowID: CGWindowID)? {
        let optionSets: [CGWindowListOption] = [
            [.optionOnScreenOnly, .excludeDesktopElements],
            [.optionOnScreenOnly],
            []
        ]

        for options in optionSets {
            let windows = CGWindowListCopyWindowInfo(options, kCGNullWindowID) as? [[String: Any]] ?? []

            for info in windows {
                let layer = info[kCGWindowLayer as String] as? Int32 ?? 0
                // Exclude our own app and system UI elements; skip entries missing required keys.
                guard layer == 0,
                      let ownerName = info[kCGWindowOwnerName as String] as? String,
                      ownerName != "VoiceInk",
                      !ownerName.contains("Dock"),
                      !ownerName.contains("Menu Bar"),
                      let windowID = info[kCGWindowNumber as String] as? CGWindowID,
                      let title = info[kCGWindowName as String] as? String else {
                    continue
                }

                return (title: title, ownerName: ownerName, windowID: windowID)
            }
        }

        // Nothing matched the strict criteria: accept any visible window that isn't our own.
        logger.notice("Trying fallback window detection approach")
        let allWindows = CGWindowListCopyWindowInfo(.optionAll, kCGNullWindowID) as? [[String: Any]] ?? []

        for info in allWindows {
            let rawOwnerName = info[kCGWindowOwnerName as String] as? String
            let filterName = rawOwnerName ?? ""
            let alpha = info[kCGWindowAlpha as String] as? Double ?? 0
            let isOnscreen = info[kCGWindowIsOnscreen as String] as? Bool ?? false

            // A missing window number used to become 0 (kCGNullWindowID), which cannot be
            // captured; such entries are skipped, as are windows that are not on screen.
            guard filterName != "VoiceInk",
                  !filterName.contains("Dock"),
                  alpha > 0,
                  isOnscreen,
                  let windowID = info[kCGWindowNumber as String] as? CGWindowID,
                  windowID != kCGNullWindowID else {
                continue
            }

            let ownerName = rawOwnerName ?? "Unknown App"
            let title = info[kCGWindowName as String] as? String ?? "Unknown Window"

            logger.notice("Found fallback window: \(title, privacy: .private) (\(ownerName, privacy: .public))")
            return (title: title, ownerName: ownerName, windowID: windowID)
        }

        logger.notice("❌ No suitable window found for capture")
        return nil
    }

    // MARK: - Image capture

    /// Captures the active window, falling back to a full-screen capture when no window can be
    /// identified or the window-specific capture fails.
    ///
    /// - Returns: The captured image, or `nil` when every capture method failed.
    func captureActiveWindow() -> NSImage? {
        guard let windowInfo = getActiveWindowInfo() else {
            logger.notice("❌ Failed to get window info for capture")
            return captureFullScreen() // Fallback to full screen capture
        }

        return captureImage(ofWindow: windowInfo.windowID)
    }

    /// Captures the window with the given ID, falling back to a full-screen capture on failure.
    private func captureImage(ofWindow windowID: CGWindowID) -> NSImage? {
        // NOTE(review): the window-ID argument line fell between two hunks of the patch; it is
        // inferred from `windowInfo.windowID` being the only window ID in scope.
        guard let cgImage = CGWindowListCreateImage(
            .null,
            .optionIncludingWindow,
            windowID,
            [.boundsIgnoreFraming, .bestResolution]
        ) else {
            logger.notice("⚠️ Window-specific capture failed, trying fallback methods")
            return captureFullScreen() // Fallback to full screen
        }

        logger.notice("✅ Successfully captured window")
        return NSImage(cgImage: cgImage, size: NSSize(width: cgImage.width, height: cgImage.height))
    }

    /// Captures everything on the display that hosts `NSScreen.main`.
    ///
    /// The capture rectangle comes from `CGDisplayBounds` rather than `NSScreen.frame`:
    /// `CGWindowListCreateImage` takes Quartz global display coordinates, and `NSScreen.frame` is
    /// expressed in Cocoa screen coordinates, which differ on any display other than the primary.
    ///
    /// - Returns: The captured image, or `nil` when there is no main screen or the capture failed.
    private func captureFullScreen() -> NSImage? {
        logger.notice("📺 Attempting full screen capture as fallback")

        if let screen = NSScreen.main {
            let screenNumberKey = NSDeviceDescriptionKey("NSScreenNumber")
            let displayID = (screen.deviceDescription[screenNumberKey] as? NSNumber)?.uint32Value
                ?? CGMainDisplayID()

            let cgImage = CGWindowListCreateImage(
                CGDisplayBounds(displayID),
                .optionOnScreenOnly,
                kCGNullWindowID,
                [.bestResolution]
            )

            if let cgImage = cgImage {
                logger.notice("✅ Full screen capture successful")
                return NSImage(cgImage: cgImage, size: NSSize(width: cgImage.width, height: cgImage.height))
            }
        }

        logger.notice("❌ All capture methods failed")
        return nil
    }

    // MARK: - Text recognition

    /// Runs Vision text recognition on `image` and reports the recognised text.
    ///
    /// - Parameters:
    ///   - image: The image to recognise text in.
    ///   - completion: Called exactly once with the recognised lines joined by newlines, or with
    ///     `nil` when the image cannot be converted, recognition fails, or no text is found.
    func extractText(from image: NSImage, completion: @escaping (String?) -> Void) {
        guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            logger.notice("❌ Failed to convert NSImage to CGImage for text extraction")
            completion(nil)
            return
        }

        // Both the request's handler and the `catch` below can report a failure of the same
        // `perform` call. Callers bridge this callback into a checked continuation, which must be
        // resumed exactly once, so later reports are dropped.
        // NOTE(review): assumes the request handler runs before `perform` returns — confirm.
        var hasCompleted = false
        let completeOnce: (String?) -> Void = { text in
            guard !hasCompleted else { return }
            hasCompleted = true
            completion(text)
        }

        let requestHandler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        let request = VNRecognizeTextRequest { request, error in
            if let error = error {
                self.logger.notice("❌ Text recognition error: \(error.localizedDescription, privacy: .public)")
                completeOnce(nil)
                return
            }

            guard let observations = request.results as? [VNRecognizedTextObservation] else {
                self.logger.notice("❌ No text observations found")
                completeOnce(nil)
                return
            }

            // NOTE(review): the opening line of this expression fell between two hunks of the
            // patch; it is inferred from the closure body and the later use of `text`.
            let text = observations.compactMap { observation in
                observation.topCandidates(1).first?.string
            }.joined(separator: "\n")

            if text.isEmpty {
                self.logger.notice("⚠️ Text extraction returned empty result")
                completeOnce(nil)
            } else {
                self.logger.notice("✅ Text extraction successful, found \(text.count, privacy: .public) characters")
                completeOnce(text)
            }
        }

        // Configure the recognition level
        request.recognitionLevel = .accurate
        request.usesLanguageCorrection = true

        do {
            try requestHandler.perform([request])
        } catch {
            logger.notice("❌ Failed to perform text recognition: \(error.localizedDescription, privacy: .public)")
            completeOnce(nil)
        }
    }

    // MARK: - Capture pipeline

    /// Captures the active window and returns its metadata plus recognised on-screen text.
    ///
    /// Only one capture runs at a time; a call made while another is in progress returns `nil`
    /// immediately. Up to `maxCaptureRetries` attempts are made, `captureRetryDelay` seconds
    /// apart. On success `lastCapturedText` is updated on the main actor.
    ///
    /// - Returns: The context text, or `nil` when a capture is already running, the task is
    ///   cancelled while waiting to retry, or every attempt failed.
    func captureAndExtractText() async -> String? {
        // Check-and-set in one main-actor hop so two concurrent callers cannot both pass the
        // guard, and so the flag is set and cleared on the same executor.
        let didStart = await MainActor.run { () -> Bool in
            guard !self.isCapturing else { return false }
            self.isCapturing = true
            return true
        }

        guard didStart else {
            logger.notice("⚠️ Screen capture already in progress, skipping")
            return nil
        }

        logger.notice("🎬 Starting screen capture")
        let capturedText = await captureWithRetries()

        await MainActor.run {
            self.isCapturing = false
        }

        return capturedText
    }

    /// Runs capture attempts until one succeeds, the retry budget is spent, or the task is
    /// cancelled.
    private func captureWithRetries() async -> String? {
        let maxAttempts = maxCaptureRetries
        let retryDelayNanoseconds = UInt64(captureRetryDelay * 1_000_000_000)

        for attempt in 1...maxAttempts {
            logger.notice("🔄 Capture attempt \(attempt, privacy: .public) of \(maxAttempts, privacy: .public)")

            if let contextText = await attemptCapture(attempt) {
                return contextText
            }

            guard attempt < maxAttempts else { break }

            do {
                try await Task.sleep(nanoseconds: retryDelayNanoseconds)
            } catch {
                // The sleep only throws on cancellation; stop instead of burning the remaining
                // attempts back to back.
                logger.notice("⚠️ Screen capture cancelled while waiting to retry")
                return nil
            }
        }

        logger.notice("❌ All capture attempts failed")
        return nil
    }

    /// Performs one capture attempt: find the window, capture it, recognise its text.
    ///
    /// The window found here is the one that gets captured, so the reported title and
    /// application always describe the image the text came from.
    private func attemptCapture(_ attempt: Int) async -> String? {
        guard let windowInfo = getActiveWindowInfo() else {
            logger.notice("❌ Failed to get window info on attempt \(attempt, privacy: .public)")
            return nil
        }

        logger.notice("🎯 Found window: \(windowInfo.title, privacy: .private) (\(windowInfo.ownerName, privacy: .public))")

        guard let capturedImage = captureImage(ofWindow: windowInfo.windowID) else {
            logger.notice("⚠️ Failed to capture window image on attempt \(attempt, privacy: .public)")
            return nil
        }

        let extractedText: String? = await withCheckedContinuation { continuation in
            extractText(from: capturedImage) { text in
                continuation.resume(returning: text)
            }
        }

        guard let windowContent = extractedText else {
            logger.notice("⚠️ Failed to extract text from image on attempt \(attempt, privacy: .public)")
            return nil
        }

        // Window metadata first, then the recognised content.
        let contextText = """
            Active Window: \(windowInfo.title)
            Application: \(windowInfo.ownerName)
            Window Content:
            \(windowContent)
            """

        // Screen contents can include anything the user had open, so only the length is logged
        // publicly; the text itself is marked private.
        logger.notice("✅ Captured \(contextText.count, privacy: .public) characters: \(contextText, privacy: .private)")

        // Ensure lastCapturedText is set on the main thread
        await MainActor.run {
            self.lastCapturedText = contextText
        }

        return contextText
    }