#!/usr/bin/env python3
"""
Audio Captcha Solver

Records system audio via BlackHole and solves with AI.

Supports three modes:
- transcribe: Speech-to-text for letter/number captchas (Whisper)
- identify: Sound classification for "which is the X?" captchas (Gemini)
- describe: List every sound heard in the recording (Gemini, for debugging)

Usage:
    python captcha_audio.py                                    # Record 10s, transcribe
    python captcha_audio.py --duration 15                      # Record 15s
    python captcha_audio.py --mode identify --target "stream"  # Which sound is a stream?
    python captcha_audio.py --mode identify --target "dog barking"
    python captcha_audio.py --mode describe                    # Describe all sounds
    python captcha_audio.py --json                             # Output as JSON
    python captcha_audio.py --list-devices                     # List audio devices
"""

import argparse
import json
import os
import re
import string
import subprocess
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Iterator, Optional

# ffmpeg invocation that makes avfoundation print its device table.
_FFMPEG_LIST_DEVICES_CMD = [
    "ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", "",
]

# Words commonly spoken around the actual captcha characters.
_FILLER_WORDS = frozenset(
    ["the", "is", "are", "a", "an", "please", "enter", "type", "following"]
)

_VALID_MODES = ("transcribe", "identify", "describe")


def _ffmpeg_device_listing() -> str:
    """Return ffmpeg's avfoundation device listing (ffmpeg prints it on stderr)."""
    result = subprocess.run(
        _FFMPEG_LIST_DEVICES_CMD, capture_output=True, text=True
    )
    return result.stderr


def _audio_device_lines(listing: str) -> Iterator[str]:
    """Yield the lines that follow the 'audio devices' header in *listing*."""
    in_audio = False
    for line in listing.split("\n"):
        if "audio devices" in line.lower():
            in_audio = True
            continue
        if in_audio:
            yield line


def list_audio_devices() -> str:
    """Print the available audio input devices and return ffmpeg's raw listing."""
    output = _ffmpeg_device_listing()

    print("Available audio devices:")
    print("-" * 40)
    for line in _audio_device_lines(output):
        # Device rows carry a bracketed index such as "[0]".
        if "[" in line:
            print(line.strip())

    return output


def check_blackhole() -> bool:
    """Return True if ``system_profiler`` reports a BlackHole audio device.

    Returns False when ``system_profiler`` cannot be run at all (for example
    on a non-macOS host) instead of letting the OSError escape.
    """
    try:
        result = subprocess.run(
            ["system_profiler", "SPAudioDataType"],
            capture_output=True, text=True
        )
    except OSError:
        return False
    return "BlackHole" in result.stdout


def get_blackhole_device_index() -> Optional[str]:
    """Return the ffmpeg audio device index of BlackHole as a string, or None.

    The first audio device line mentioning "BlackHole" that also carries a
    numeric ``[N]`` index wins.
    """
    for line in _audio_device_lines(_ffmpeg_device_listing()):
        if "BlackHole" not in line:
            continue
        # Extract device index like [0] or [1]
        match = re.search(r"\[(\d+)\]", line)
        if match:
            return match.group(1)
    return None


def record_audio(duration=10, output_path=None):
    """Record audio from BlackHole into a 16 kHz mono WAV file.

    Args:
        duration: Recording length passed to ffmpeg's ``-t`` option.
        output_path: Destination file. When None, a temporary ``.wav`` file
            is created and its path returned.

    Returns:
        The path of the recorded file.

    Raises:
        RuntimeError: If no BlackHole device is found, or ffmpeg fails
            without leaving any audio data behind.
    """
    device_index = get_blackhole_device_index()
    if device_index is None:
        raise RuntimeError("BlackHole device not found. Is it installed and set up?")

    if output_path is None:
        # mkstemp creates the file atomically; the deprecated mktemp only
        # returned a name, leaving a window for another process to claim it.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

    # Record using ffmpeg
    cmd = [
        "ffmpeg",
        "-f", "avfoundation",
        "-i", f":{device_index}",  # Audio only, from device index
        "-t", str(duration),
        "-ar", "16000",  # 16kHz for Whisper
        "-ac", "1",  # Mono
        "-y",  # Overwrite
        output_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # A non-zero exit is tolerated only if ffmpeg still produced audio data.
    # The size check matters because mkstemp pre-creates an empty file.
    has_audio = os.path.exists(output_path) and os.path.getsize(output_path) > 0
    if result.returncode != 0 and not has_audio:
        raise RuntimeError(f"Recording failed: {result.stderr}")

    return output_path


def transcribe_audio(audio_path, model="small"):
    """Transcribe an audio file with the ``whisper`` CLI and return the text.

    Args:
        audio_path: Path of the audio file to transcribe.
        model: Whisper model name passed to ``--model``.

    Returns:
        The stripped contents of the ``.txt`` file whisper writes next to
        the audio file.

    Raises:
        RuntimeError: If no transcript file was produced.
    """
    # dirname is "" for a bare filename; whisper needs a usable directory.
    output_dir = os.path.dirname(audio_path) or "."

    cmd = [
        "whisper", audio_path,
        "--model", model,
        "--language", "en",
        "--output_format", "txt",
        "--output_dir", output_dir,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Read output file: <output_dir>/<audio file name without extension>.txt
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    txt_path = os.path.join(output_dir, stem + ".txt")
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        raise RuntimeError(f"Transcription failed: {result.stderr}") from None


def extract_captcha_chars(text):
    """Extract likely captcha characters (letters/numbers only), upper-cased.

    Filler words often spoken in audio captchas are dropped. Tokens are
    compared with surrounding punctuation removed, so "following:" counts
    as filler. The word "a" is dropped only when the next token is a
    multi-letter word (article usage, "type a code"); otherwise it is kept
    as the letter A ("A, B, 3" -> "AB3").
    """
    tokens = [tok.strip(string.punctuation) for tok in text.lower().split()]

    kept = []
    for pos, tok in enumerate(tokens):
        if tok == "a":
            following = tokens[pos + 1] if pos + 1 < len(tokens) else ""
            if len(following) > 1 and following.isalpha():
                continue
        elif tok in _FILLER_WORDS:
            continue
        kept.append(tok)

    # Extract alphanumeric characters
    chars = re.findall(r"[A-Za-z0-9]", " ".join(kept))
    return "".join(chars).upper()


def _run_gemini(prompt: str, audio_path: str) -> str:
    """Run the ``gemini`` CLI on *audio_path* with *prompt*; return stripped stdout."""
    cmd = [
        "gemini",
        "-p", prompt,
        "-f", audio_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Gemini analysis failed: {result.stderr}")
    return result.stdout.strip()


def identify_sound(audio_path, target_sound):
    """
    Use Gemini to identify which sound matches the target.
    For captchas like "which sound is a stream?"

    Returns:
        A dict with "answer" (the first number found in the reply, or the
        whole reply when it contains no digits), "raw_response" and
        "target_sound".

    Raises:
        RuntimeError: If the gemini command exits with a non-zero status.
    """
    prompt = f"""You are helping solve an audio captcha. Listen carefully to the audio.

The audio contains multiple sounds played in sequence (usually numbered or separated by pauses).

Question: Which sound is a "{target_sound}"?

Instructions:
1. Listen to each sound segment
2. Identify which one matches "{target_sound}"
3. Reply with ONLY the number (1, 2, 3, etc.) or position of the matching sound
4. If you hear the sounds labeled (like "sound 1", "sound 2"), use those numbers
5. If no labels, count the order they play (first=1, second=2, etc.)
6. Be confident - pick the best match

Reply with just the number, nothing else."""

    # Use gemini CLI to analyze audio
    response = _run_gemini(prompt, audio_path)

    # Extract just the number from response
    numbers = re.findall(r"\d+", response)
    answer = numbers[0] if numbers else response

    return {
        "answer": answer,
        "raw_response": response,
        "target_sound": target_sound,
    }


def describe_sounds(audio_path):
    """
    Use Gemini to describe all sounds heard in the audio.
    Useful for debugging or when you don't know the target.

    Raises:
        RuntimeError: If the gemini command exits with a non-zero status.
    """
    prompt = """Listen to this audio captcha and describe each sound you hear.

For each distinct sound or segment, tell me:
1. What number/position it is (first, second, third OR 1, 2, 3)
2. What the sound is (water, dog, traffic, bird, etc.)

Format your response as a simple list like:
1: water/stream flowing
2: dog barking
3: traffic/cars

Be specific and confident."""

    return _run_gemini(prompt, audio_path)


def _validate_request(duration, mode, target) -> None:
    """Raise ValueError for requests that can never succeed."""
    if mode not in _VALID_MODES:
        raise ValueError(f"Unknown mode: {mode}")
    if mode == "identify" and not target:
        raise ValueError("--target required for identify mode (e.g., --target 'stream')")
    # Only numeric durations are checked; other values go to ffmpeg untouched.
    if isinstance(duration, (int, float)) and duration <= 0:
        raise ValueError(
            f"Duration must be a positive number of seconds (got {duration})"
        )


def _report_failure(exc, as_json):
    """Return a JSON error document, or print the error and exit with status 1."""
    error = {"success": False, "error": str(exc)}
    if as_json:
        return json.dumps(error, indent=2)
    print(f"❌ Error: {exc}")
    sys.exit(1)


def _copy_to_clipboard(text: str) -> None:
    """Copy *text* to the clipboard via pbcopy; warn instead of failing."""
    try:
        subprocess.run(["pbcopy"], input=text.encode(), check=True)
    except (OSError, subprocess.CalledProcessError) as exc:
        # The answer is already computed; a clipboard problem must not discard it.
        print(f"⚠️ Could not copy to clipboard: {exc}")
        return
    print("📋 Copied to clipboard!")


def _solve_transcribe(audio_path, model, verbose):
    """Run Whisper on the recording and build the transcribe-mode result."""
    if verbose:
        print("🧠 Transcribing with Whisper...")

    raw_text = transcribe_audio(audio_path, model)
    extracted = extract_captcha_chars(raw_text)

    result = {
        "success": True,
        "mode": "transcribe",
        "raw_text": raw_text,
        "extracted_chars": extracted,
        "answer": extracted,  # Primary answer
        "audio_file": audio_path,
    }

    if verbose:
        print("\n" + "=" * 40)
        print(f"📝 Raw transcription: {raw_text}")
        print(f"🔤 Extracted chars: {extracted}")
        print("=" * 40)
        _copy_to_clipboard(extracted)

    return result


def _solve_identify(audio_path, target, verbose):
    """Ask Gemini which sound matches *target* and build the identify-mode result."""
    if verbose:
        print(f"🧠 Asking Gemini: which sound is '{target}'?")

    id_result = identify_sound(audio_path, target)

    result = {
        "success": True,
        "mode": "identify",
        "target_sound": target,
        "answer": id_result["answer"],
        "raw_response": id_result["raw_response"],
        "audio_file": audio_path,
    }

    if verbose:
        print("\n" + "=" * 40)
        print(f"🎯 Target: {target}")
        print(f"✅ Answer: {id_result['answer']}")
        if id_result["raw_response"] != id_result["answer"]:
            print(f"📝 Full response: {id_result['raw_response']}")
        print("=" * 40)
        _copy_to_clipboard(id_result["answer"])

    return result


def _solve_describe(audio_path, verbose):
    """Ask Gemini to describe every sound and build the describe-mode result."""
    if verbose:
        print("🧠 Asking Gemini to describe all sounds...")

    description = describe_sounds(audio_path)

    result = {
        "success": True,
        "mode": "describe",
        "description": description,
        "audio_file": audio_path,
    }

    if verbose:
        print("\n" + "=" * 40)
        print("🔊 Sounds detected:")
        print(description)
        print("=" * 40)

    return result


def solve_captcha(duration=10, model="small", as_json=False, mode="transcribe", target=None):
    """
    Main function to record and solve audio captcha.

    Modes:
        transcribe: Use Whisper for speech-to-text (letters/numbers)
        identify: Use Gemini to identify sounds ("which is the stream?")
        describe: Use Gemini to describe all sounds (debugging)

    Args:
        duration: Seconds of system audio to record.
        model: Whisper model name (transcribe mode only).
        as_json: When True, suppress progress output and return a JSON
            string; when False, print progress and return a dict.
        mode: One of "transcribe", "identify", "describe".
        target: Sound to look for (required in identify mode).

    Returns:
        With ``as_json`` a JSON string (also for failures, with
        ``"success": false``); otherwise the result dict. Without
        ``as_json`` any failure prints a message and exits with status 1.
    """
    # Reject impossible requests before spending `duration` seconds recording.
    try:
        _validate_request(duration, mode, target)
    except ValueError as exc:
        return _report_failure(exc, as_json)

    if not check_blackhole():
        error = {
            "success": False,
            "error": "BlackHole not detected",
            "setup_instructions": [
                "brew install blackhole-2ch",
                "Reboot your Mac",
                "Open 'Audio MIDI Setup'",
                "Create Multi-Output Device (speakers + BlackHole)",
                "Set Multi-Output as system output",
            ],
        }
        if as_json:
            return json.dumps(error, indent=2)
        print("❌ BlackHole not detected!")
        print("\nSetup instructions:")
        for step in error["setup_instructions"]:
            print(f" • {step}")
        sys.exit(1)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    work_dir = Path(tempfile.gettempdir()) / "captcha-audio"
    work_dir.mkdir(exist_ok=True)
    audio_path = str(work_dir / f"captcha_{timestamp}.wav")

    verbose = not as_json
    mode_emoji = {"transcribe": "🎤", "identify": "🔊", "describe": "🔍"}

    if verbose:
        print(f"{mode_emoji.get(mode, '🎤')} Mode: {mode}")
        print(f"⏱️ Recording for {duration} seconds...")
        print("▶️ Play the audio captcha NOW!")

    # Top-level boundary: every failure becomes a JSON error or a clean exit.
    try:
        # Record
        record_audio(duration, audio_path)

        if verbose:
            print("✅ Recording complete")

        # Process based on mode
        if mode == "transcribe":
            result = _solve_transcribe(audio_path, model, verbose)
        elif mode == "identify":
            result = _solve_identify(audio_path, target, verbose)
        else:  # "describe"; other values were rejected by _validate_request
            result = _solve_describe(audio_path, verbose)

        if as_json:
            return json.dumps(result, indent=2)
        return result

    except Exception as e:
        return _report_failure(e, as_json)


def main():
    """Parse command-line arguments and run the requested action."""
    parser = argparse.ArgumentParser(description="Audio Captcha Solver")
    parser.add_argument("--duration", "-d", type=int, default=10,
                        help="Recording duration in seconds")
    parser.add_argument("--model", "-m", default="small",
                        help="Whisper model for transcribe mode (tiny/base/small/medium/large)")
    parser.add_argument("--mode", choices=["transcribe", "identify", "describe"],
                        default="transcribe",
                        help="Mode: transcribe (speech-to-text), identify (which sound is X?), describe (list all sounds)")
    parser.add_argument("--target", "-t",
                        help="For identify mode: the sound to find (e.g., 'stream', 'dog barking')")
    parser.add_argument("--json", "-j", action="store_true",
                        help="Output as JSON")
    parser.add_argument("--list-devices", "-l", action="store_true",
                        help="List audio devices")

    args = parser.parse_args()

    if args.list_devices:
        list_audio_devices()
        return

    result = solve_captcha(
        duration=args.duration,
        model=args.model,
        as_json=args.json,
        mode=args.mode,
        target=args.target,
    )

    # In JSON mode solve_captcha returns the document instead of printing it.
    if args.json:
        print(result)


if __name__ == "__main__":
    main()