clawdbot-workspace/scripts/captcha_audio.py
2026-01-28 23:00:58 -05:00

374 lines
12 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Audio Captcha Solver
Records system audio via BlackHole and solves with AI.
Supports three modes:
- transcribe: Speech-to-text for letter/number captchas (Whisper)
- identify: Sound classification for "which is the X?" captchas (Gemini)
- describe: List every sound heard in the recording, for debugging (Gemini)
Usage:
python captcha_audio.py # Record 10s, transcribe
python captcha_audio.py --duration 15 # Record 15s
python captcha_audio.py --mode identify --target "stream" # Which sound is a stream?
python captcha_audio.py --mode identify --target "dog barking"
python captcha_audio.py --json # Output as JSON
python captcha_audio.py --list-devices # List audio devices
"""
import subprocess
import argparse
import tempfile
import json
import re
import os
import sys
from pathlib import Path
from datetime import datetime
def list_audio_devices():
    """List available audio input devices via ffmpeg.

    Runs ffmpeg's avfoundation device listing and prints every device entry
    that appears after the "audio devices" header of its stderr output.

    Returns:
        str: The raw, unfiltered stderr text produced by ffmpeg.
    """
    result = subprocess.run(
        ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
        capture_output=True, text=True
    )
    # Parse stderr (ffmpeg outputs device list to stderr)
    output = result.stderr
    print("Available audio devices:")
    print("-" * 40)
    in_audio = False
    for line in output.split('\n'):
        if 'audio devices' in line.lower():
            in_audio = True
            continue
        # Device entries carry a numeric index such as "[0]". Requiring it
        # (the same pattern get_blackhole_device_index uses) keeps ffmpeg's
        # trailing bracketed diagnostics, e.g. "[in#0 @ ...] Error opening
        # input", from being printed as if they were devices.
        if in_audio and re.search(r'\[\d+\]', line):
            print(line.strip())
    return output
def check_blackhole():
    """Check if BlackHole is available.

    Looks for the string "BlackHole" in the output of
    ``system_profiler SPAudioDataType``.

    Returns:
        bool: True if the profiler output mentions BlackHole. False if it
        does not, or if ``system_profiler`` cannot be launched at all.
    """
    try:
        result = subprocess.run(
            ["system_profiler", "SPAudioDataType"],
            capture_output=True, text=True
        )
    except OSError:
        # system_profiler is missing or not executable (e.g. not on macOS).
        # Callers treat this check as a yes/no question, so answer "no"
        # instead of letting a traceback escape.
        return False
    return "BlackHole" in result.stdout
def get_blackhole_device_index():
    """Return the ffmpeg avfoundation index of the BlackHole audio device.

    Scans ffmpeg's device listing (written to stderr) and, once the
    "audio devices" header has been seen, returns the first bracketed number
    on a line mentioning BlackHole.

    Returns:
        str | None: The index as a string of digits (e.g. "1"), or None when
        no matching line is found.
    """
    listing = subprocess.run(
        ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
        capture_output=True, text=True
    ).stderr

    past_audio_header = False
    for entry in listing.split('\n'):
        if 'audio devices' in entry.lower():
            # Header line itself is never inspected for a device name.
            past_audio_header = True
        elif past_audio_header and 'BlackHole' in entry:
            # Device index looks like [0] or [1]
            index = re.search(r'\[(\d+)\]', entry)
            if index:
                return index.group(1)
    return None
def record_audio(duration=10, output_path=None):
    """Record audio from BlackHole.

    Args:
        duration: Recording length in seconds, passed to ffmpeg's ``-t``.
        output_path: Destination WAV path. When None, a fresh temporary
            ``.wav`` file is created and used.

    Returns:
        The path of the recorded file (``output_path`` or the temp file).

    Raises:
        RuntimeError: If no BlackHole device is listed by ffmpeg, or if
            ffmpeg did not produce a non-empty output file.
    """
    # Resolve the device first so a missing device never leaves a stray
    # temporary file behind.
    device_index = get_blackhole_device_index()
    if device_index is None:
        raise RuntimeError("BlackHole device not found. Is it installed and set up?")

    if output_path is None:
        # mkstemp creates the file atomically; tempfile.mktemp is deprecated
        # because another process can claim the name before it is used.
        # ffmpeg overwrites the empty placeholder thanks to "-y".
        fd, output_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
    else:
        # Remove any leftover file from an earlier run so that a failed
        # recording cannot be mistaken for a successful one.
        try:
            os.remove(output_path)
        except FileNotFoundError:
            pass

    # Record using ffmpeg
    cmd = [
        "ffmpeg",
        "-f", "avfoundation",
        "-i", f":{device_index}",  # Audio only, from device index
        "-t", str(duration),
        "-ar", "16000",  # 16kHz for Whisper
        "-ac", "1",  # Mono
        "-y",  # Overwrite
        output_path
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # A non-zero exit status is tolerated as long as audio was written (the
    # original behaviour); an absent or empty file is always a failure.
    recorded = os.path.exists(output_path) and os.path.getsize(output_path) > 0
    if not recorded:
        raise RuntimeError(f"Recording failed: {result.stderr}")
    return output_path
def transcribe_audio(audio_path, model="small"):
    """Transcribe audio file using Whisper.

    Invokes the ``whisper`` command-line tool with English as the language
    and ``txt`` output, then reads the text file it writes next to the audio.

    Args:
        audio_path: Path of the audio file to transcribe.
        model: Whisper model name passed to ``--model``.

    Returns:
        str: The transcription with surrounding whitespace stripped.

    Raises:
        RuntimeError: If no transcript file exists after whisper has run.
    """
    # A bare filename has an empty dirname; fall back to the current
    # directory so whisper is never handed an empty --output_dir.
    output_dir = os.path.dirname(audio_path) or "."
    # Derive "<output_dir>/<audio basename without extension>.txt" from the
    # basename only, so a dot in a directory name cannot confuse the result.
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    txt_path = os.path.join(output_dir, stem + '.txt')

    # Drop a transcript left over from an earlier run; otherwise a failed
    # transcription would silently return the old text.
    try:
        os.remove(txt_path)
    except FileNotFoundError:
        pass

    cmd = [
        "whisper",
        audio_path,
        "--model", model,
        "--language", "en",
        "--output_format", "txt",
        "--output_dir", output_dir
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Success is judged by the transcript file, not by the exit status.
    if not os.path.exists(txt_path):
        raise RuntimeError(f"Transcription failed: {result.stderr}")
    with open(txt_path, 'r', encoding='utf-8') as f:
        return f.read().strip()
def extract_captcha_chars(text):
    """Extract likely captcha characters (letters/numbers only).

    Splits the text on whitespace, drops filler words, and joins every
    remaining ASCII letter and digit into one upper-case string.

    Args:
        text: Raw transcription text.

    Returns:
        str: Upper-case string of the remaining alphanumeric characters.
    """
    # Remove common filler words often in audio captchas
    filler_words = {'the', 'is', 'are', 'a', 'an', 'please', 'enter', 'type', 'following'}
    kept = []
    for word in text.lower().split():
        if word in filler_words:
            continue
        # Transcripts attach punctuation to words ("following:", "the,").
        # Compare the word without leading/trailing punctuation so those
        # fillers are dropped too instead of leaking their letters into the
        # answer. Single-character tokens such as "a," are left alone: they
        # are more likely a spoken captcha letter than the article.
        bare = re.sub(r'^[^a-z0-9]+|[^a-z0-9]+$', '', word)
        if len(bare) > 1 and bare in filler_words:
            continue
        kept.append(word)
    # Extract alphanumeric characters
    chars = re.findall(r'[A-Za-z0-9]', ' '.join(kept))
    return ''.join(chars).upper()
def identify_sound(audio_path, target_sound):
    """
    Ask the gemini CLI which sound in the recording matches the target.
    Meant for captchas of the form "which sound is a stream?".

    Args:
        audio_path: Path of the recorded audio, passed to gemini with ``-f``.
        target_sound: Description of the sound to look for.

    Returns:
        dict with keys "answer" (first run of digits in the reply, or the
        whole reply when it contains none), "raw_response" (the stripped
        reply) and "target_sound".

    Raises:
        RuntimeError: If the gemini command exits with a non-zero status.
    """
    question = f"""You are helping solve an audio captcha. Listen carefully to the audio.
The audio contains multiple sounds played in sequence (usually numbered or separated by pauses).
Question: Which sound is a "{target_sound}"?
Instructions:
1. Listen to each sound segment
2. Identify which one matches "{target_sound}"
3. Reply with ONLY the number (1, 2, 3, etc.) or position of the matching sound
4. If you hear the sounds labeled (like "sound 1", "sound 2"), use those numbers
5. If no labels, count the order they play (first=1, second=2, etc.)
6. Be confident - pick the best match
Reply with just the number, nothing else."""

    # Hand the prompt and the audio file to the gemini CLI
    completed = subprocess.run(
        ["gemini", "-p", question, "-f", audio_path],
        capture_output=True, text=True
    )
    if completed.returncode != 0:
        raise RuntimeError(f"Gemini analysis failed: {completed.stderr}")

    reply = completed.stdout.strip()
    # Keep only the first number in the reply; fall back to the full reply
    first_number = re.search(r'\d+', reply)
    return {
        "answer": first_number.group(0) if first_number else reply,
        "raw_response": reply,
        "target_sound": target_sound
    }
def describe_sounds(audio_path):
    """
    Ask the gemini CLI for a numbered description of every sound in the audio.
    Handy for debugging, or when the target sound is not known in advance.

    Args:
        audio_path: Path of the recorded audio, passed to gemini with ``-f``.

    Returns:
        str: gemini's reply with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the gemini command exits with a non-zero status.
    """
    request = """Listen to this audio captcha and describe each sound you hear.
For each distinct sound or segment, tell me:
1. What number/position it is (first, second, third OR 1, 2, 3)
2. What the sound is (water, dog, traffic, bird, etc.)
Format your response as a simple list like:
1: water/stream flowing
2: dog barking
3: traffic/cars
Be specific and confident."""

    completed = subprocess.run(
        ["gemini", "-p", request, "-f", audio_path],
        capture_output=True, text=True
    )
    if completed.returncode == 0:
        return completed.stdout.strip()
    raise RuntimeError(f"Gemini analysis failed: {completed.stderr}")
def _copy_to_clipboard(text):
    """Best-effort copy of *text* to the clipboard via ``pbcopy``; return True on success."""
    try:
        subprocess.run(["pbcopy"], input=text.encode(), check=True)
    except (OSError, subprocess.CalledProcessError):
        return False
    return True


def solve_captcha(duration=10, model="small", as_json=False, mode="transcribe", target=None):
    """
    Main function to record and solve audio captcha.

    Modes:
        transcribe: Use Whisper for speech-to-text (letters/numbers)
        identify: Use Gemini to identify sounds ("which is the stream?")
        describe: Use Gemini to describe all sounds (debugging)

    Args:
        duration: Recording length in seconds.
        model: Whisper model name (transcribe mode only).
        as_json: When True, suppress progress output and return JSON text.
        mode: One of "transcribe", "identify", "describe".
        target: Sound to look for; required in identify mode.

    Returns:
        With ``as_json=True``: a JSON string describing the result or the
        error. Otherwise: the result dict on success.

    Side effects:
        With ``as_json=False`` progress is printed, the answer is copied to
        the clipboard in transcribe/identify mode, and any failure prints an
        error and exits the process with status 1.
    """
    if not check_blackhole():
        error = {
            "success": False,
            "error": "BlackHole not detected",
            "setup_instructions": [
                "brew install blackhole-2ch",
                "Reboot your Mac",
                "Open 'Audio MIDI Setup'",
                "Create Multi-Output Device (speakers + BlackHole)",
                "Set Multi-Output as system output"
            ]
        }
        if as_json:
            return json.dumps(error, indent=2)
        print("❌ BlackHole not detected!")
        print("\nSetup instructions:")
        for step in error["setup_instructions"]:
            print(f"{step}")
        sys.exit(1)

    mode_emoji = {"transcribe": "🎤", "identify": "🔊", "describe": "🔍"}

    try:
        # Validate the request before recording, so a bad mode or a missing
        # target fails immediately rather than after the full recording.
        if mode not in mode_emoji:
            raise ValueError(f"Unknown mode: {mode}")
        if mode == "identify" and not target:
            raise ValueError("--target required for identify mode (e.g., --target 'stream')")

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        work_dir = Path(tempfile.gettempdir()) / "captcha-audio"
        work_dir.mkdir(exist_ok=True)
        audio_path = str(work_dir / f"captcha_{timestamp}.wav")

        if not as_json:
            print(f"{mode_emoji[mode]} Mode: {mode}")
            print(f"⏱️ Recording for {duration} seconds...")
            print("▶️ Play the audio captcha NOW!")

        # Record
        record_audio(duration, audio_path)
        if not as_json:
            print("✅ Recording complete")

        # Process based on mode
        if mode == "transcribe":
            if not as_json:
                print("🧠 Transcribing with Whisper...")
            raw_text = transcribe_audio(audio_path, model)
            extracted = extract_captcha_chars(raw_text)
            result = {
                "success": True,
                "mode": "transcribe",
                "raw_text": raw_text,
                "extracted_chars": extracted,
                "answer": extracted,  # Primary answer
                "audio_file": audio_path
            }
            if not as_json:
                print("\n" + "=" * 40)
                print(f"📝 Raw transcription: {raw_text}")
                print(f"🔤 Extracted chars: {extracted}")
                print("=" * 40)
                # The clipboard is a convenience: failing to copy must not
                # turn an already-solved captcha into an error.
                if _copy_to_clipboard(extracted):
                    print("📋 Copied to clipboard!")
                else:
                    print("⚠️ Could not copy to clipboard")

        elif mode == "identify":
            if not as_json:
                print(f"🧠 Asking Gemini: which sound is '{target}'?")
            id_result = identify_sound(audio_path, target)
            result = {
                "success": True,
                "mode": "identify",
                "target_sound": target,
                "answer": id_result["answer"],
                "raw_response": id_result["raw_response"],
                "audio_file": audio_path
            }
            if not as_json:
                print("\n" + "=" * 40)
                print(f"🎯 Target: {target}")
                print(f"✅ Answer: {id_result['answer']}")
                if id_result['raw_response'] != id_result['answer']:
                    print(f"📝 Full response: {id_result['raw_response']}")
                print("=" * 40)
                if _copy_to_clipboard(id_result["answer"]):
                    print("📋 Copied to clipboard!")
                else:
                    print("⚠️ Could not copy to clipboard")

        else:  # mode == "describe" (validated above)
            if not as_json:
                print("🧠 Asking Gemini to describe all sounds...")
            description = describe_sounds(audio_path)
            result = {
                "success": True,
                "mode": "describe",
                "description": description,
                "audio_file": audio_path
            }
            if not as_json:
                print("\n" + "=" * 40)
                print("🔊 Sounds detected:")
                print(description)
                print("=" * 40)

        if as_json:
            return json.dumps(result, indent=2)
        return result

    except Exception as e:
        # Top-level boundary: report every failure in the caller's chosen
        # format instead of letting a traceback escape.
        error = {"success": False, "error": str(e)}
        if as_json:
            return json.dumps(error, indent=2)
        print(f"❌ Error: {e}")
        sys.exit(1)
def main():
    """Command-line entry point: parse options, then list devices or solve a captcha."""
    cli = argparse.ArgumentParser(description="Audio Captcha Solver")

    # (flags, keyword settings) for each option, in help-output order
    option_specs = [
        (("--duration", "-d"),
         {"type": int, "default": 10, "help": "Recording duration in seconds"}),
        (("--model", "-m"),
         {"default": "small",
          "help": "Whisper model for transcribe mode (tiny/base/small/medium/large)"}),
        (("--mode",),
         {"choices": ["transcribe", "identify", "describe"], "default": "transcribe",
          "help": "Mode: transcribe (speech-to-text), identify (which sound is X?), describe (list all sounds)"}),
        (("--target", "-t"),
         {"help": "For identify mode: the sound to find (e.g., 'stream', 'dog barking')"}),
        (("--json", "-j"),
         {"action": "store_true", "help": "Output as JSON"}),
        (("--list-devices", "-l"),
         {"action": "store_true", "help": "List audio devices"}),
    ]
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    opts = cli.parse_args()

    if opts.list_devices:
        list_audio_devices()
    else:
        outcome = solve_captcha(
            duration=opts.duration,
            model=opts.model,
            as_json=opts.json,
            mode=opts.mode,
            target=opts.target
        )
        # In JSON mode solve_captcha returns the JSON text instead of printing it
        if opts.json:
            print(outcome)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()