#!/usr/bin/env python3
|
|
"""
|
|
Audio Captcha Solver
|
|
|
|
Records system audio via BlackHole and solves with AI.
|
|
Supports two modes:
|
|
- transcribe: Speech-to-text for letter/number captchas (Whisper)
|
|
- identify: Sound classification for "which is the X?" captchas (Gemini)
|
|
|
|
Usage:
|
|
python captcha_audio.py # Record 10s, transcribe
|
|
python captcha_audio.py --duration 15 # Record 15s
|
|
python captcha_audio.py --mode identify --target "stream" # Which sound is a stream?
|
|
python captcha_audio.py --mode identify --target "dog barking"
|
|
python captcha_audio.py --json # Output as JSON
|
|
python captcha_audio.py --list-devices # List audio devices
|
|
"""
|
|
|
|
import subprocess
|
|
import argparse
|
|
import tempfile
|
|
import json
|
|
import re
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
def list_audio_devices():
    """Print ffmpeg's avfoundation audio device table and return the raw listing."""
    proc = subprocess.run(
        ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
        capture_output=True,
        text=True,
    )
    # ffmpeg emits the device listing on stderr, not stdout.
    listing = proc.stderr

    print("Available audio devices:")
    print("-" * 40)
    audio_section = False
    for line in listing.split('\n'):
        if 'audio devices' in line.lower():
            # Header line that opens the audio section; don't print it.
            audio_section = True
        elif audio_section and '[' in line:
            print(line.strip())
    return listing
|
|
|
|
def check_blackhole():
    """Return True when a BlackHole device appears in the system audio profile."""
    profile = subprocess.run(
        ["system_profiler", "SPAudioDataType"],
        capture_output=True,
        text=True,
    )
    return "BlackHole" in profile.stdout
|
|
|
|
def _parse_blackhole_index(device_listing):
    """Extract BlackHole's device index (as a string) from an avfoundation
    device listing, or return None when BlackHole is not listed."""
    in_audio_section = False
    for line in device_listing.split('\n'):
        if 'audio devices' in line.lower():
            in_audio_section = True
            continue
        if in_audio_section and 'BlackHole' in line:
            # Device lines carry an index like "[0]" or "[1]".
            match = re.search(r'\[(\d+)\]', line)
            if match:
                return match.group(1)
    return None


def get_blackhole_device_index():
    """Find the avfoundation device index ffmpeg assigns to BlackHole.

    Returns:
        The index as a string (e.g. "1"), or None if BlackHole is not found.
    """
    result = subprocess.run(
        ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
        capture_output=True,
        text=True,
    )
    # ffmpeg writes the device listing to stderr, not stdout.
    return _parse_blackhole_index(result.stderr)
|
|
|
|
def record_audio(duration=10, output_path=None):
    """Record mono 16 kHz audio from the BlackHole loopback device.

    Args:
        duration: Recording length in seconds.
        output_path: Destination WAV path; a temporary file is created
            when omitted.

    Returns:
        Path of the recorded WAV file.

    Raises:
        RuntimeError: If BlackHole is not found, or ffmpeg failed and
            produced no audio data.
    """
    if output_path is None:
        # tempfile.mktemp is deprecated and race-prone (it only returns a
        # name); mkstemp atomically creates the file instead.
        fd, output_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)  # ffmpeg reopens the path itself; -y lets it overwrite

    device_index = get_blackhole_device_index()
    if device_index is None:
        raise RuntimeError("BlackHole device not found. Is it installed and set up?")

    # Record using ffmpeg
    cmd = [
        "ffmpeg",
        "-f", "avfoundation",
        "-i", f":{device_index}",  # Audio only, from device index
        "-t", str(duration),
        "-ar", "16000",  # 16kHz for Whisper
        "-ac", "1",      # Mono
        "-y",            # Overwrite
        output_path
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    # mkstemp pre-creates the file, so existence alone no longer proves
    # success — treat a nonzero exit as failure unless ffmpeg actually
    # wrote audio data.
    produced_output = os.path.exists(output_path) and os.path.getsize(output_path) > 0
    if result.returncode != 0 and not produced_output:
        raise RuntimeError(f"Recording failed: {result.stderr}")

    return output_path
|
|
|
|
def transcribe_audio(audio_path, model="small"):
    """Run the whisper CLI on audio_path and return the stripped transcript.

    Raises RuntimeError when whisper produced no .txt output file.
    """
    out_dir = os.path.dirname(audio_path)

    proc = subprocess.run(
        [
            "whisper",
            audio_path,
            "--model", model,
            "--language", "en",
            "--output_format", "txt",
            "--output_dir", out_dir,
        ],
        capture_output=True,
        text=True,
    )

    # Whisper writes <basename>.txt next to the recording.
    transcript_path = audio_path.rsplit('.', 1)[0] + '.txt'
    if not os.path.exists(transcript_path):
        raise RuntimeError(f"Transcription failed: {proc.stderr}")
    with open(transcript_path, 'r') as fh:
        return fh.read().strip()
|
|
|
|
def extract_captcha_chars(text):
    """Pull the likely captcha characters (letters/digits) from a transcription,
    uppercased and concatenated."""
    # Words that commonly pad spoken captchas ("please enter the following...").
    # NOTE(review): this also drops a spelled-out letter "a" — confirm acceptable.
    skip = {'the', 'is', 'are', 'a', 'an', 'please', 'enter', 'type', 'following'}
    kept = [word for word in text.lower().split() if word not in skip]
    return ''.join(re.findall(r'[A-Za-z0-9]', ' '.join(kept))).upper()
|
|
|
|
def identify_sound(audio_path, target_sound):
    """Ask Gemini which numbered sound in the recording matches target_sound.

    For captchas like "which sound is a stream?". Returns a dict with
    'answer' (the first number in the reply, or the raw reply when none),
    'raw_response', and 'target_sound'.
    """
    prompt = f"""You are helping solve an audio captcha. Listen carefully to the audio.
The audio contains multiple sounds played in sequence (usually numbered or separated by pauses).

Question: Which sound is a "{target_sound}"?

Instructions:
1. Listen to each sound segment
2. Identify which one matches "{target_sound}"
3. Reply with ONLY the number (1, 2, 3, etc.) or position of the matching sound
4. If you hear the sounds labeled (like "sound 1", "sound 2"), use those numbers
5. If no labels, count the order they play (first=1, second=2, etc.)
6. Be confident - pick the best match

Reply with just the number, nothing else."""

    # Hand the audio file to the gemini CLI together with the prompt.
    analysis = subprocess.run(
        ["gemini", "-p", prompt, "-f", audio_path],
        capture_output=True,
        text=True,
    )
    if analysis.returncode != 0:
        raise RuntimeError(f"Gemini analysis failed: {analysis.stderr}")

    reply = analysis.stdout.strip()

    # The model was told to answer with a bare number; tolerate extra prose
    # by pulling out the first number we find.
    found = re.findall(r'\d+', reply)
    return {
        "answer": found[0] if found else reply,
        "raw_response": reply,
        "target_sound": target_sound,
    }
|
|
|
|
def describe_sounds(audio_path):
    """Ask Gemini to describe every sound heard in the recording.

    Useful for debugging or when the target sound is unknown. Returns the
    stripped text reply.
    """
    prompt = """Listen to this audio captcha and describe each sound you hear.

For each distinct sound or segment, tell me:
1. What number/position it is (first, second, third OR 1, 2, 3)
2. What the sound is (water, dog, traffic, bird, etc.)

Format your response as a simple list like:
1: water/stream flowing
2: dog barking
3: traffic/cars

Be specific and confident."""

    analysis = subprocess.run(
        ["gemini", "-p", prompt, "-f", audio_path],
        capture_output=True,
        text=True,
    )
    if analysis.returncode != 0:
        raise RuntimeError(f"Gemini analysis failed: {analysis.stderr}")
    return analysis.stdout.strip()
|
|
|
|
def solve_captcha(duration=10, model="small", as_json=False, mode="transcribe", target=None):
    """
    Main function to record and solve audio captcha.

    Modes:
        transcribe: Use Whisper for speech-to-text (letters/numbers)
        identify: Use Gemini to identify sounds ("which is the stream?")
        describe: Use Gemini to describe all sounds (debugging)

    Args:
        duration: Recording length in seconds.
        model: Whisper model name (transcribe mode only).
        as_json: When True, return a JSON string instead of a dict and
            suppress all progress printing.
        target: The sound to find (required for identify mode).

    Returns:
        A result dict (or JSON string when as_json) with "success" plus
        mode-specific fields. In non-JSON mode, errors call sys.exit(1).
    """
    # Recording is impossible without the BlackHole loopback device.
    if not check_blackhole():
        error = {
            "success": False,
            "error": "BlackHole not detected",
            "setup_instructions": [
                "brew install blackhole-2ch",
                "Reboot your Mac",
                "Open 'Audio MIDI Setup'",
                "Create Multi-Output Device (speakers + BlackHole)",
                "Set Multi-Output as system output"
            ]
        }
        if as_json:
            return json.dumps(error, indent=2)
        else:
            print("❌ BlackHole not detected!")
            print("\nSetup instructions:")
            for step in error["setup_instructions"]:
                print(f" • {step}")
            sys.exit(1)

    # Record into a timestamped file under a dedicated temp subdirectory.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    work_dir = Path(tempfile.gettempdir()) / "captcha-audio"
    work_dir.mkdir(exist_ok=True)
    audio_path = str(work_dir / f"captcha_{timestamp}.wav")

    mode_emoji = {"transcribe": "🎤", "identify": "🔊", "describe": "🔍"}
    if not as_json:
        print(f"{mode_emoji.get(mode, '🎤')} Mode: {mode}")
        print(f"⏱️ Recording for {duration} seconds...")
        print("▶️ Play the audio captcha NOW!")

    try:
        # Record
        record_audio(duration, audio_path)

        if not as_json:
            print("✅ Recording complete")

        # Process based on mode
        if mode == "transcribe":
            if not as_json:
                print("🧠 Transcribing with Whisper...")

            raw_text = transcribe_audio(audio_path, model)
            extracted = extract_captcha_chars(raw_text)

            result = {
                "success": True,
                "mode": "transcribe",
                "raw_text": raw_text,
                "extracted_chars": extracted,
                "answer": extracted,  # Primary answer
                "audio_file": audio_path
            }

            if not as_json:
                print("\n" + "=" * 40)
                print(f"📝 Raw transcription: {raw_text}")
                print(f"🔤 Extracted chars: {extracted}")
                print("=" * 40)
                # Copy the answer to the macOS clipboard for easy pasting.
                subprocess.run(["pbcopy"], input=extracted.encode(), check=True)
                print("📋 Copied to clipboard!")

        elif mode == "identify":
            if not target:
                raise ValueError("--target required for identify mode (e.g., --target 'stream')")

            if not as_json:
                print(f"🧠 Asking Gemini: which sound is '{target}'?")

            id_result = identify_sound(audio_path, target)

            result = {
                "success": True,
                "mode": "identify",
                "target_sound": target,
                "answer": id_result["answer"],
                "raw_response": id_result["raw_response"],
                "audio_file": audio_path
            }

            if not as_json:
                print("\n" + "=" * 40)
                print(f"🎯 Target: {target}")
                print(f"✅ Answer: {id_result['answer']}")
                # Show the full model reply only when a number was extracted
                # from a longer response.
                if id_result['raw_response'] != id_result['answer']:
                    print(f"📝 Full response: {id_result['raw_response']}")
                print("=" * 40)
                subprocess.run(["pbcopy"], input=id_result["answer"].encode(), check=True)
                print("📋 Copied to clipboard!")

        elif mode == "describe":
            if not as_json:
                print("🧠 Asking Gemini to describe all sounds...")

            description = describe_sounds(audio_path)

            result = {
                "success": True,
                "mode": "describe",
                "description": description,
                "audio_file": audio_path
            }

            if not as_json:
                print("\n" + "=" * 40)
                print("🔊 Sounds detected:")
                print(description)
                print("=" * 40)

        else:
            raise ValueError(f"Unknown mode: {mode}")

        if as_json:
            return json.dumps(result, indent=2)
        return result

    except Exception as e:
        # In JSON mode, report failures as data; otherwise print and exit.
        error = {"success": False, "error": str(e)}
        if as_json:
            return json.dumps(error, indent=2)
        else:
            print(f"❌ Error: {e}")
            sys.exit(1)
|
|
|
|
def main():
    """Parse command-line arguments and run the requested action."""
    parser = argparse.ArgumentParser(description="Audio Captcha Solver")
    parser.add_argument(
        "--duration", "-d", type=int, default=10,
        help="Recording duration in seconds",
    )
    parser.add_argument(
        "--model", "-m", default="small",
        help="Whisper model for transcribe mode (tiny/base/small/medium/large)",
    )
    parser.add_argument(
        "--mode", choices=["transcribe", "identify", "describe"],
        default="transcribe",
        help="Mode: transcribe (speech-to-text), identify (which sound is X?), describe (list all sounds)",
    )
    parser.add_argument(
        "--target", "-t",
        help="For identify mode: the sound to find (e.g., 'stream', 'dog barking')",
    )
    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
    parser.add_argument("--list-devices", "-l", action="store_true", help="List audio devices")
    args = parser.parse_args()

    # Device listing is a standalone action; skip recording entirely.
    if args.list_devices:
        list_audio_devices()
        return

    outcome = solve_captcha(
        duration=args.duration,
        model=args.model,
        as_json=args.json,
        mode=args.mode,
        target=args.target,
    )
    # solve_captcha prints its own human-readable output; only JSON mode
    # needs the return value echoed.
    if args.json:
        print(outcome)
|
|
|
# Script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|