# clawdbot-workspace/scripts/captcha_audio.py

#!/usr/bin/env python3
"""
Audio Captcha Solver

Records system audio via BlackHole and solves with AI.
Supports three modes:
  - transcribe: Speech-to-text for letter/number captchas (Whisper)
  - identify: Sound classification for "which is the X?" captchas (Gemini)
  - describe: List every sound heard in the recording (Gemini, for debugging)

Usage:
    python captcha_audio.py                              # Record 10s, transcribe
    python captcha_audio.py --duration 15               # Record 15s
    python captcha_audio.py --mode identify --target "stream"  # Which sound is a stream?
    python captcha_audio.py --mode identify --target "dog barking"
    python captcha_audio.py --json                      # Output as JSON
    python captcha_audio.py --list-devices              # List audio devices
"""

import subprocess
import argparse
import tempfile
import json
import re
import os
import sys
from pathlib import Path
from datetime import datetime

def list_audio_devices():
    """Print the audio input devices that ffmpeg's avfoundation input reports.

    Returns:
        str: The raw text ffmpeg wrote to stderr (the complete device listing).
    """
    probe = subprocess.run(
        ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
        capture_output=True, text=True
    )
    # The device table is read from stderr; stdout is not used.
    listing = probe.stderr

    print("Available audio devices:")
    print("-" * 40)

    seen_audio_header = False
    for entry in listing.split('\n'):
        if 'audio devices' in entry.lower():
            # Header line itself is never printed; everything after it is audio.
            seen_audio_header = True
        elif seen_audio_header and '[' in entry:
            print(entry.strip())

    return listing

def check_blackhole():
    """Report whether a BlackHole audio device is present.

    Runs ``system_profiler SPAudioDataType`` and looks for the text
    "BlackHole" in its standard output.

    Returns:
        bool: True if "BlackHole" appears in the output. False otherwise,
        including when ``system_profiler`` cannot be launched at all.
    """
    try:
        result = subprocess.run(
            ["system_profiler", "SPAudioDataType"],
            capture_output=True, text=True
        )
    except OSError:
        # system_profiler is missing or not executable (e.g. a non-macOS
        # host). Treat that as "not detected" so callers get their normal
        # setup-instructions path instead of an unhandled traceback.
        return False
    return "BlackHole" in result.stdout

def get_blackhole_device_index():
    """Look up the index ffmpeg assigns to the BlackHole audio device.

    Returns:
        str | None: The digits found inside the first ``[N]`` marker on the
        first "BlackHole" line after the "audio devices" header of ffmpeg's
        device listing, or None if no such line exists.
    """
    listing = subprocess.run(
        ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
        capture_output=True, text=True
    ).stderr

    past_audio_header = False
    for entry in listing.split('\n'):
        if 'audio devices' in entry.lower():
            past_audio_header = True
            continue
        if not (past_audio_header and 'BlackHole' in entry):
            continue
        # Pull the numeric index out of a marker such as [0] or [1].
        found = re.search(r'\[(\d+)\]', entry)
        if found:
            return found.group(1)
    return None

def record_audio(duration=10, output_path=None):
    """Record audio from the BlackHole device into a WAV file via ffmpeg.

    Args:
        duration: Recording length in seconds (passed to ffmpeg's ``-t``).
        output_path: Destination file. When None, a new temporary ``.wav``
            file is created and used.

    Returns:
        The path of the recorded file.

    Raises:
        RuntimeError: If no BlackHole device is found, or if ffmpeg exits
            non-zero without having produced any output.
    """
    # Resolve the device before touching the filesystem so a missing
    # BlackHole device does not leave an orphaned temp file behind.
    device_index = get_blackhole_device_index()
    if device_index is None:
        raise RuntimeError("BlackHole device not found. Is it installed and set up?")

    if output_path is None:
        # mkstemp creates the file atomically; tempfile.mktemp is deprecated
        # because another process can claim the name before it is used.
        fd, output_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)

    # Record using ffmpeg
    cmd = [
        "ffmpeg",
        "-f", "avfoundation",
        "-i", f":{device_index}",  # Audio only, from device index
        "-t", str(duration),
        "-ar", "16000",  # 16kHz for Whisper
        "-ac", "1",      # Mono
        "-y",            # Overwrite
        output_path
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    # A non-zero exit is tolerated as long as ffmpeg wrote something. The
    # file may already exist (pre-created above), so an empty file counts
    # as "no output" too.
    produced_output = os.path.exists(output_path) and os.path.getsize(output_path) > 0
    if result.returncode != 0 and not produced_output:
        raise RuntimeError(f"Recording failed: {result.stderr}")

    return output_path

def transcribe_audio(audio_path, model="small"):
    """Transcribe an audio file with the ``whisper`` command-line tool.

    Whisper is asked for English, plain-text output written next to the
    audio file; the resulting ``.txt`` file is then read back.

    Args:
        audio_path: Path to the audio file to transcribe.
        model: Whisper model name passed to ``--model``.

    Returns:
        str: The transcript with surrounding whitespace stripped.

    Raises:
        RuntimeError: If no transcript file exists after whisper has run.
    """
    # A bare filename has an empty dirname; fall back to the current
    # directory rather than passing an empty --output_dir.
    output_dir = os.path.dirname(audio_path) or "."

    # The transcript is expected at <output_dir>/<audio stem>.txt. splitext on
    # the basename only touches the file's own extension, so a dot in a
    # directory name cannot corrupt the derived path.
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    txt_path = os.path.join(output_dir, stem + ".txt")

    cmd = [
        "whisper",
        audio_path,
        "--model", model,
        "--language", "en",
        "--output_format", "txt",
        "--output_dir", output_dir
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if not os.path.exists(txt_path):
        raise RuntimeError(f"Transcription failed: {result.stderr}")

    # Read as UTF-8 explicitly instead of relying on the locale default.
    with open(txt_path, 'r', encoding='utf-8') as f:
        return f.read().strip()

def extract_captcha_chars(text):
    """Extract likely captcha characters (letters/numbers only) from a transcript.

    The text is lower-cased and split on whitespace. Tokens that are
    instruction filler ("please", "enter", ...) are dropped; the ASCII letters
    and digits of every remaining token are concatenated and upper-cased.

    Args:
        text: Raw transcription text.

    Returns:
        str: Upper-case string of the retained characters ("" if none).
    """
    # "a" is deliberately not listed as filler: a spoken letter "A" is real
    # captcha content and must survive filtering.
    filler_words = frozenset({
        'the', 'is', 'are', 'an', 'please', 'enter', 'type', 'following',
    })

    chars = []
    for token in text.lower().split():
        # Compare without leading/trailing punctuation so that tokens such as
        # "following:" or "the," are still recognised as filler.
        core = re.sub(r'^[^a-z0-9]+|[^a-z0-9]+$', '', token)
        if core in filler_words:
            continue
        chars.extend(re.findall(r'[A-Za-z0-9]', token))
    return ''.join(chars).upper()

def identify_sound(audio_path, target_sound):
    """
    Ask the gemini CLI which sound in the recording matches ``target_sound``.
    Intended for captchas of the form "which sound is a stream?".

    Returns a dict with:
        answer: first run of digits in the reply, or the whole reply if
            it contains no digits
        raw_response: the stripped reply text
        target_sound: the value passed in

    Raises RuntimeError when the gemini command exits non-zero.
    """
    question = f"""You are helping solve an audio captcha. Listen carefully to the audio.
The audio contains multiple sounds played in sequence (usually numbered or separated by pauses).

Question: Which sound is a "{target_sound}"?

Instructions:
1. Listen to each sound segment
2. Identify which one matches "{target_sound}"
3. Reply with ONLY the number (1, 2, 3, etc.) or position of the matching sound
4. If you hear the sounds labeled (like "sound 1", "sound 2"), use those numbers
5. If no labels, count the order they play (first=1, second=2, etc.)
6. Be confident - pick the best match

Reply with just the number, nothing else."""

    # Hand the prompt and the audio file to the gemini CLI.
    completed = subprocess.run(
        ["gemini", "-p", question, "-f", audio_path],
        capture_output=True,
        text=True,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"Gemini analysis failed: {completed.stderr}")

    reply = completed.stdout.strip()

    # Keep only the first number; fall back to the full reply if there is none.
    first_number = re.search(r'\d+', reply)
    if first_number:
        answer = first_number.group(0)
    else:
        answer = reply

    return {
        "answer": answer,
        "raw_response": reply,
        "target_sound": target_sound
    }

def describe_sounds(audio_path):
    """
    Ask the gemini CLI for a numbered description of every sound in the audio.
    Handy for debugging, or when the target sound is not known in advance.

    Returns the stripped reply text.
    Raises RuntimeError when the gemini command exits non-zero.
    """
    request = """Listen to this audio captcha and describe each sound you hear.

For each distinct sound or segment, tell me:
1. What number/position it is (first, second, third OR 1, 2, 3)
2. What the sound is (water, dog, traffic, bird, etc.)

Format your response as a simple list like:
1: water/stream flowing
2: dog barking
3: traffic/cars

Be specific and confident."""

    completed = subprocess.run(
        ["gemini", "-p", request, "-f", audio_path],
        capture_output=True,
        text=True,
    )
    if completed.returncode == 0:
        return completed.stdout.strip()

    raise RuntimeError(f"Gemini analysis failed: {completed.stderr}")

def solve_captcha(duration=10, model="small", as_json=False, mode="transcribe", target=None):
    """
    Main function to record and solve audio captcha.

    Modes:
        transcribe: Use Whisper for speech-to-text (letters/numbers)
        identify: Use Gemini to identify sounds ("which is the stream?")
        describe: Use Gemini to describe all sounds (debugging)

    Args:
        duration: Seconds of audio to record.
        model: Whisper model name (used in transcribe mode only).
        as_json: When True, print nothing and return a JSON string for both
            success and failure. When False, print progress, copy the answer
            to the clipboard with ``pbcopy`` (transcribe/identify modes) and
            return the result dict.
        mode: One of "transcribe", "identify", "describe".
        target: Sound to look for; required in identify mode.

    Returns:
        A JSON string when ``as_json`` is True, otherwise the result dict.

    Note:
        When ``as_json`` is False, any failure prints an error message and
        terminates the process via ``sys.exit(1)``.
    """
    if not check_blackhole():
        error = {
            "success": False,
            "error": "BlackHole not detected",
            "setup_instructions": [
                "brew install blackhole-2ch",
                "Reboot your Mac",
                "Open 'Audio MIDI Setup'",
                "Create Multi-Output Device (speakers + BlackHole)",
                "Set Multi-Output as system output"
            ]
        }
        if as_json:
            return json.dumps(error, indent=2)
        else:
            print("❌ BlackHole not detected!")
            print("\nSetup instructions:")
            for step in error["setup_instructions"]:
                print(f"  • {step}")
            sys.exit(1)

    try:
        # Validate the request BEFORE recording so that a bad invocation
        # fails immediately instead of after `duration` seconds of capture.
        if mode not in ("transcribe", "identify", "describe"):
            raise ValueError(f"Unknown mode: {mode}")
        if mode == "identify" and not target:
            raise ValueError("--target required for identify mode (e.g., --target 'stream')")

        # Created inside the try so a filesystem error is reported through
        # the same success/error channel as every other failure.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        work_dir = Path(tempfile.gettempdir()) / "captcha-audio"
        work_dir.mkdir(exist_ok=True)
        audio_path = str(work_dir / f"captcha_{timestamp}.wav")

        mode_emoji = {"transcribe": "🎤", "identify": "🔊", "describe": "🔍"}
        if not as_json:
            print(f"{mode_emoji.get(mode, '🎤')} Mode: {mode}")
            print(f"⏱️  Recording for {duration} seconds...")
            print("▶️  Play the audio captcha NOW!")

        # Record
        record_audio(duration, audio_path)

        if not as_json:
            print("✅ Recording complete")

        # Process based on mode
        if mode == "transcribe":
            if not as_json:
                print("🧠 Transcribing with Whisper...")

            raw_text = transcribe_audio(audio_path, model)
            extracted = extract_captcha_chars(raw_text)

            result = {
                "success": True,
                "mode": "transcribe",
                "raw_text": raw_text,
                "extracted_chars": extracted,
                "answer": extracted,  # Primary answer
                "audio_file": audio_path
            }

            if not as_json:
                print("\n" + "=" * 40)
                print(f"📝 Raw transcription: {raw_text}")
                print(f"🔤 Extracted chars:   {extracted}")
                print("=" * 40)
                subprocess.run(["pbcopy"], input=extracted.encode(), check=True)
                print("📋 Copied to clipboard!")

        elif mode == "identify":
            if not as_json:
                print(f"🧠 Asking Gemini: which sound is '{target}'?")

            id_result = identify_sound(audio_path, target)

            result = {
                "success": True,
                "mode": "identify",
                "target_sound": target,
                "answer": id_result["answer"],
                "raw_response": id_result["raw_response"],
                "audio_file": audio_path
            }

            if not as_json:
                print("\n" + "=" * 40)
                print(f"🎯 Target: {target}")
                print(f"✅ Answer: {id_result['answer']}")
                # Only show the full reply when it adds something to the answer.
                if id_result['raw_response'] != id_result['answer']:
                    print(f"📝 Full response: {id_result['raw_response']}")
                print("=" * 40)
                subprocess.run(["pbcopy"], input=id_result["answer"].encode(), check=True)
                print("📋 Copied to clipboard!")

        else:  # mode == "describe" (guaranteed by the validation above)
            if not as_json:
                print("🧠 Asking Gemini to describe all sounds...")

            description = describe_sounds(audio_path)

            result = {
                "success": True,
                "mode": "describe",
                "description": description,
                "audio_file": audio_path
            }

            if not as_json:
                print("\n" + "=" * 40)
                print("🔊 Sounds detected:")
                print(description)
                print("=" * 40)

        if as_json:
            return json.dumps(result, indent=2)
        return result

    except Exception as e:
        # Top-level boundary: turn any failure into the JSON error payload or
        # a printed message plus a non-zero exit.
        error = {"success": False, "error": str(e)}
        if as_json:
            return json.dumps(error, indent=2)
        else:
            print(f"❌ Error: {e}")
            sys.exit(1)

def main():
    """Command-line entry point.

    Parses the options, then either lists the audio devices or runs
    ``solve_captcha`` with them. The returned value is printed only when
    ``--json`` was given.
    """
    cli = argparse.ArgumentParser(description="Audio Captcha Solver")
    cli.add_argument(
        "--duration", "-d",
        type=int,
        default=10,
        help="Recording duration in seconds",
    )
    cli.add_argument(
        "--model", "-m",
        default="small",
        help="Whisper model for transcribe mode (tiny/base/small/medium/large)",
    )
    cli.add_argument(
        "--mode",
        choices=["transcribe", "identify", "describe"],
        default="transcribe",
        help="Mode: transcribe (speech-to-text), identify (which sound is X?), describe (list all sounds)",
    )
    cli.add_argument(
        "--target", "-t",
        help="For identify mode: the sound to find (e.g., 'stream', 'dog barking')",
    )
    cli.add_argument(
        "--json", "-j",
        action="store_true",
        help="Output as JSON",
    )
    cli.add_argument(
        "--list-devices", "-l",
        action="store_true",
        help="List audio devices",
    )

    opts = cli.parse_args()

    # Device listing is a standalone action: show it and stop.
    if opts.list_devices:
        list_audio_devices()
        return

    output = solve_captcha(
        duration=opts.duration,
        model=opts.model,
        as_json=opts.json,
        mode=opts.mode,
        target=opts.target,
    )
    if opts.json:
        print(output)

# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()