#!/bin/bash
# Audio Captcha Solver - Agent Browser Edition
# Uses agent-browser for network interception instead of BlackHole
#
# Usage:
#   ./captcha_agent_browser.sh <url> [mode] [target]
#   ./captcha_agent_browser.sh "https://site.com/login" transcribe
#   ./captcha_agent_browser.sh "https://site.com/login" identify "stream"
#   ./captcha_agent_browser.sh "https://site.com/login" describe
#
# Flow: open the page in agent-browser, poll its network request log until an
# audio URL shows up, download that URL with curl, then analyse the file with
# whisper (transcribe) or gemini (identify / describe).
#
# External commands used: agent-browser, curl, whisper or gemini (depending on
# the mode), and pbcopy (optional; the clipboard copy is skipped without it).

set -euo pipefail

readonly OUTPUT_DIR="/tmp/captcha-audio"
readonly MAX_ATTEMPTS=60   # one poll per second
readonly RULE="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# Set by wait_for_audio_url once a candidate URL has been seen.
AUDIO_URL=""

#######################################
# Print the command-line help to stdout.
#######################################
usage() {
  echo "Usage: $0 <url> [mode] [target]"
  echo ""
  echo "Modes:"
  echo " transcribe - Speech-to-text (default)"
  echo " identify - Which sound is X? (requires target)"
  echo " describe - List all sounds heard"
  echo ""
  echo "Example:"
  echo " $0 'https://example.com/login' identify 'stream'"
}

#######################################
# Abort with a message on stderr if a required command is not on PATH.
# Arguments: $1 - command name
#######################################
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "❌ Required command not found: $1" >&2
    exit 1
  fi
}

#######################################
# Pick the first audio-looking URL out of a network-request dump.
# Patterns are tried in order; within each pattern the first match in the
# dump wins. File-extension matches come first (captcha-specific before
# generic), then URLs that merely contain "audio" (recaptcha-specific before
# generic), so a specific pattern is never shadowed by its broader sibling.
# Arguments: $1 - request dump text (JSON or anything containing quoted URLs)
# Outputs:   the chosen URL on stdout, or nothing when no pattern matches
#######################################
find_audio_url() {
  local requests=$1
  local pattern match
  local -a patterns=(
    'https?://[^"]*captcha[^"]*\.(mp3|wav|ogg)[^"]*'
    'https?://[^"]+\.(mp3|wav|ogg|m4a|webm)[^"]*'
    'https?://[^"]*recaptcha[^"]*audio[^"]*'
    'https?://[^"]*audio[^"]*'
  )

  for pattern in "${patterns[@]}"; do
    # grep exits 1 on "no match"; that is an expected outcome here.
    match=$(grep -oE -- "$pattern" <<<"$requests" | head -n 1 || true)
    if [[ -n "$match" ]]; then
      printf '%s\n' "$match"
      return 0
    fi
  done
  return 0
}

#######################################
# Poll agent-browser's request log once per second until an audio URL appears
# or MAX_ATTEMPTS polls have been made.
# Globals: AUDIO_URL (written), MAX_ATTEMPTS (read)
# Returns: 0 when a URL was found, 1 on timeout
#######################################
wait_for_audio_url() {
  local attempt=0
  local requests

  while (( attempt < MAX_ATTEMPTS )); do
    # A failing or missing request log is treated as "no requests yet".
    requests=$(agent-browser network requests --json 2>/dev/null || echo "[]")
    AUDIO_URL=$(find_audio_url "$requests")
    if [[ -n "$AUDIO_URL" ]]; then
      echo "🎵 Found audio URL!"
      return 0
    fi

    sleep 1
    attempt=$((attempt + 1))

    # Show progress every 10 seconds
    if (( attempt % 10 == 0 )); then
      echo " Still listening... ($attempt seconds)"
    fi
  done
  return 1
}

#######################################
# Copy text to the clipboard when pbcopy is available (macOS); otherwise warn
# on stderr and carry on, since the answer has already been printed.
# Arguments: $1 - text to copy (copied without a trailing newline)
#######################################
copy_to_clipboard() {
  if command -v pbcopy >/dev/null 2>&1; then
    printf '%s' "$1" | pbcopy
    echo "📋 Copied to clipboard!"
  else
    echo "⚠️ pbcopy not found; skipping clipboard copy" >&2
  fi
}

#######################################
# Transcribe the audio with whisper and print/copy the alphanumeric characters
# of the transcript, upper-cased.
# Arguments: $1 - path to the downloaded audio file (named *.mp3)
#######################################
run_transcribe() {
  local audio_file=$1
  local txt_file raw_text extracted

  echo "🧠 Transcribing with Whisper..."
  # A whisper failure is detected below through the missing .txt output, so
  # its exit status must not abort the script before the error is reported.
  whisper "$audio_file" \
    --model small \
    --language en \
    --output_format txt \
    --output_dir "$OUTPUT_DIR" \
    2>/dev/null || true

  txt_file="${audio_file%.mp3}.txt"
  if [[ ! -f "$txt_file" ]]; then
    echo "❌ Transcription failed"
    exit 1
  fi

  raw_text=$(cat "$txt_file")
  # Keep only ASCII letters and digits, then upper-case them.
  extracted=$(printf '%s' "$raw_text" | tr -cd 'A-Za-z0-9' | tr '[:lower:]' '[:upper:]')

  echo ""
  echo "$RULE"
  echo "📝 Raw text: $raw_text"
  echo "🔤 Extracted: $extracted"
  echo "$RULE"
  copy_to_clipboard "$extracted"
}

#######################################
# Send a prompt plus the audio file to gemini and print its reply on stdout.
# Exits with an error message if the gemini command fails.
# Arguments: $1 - prompt text, $2 - path to the audio file
#######################################
ask_gemini() {
  local reply
  if ! reply=$(gemini -p "$1" -f "$2" 2>/dev/null); then
    echo "❌ Gemini request failed" >&2
    exit 1
  fi
  printf '%s\n' "$reply"
}

#######################################
# Ask gemini which numbered sound matches the target and print/copy the first
# number in its reply (or the whole reply when it contains no number).
# Arguments: $1 - path to the audio file, $2 - target sound description
#######################################
run_identify() {
  local audio_file=$1
  local target=$2
  local prompt response answer=""

  echo "🧠 Asking Gemini: which sound is '$target'?"
  prompt="Listen to this audio captcha. It contains multiple sounds. Which sound is a \"$target\"? Reply with ONLY the number (1, 2, 3, etc.) of the matching sound. Just the number, nothing else."
  response=$(ask_gemini "$prompt" "$audio_file")

  # First run of digits in the reply, if any.
  if [[ "$response" =~ [0-9]+ ]]; then
    answer=${BASH_REMATCH[0]}
  fi

  echo ""
  echo "$RULE"
  echo "🎯 Target: $target"
  echo "✅ Answer: ${answer:-$response}"
  echo "$RULE"
  copy_to_clipboard "${answer:-$response}"
}

#######################################
# Ask gemini to describe every distinct sound in the audio and print the reply.
# Arguments: $1 - path to the audio file
#######################################
run_describe() {
  local audio_file=$1
  local prompt response

  echo "🧠 Asking Gemini to describe all sounds..."
  prompt="Listen to this audio and describe each distinct sound you hear. Format as: 1: [description], 2: [description], etc."
  response=$(ask_gemini "$prompt" "$audio_file")

  echo ""
  echo "$RULE"
  echo "🔊 Sounds detected:"
  echo "$response"
  echo "$RULE"
}

#######################################
# Entry point.
# Arguments: $1 - page URL (required)
#            $2 - mode: transcribe (default) | identify | describe
#            $3 - target sound, required for identify
#######################################
main() {
  local url="${1:-}"
  local mode="${2:-transcribe}"
  local target="${3:-}"
  local timestamp audio_file

  if [[ -z "$url" ]]; then
    usage
    exit 1
  fi

  # Validate the mode and its requirements before opening the browser, so a
  # typo does not cost a full capture-and-download cycle.
  case "$mode" in
    transcribe)
      require_cmd whisper
      ;;
    identify)
      if [[ -z "$target" ]]; then
        echo "❌ identify mode requires a target sound"
        echo " Example: $0 '$url' identify 'stream'"
        exit 1
      fi
      require_cmd gemini
      ;;
    describe)
      require_cmd gemini
      ;;
    *)
      echo "❌ Unknown mode: $mode"
      echo " Use: transcribe, identify, or describe"
      exit 1
      ;;
  esac
  require_cmd agent-browser
  require_cmd curl

  timestamp=$(date +%Y%m%d_%H%M%S)
  audio_file="$OUTPUT_DIR/captcha_$timestamp.mp3"
  mkdir -p "$OUTPUT_DIR"

  echo "🌐 Audio Captcha Solver (Agent Browser)"
  echo "$RULE"
  echo ""

  # Open URL in agent-browser
  echo "📍 Opening: $url"
  agent-browser open "$url" --headed

  echo ""
  echo "👆 Find and click the AUDIO CAPTCHA button on the page"
  echo " (I'm watching network requests for audio files...)"
  echo ""

  # Wait a moment for page load
  sleep 2

  # Poll for audio requests
  echo "👂 Listening for audio file requests..."
  if ! wait_for_audio_url; then
    echo "❌ No audio file detected after ${MAX_ATTEMPTS} seconds"
    echo ""
    echo "Debugging: Here are recent network requests:"
    # Best-effort debug output; head closing the pipe early must not change
    # the exit status below.
    agent-browser network requests 2>/dev/null | head -20 || true
    exit 1
  fi

  echo "📥 Downloading: $AUDIO_URL"
  # --fail keeps an HTTP error page from being saved as if it were audio.
  if ! curl -fsSL "$AUDIO_URL" -o "$audio_file" || [[ ! -s "$audio_file" ]]; then
    echo "❌ Failed to download audio file"
    exit 1
  fi

  echo "✅ Saved to: $audio_file"
  echo ""

  # Analyze based on mode (already validated above)
  case "$mode" in
    transcribe) run_transcribe "$audio_file" ;;
    identify) run_identify "$audio_file" "$target" ;;
    describe) run_describe "$audio_file" ;;
  esac

  echo ""
  echo "Done! Browser is still open if you need to enter the answer."
}

main "$@"