clawdbot-workspace/scripts/captcha_agent_browser.sh
2026-01-28 23:00:58 -05:00

185 lines
6.1 KiB
Bash
Executable File

#!/bin/bash
# Audio Captcha Solver - Agent Browser Edition
# Uses agent-browser for network interception instead of BlackHole
#
# Usage:
# ./captcha_agent_browser.sh <url> [mode] [target]
# ./captcha_agent_browser.sh "https://site.com/login" transcribe
# ./captcha_agent_browser.sh "https://site.com/login" identify "stream"
# ./captcha_agent_browser.sh "https://site.com/login" describe
# Abort on the first unhandled command failure.
# NOTE: `pipefail` is deliberately NOT enabled: later pipelines in this script
# (grep ... | head / tr) rely on a no-match grep not failing the pipeline.
set -e

# Positional arguments:
#   $1 - URL of the page to open (required)
#   $2 - mode: transcribe (default), identify, or describe
#   $3 - target sound name (required for identify mode only)
URL="${1:-}"
MODE="${2:-transcribe}"
TARGET="${3:-}"

# Print the help text to stderr.
usage() {
  echo "Usage: $0 <url> [mode] [target]"
  echo ""
  echo "Modes:"
  echo " transcribe - Speech-to-text (default)"
  echo " identify - Which sound is X? (requires target)"
  echo " describe - List all sounds heard"
  echo ""
  echo "Example:"
  echo " $0 'https://example.com/login' identify 'stream'"
} >&2

if [ -z "$URL" ]; then
  usage
  exit 1
fi

# Validate mode/target now, so a typo is reported immediately instead of
# after the browser session, the polling loop and the download have all run.
case "$MODE" in
  transcribe | describe) ;;
  identify)
    if [ -z "$TARGET" ]; then
      echo "❌ identify mode requires a target sound" >&2
      echo " Example: $0 '$URL' identify 'stream'" >&2
      exit 1
    fi
    ;;
  *)
    echo "❌ Unknown mode: $MODE" >&2
    echo " Use: transcribe, identify, or describe" >&2
    exit 1
    ;;
esac
# Exit with a clear message when a required external command is missing.
#   $1 - command name to look up on PATH
require_tool() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "❌ Required command not found: $1" >&2
    exit 1
  fi
}

# Preflight: check every tool this run will need before opening the browser,
# so a missing dependency is not discovered only after the audio was captured.
require_tool agent-browser
require_tool curl
case "$MODE" in
  transcribe) require_tool whisper ;;
  identify | describe) require_tool gemini ;;
esac

# Downloaded audio (and the Whisper transcript) are written under OUTPUT_DIR;
# the timestamp keeps files from successive runs apart.
OUTPUT_DIR="/tmp/captcha-audio"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
AUDIO_FILE="$OUTPUT_DIR/captcha_$TIMESTAMP.mp3"
mkdir -p "$OUTPUT_DIR"

echo "🌐 Audio Captcha Solver (Agent Browser)"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""

# Open URL in agent-browser
echo "📍 Opening: $URL"
if ! agent-browser open "$URL" --headed; then
  echo "❌ agent-browser could not open: $URL" >&2
  exit 1
fi
echo ""
echo "👆 Find and click the AUDIO CAPTCHA button on the page"
echo " (I'm watching network requests for audio files...)"
echo ""

# Wait a moment for page load
sleep 2
# Poll the browser's recorded network requests (about once per second, up to
# MAX_ATTEMPTS times) until a URL that looks like an audio resource shows up.
# On success AUDIO_URL holds the first matching URL; otherwise the script
# prints some recent requests for debugging and exits with status 1.
echo "👂 Listening for audio file requests..."
MAX_ATTEMPTS=60
ATTEMPT=0
AUDIO_URL=""

# Patterns are tried in priority order; the first one that matches wins.
#   1. URL containing a known audio file extension
#   2. URL containing the word "audio" anywhere
# (The former captcha/recaptcha-specific patterns were strict subsets of these
# two and could never match once these had failed, so they were dropped.)
AUDIO_PATTERNS=(
  'https?://[^"]+\.(mp3|wav|ogg|m4a|webm)[^"]*'
  'https?://[^"]*audio[^"]*'
)

while ((ATTEMPT < MAX_ATTEMPTS)); do
  # A failing query is treated as "no requests yet" rather than a fatal error.
  REQUESTS=$(agent-browser network requests --json 2>/dev/null || echo "[]")

  for PATTERN in "${AUDIO_PATTERNS[@]}"; do
    # `|| true`: a no-match grep must not abort the script under `set -e`.
    AUDIO_URL=$(grep -oE "$PATTERN" <<<"$REQUESTS" | head -1 || true)
    if [ -n "$AUDIO_URL" ]; then
      break
    fi
  done

  if [ -n "$AUDIO_URL" ]; then
    echo "🎵 Found audio URL!"
    break
  fi

  sleep 1
  ATTEMPT=$((ATTEMPT + 1))

  # Show progress every 10 seconds
  if ((ATTEMPT % 10 == 0)); then
    echo " Still listening... ($ATTEMPT seconds)"
  fi
done

if [ -z "$AUDIO_URL" ]; then
  {
    echo "❌ No audio file detected after ${MAX_ATTEMPTS} seconds"
    echo ""
    echo "Debugging: Here are recent network requests:"
    agent-browser network requests 2>/dev/null | head -20
  } >&2
  exit 1
fi
# Download the detected audio resource to AUDIO_FILE.
echo "📥 Downloading: $AUDIO_URL"
# -f: fail on HTTP errors instead of saving the server's error page as "audio"
# -sS: no progress meter, but still print the reason when the transfer fails
# -L: follow redirects
# The explicit `if` keeps `set -e` from aborting silently on a curl failure;
# the -s test additionally rejects an empty download.
if ! curl -fsSL "$AUDIO_URL" -o "$AUDIO_FILE" || [ ! -s "$AUDIO_FILE" ]; then
  echo "❌ Failed to download audio file" >&2
  # Do not leave an empty/partial file behind.
  rm -f -- "$AUDIO_FILE"
  exit 1
fi
echo "✅ Saved to: $AUDIO_FILE"
echo ""
# Analyze the downloaded audio according to MODE:
#   transcribe - run Whisper and print/copy the alphanumeric characters heard
#   identify   - ask Gemini which numbered sound matches TARGET
#   describe   - ask Gemini to list every distinct sound
# Any analysis failure exits with status 1 so callers can detect it.

# Copy text to the clipboard via pbcopy when possible.
#   $1 - text to copy (copied without a trailing newline)
# A missing or failing pbcopy only produces a warning: by the time this is
# called the answer has already been printed, so it must not abort the script.
copy_to_clipboard() {
  if command -v pbcopy >/dev/null 2>&1 && printf '%s' "$1" | pbcopy; then
    echo "📋 Copied to clipboard!"
  else
    echo "⚠️ Could not copy to clipboard (pbcopy unavailable or failed)" >&2
  fi
}

RULE="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

case "$MODE" in
  transcribe)
    echo "🧠 Transcribing with Whisper..."
    # Tested explicitly: under `set -e` a bare failing whisper call would end
    # the script silently (its stderr is discarded) before any message.
    if ! whisper "$AUDIO_FILE" \
      --model small \
      --language en \
      --output_format txt \
      --output_dir "$OUTPUT_DIR" \
      2>/dev/null; then
      echo "❌ Transcription failed" >&2
      exit 1
    fi

    # The transcript is expected next to the audio file, same basename.
    TXT_FILE="${AUDIO_FILE%.mp3}.txt"
    if [ ! -f "$TXT_FILE" ]; then
      echo "❌ Transcription failed" >&2
      exit 1
    fi

    RAW_TEXT=$(cat "$TXT_FILE")
    # Keep only ASCII letters/digits, joined together and upper-cased.
    EXTRACTED=$(echo "$RAW_TEXT" | grep -oE '[A-Za-z0-9]' | tr -d '\n' | tr '[:lower:]' '[:upper:]')
    echo ""
    echo "$RULE"
    echo "📝 Raw text: $RAW_TEXT"
    echo "🔤 Extracted: $EXTRACTED"
    echo "$RULE"
    copy_to_clipboard "$EXTRACTED"
    ;;
  identify)
    if [ -z "$TARGET" ]; then
      echo "❌ identify mode requires a target sound" >&2
      echo " Example: $0 '$URL' identify 'stream'" >&2
      exit 1
    fi
    echo "🧠 Asking Gemini: which sound is '$TARGET'?"
    PROMPT="Listen to this audio captcha. It contains multiple sounds. Which sound is a \"$TARGET\"? Reply with ONLY the number (1, 2, 3, etc.) of the matching sound. Just the number, nothing else."
    if ! RESPONSE=$(gemini -p "$PROMPT" -f "$AUDIO_FILE" 2>/dev/null); then
      echo "❌ Gemini request failed" >&2
      exit 1
    fi
    if [ -z "$RESPONSE" ]; then
      echo "❌ Gemini returned no response" >&2
      exit 1
    fi
    # First run of digits in the reply; falls back to the full reply below
    # when the model did not answer with a number.
    ANSWER=$(echo "$RESPONSE" | grep -oE '[0-9]+' | head -1)
    echo ""
    echo "$RULE"
    echo "🎯 Target: $TARGET"
    echo "✅ Answer: ${ANSWER:-$RESPONSE}"
    echo "$RULE"
    copy_to_clipboard "${ANSWER:-$RESPONSE}"
    ;;
  describe)
    echo "🧠 Asking Gemini to describe all sounds..."
    PROMPT="Listen to this audio and describe each distinct sound you hear. Format as: 1: [description], 2: [description], etc."
    if ! RESPONSE=$(gemini -p "$PROMPT" -f "$AUDIO_FILE" 2>/dev/null); then
      echo "❌ Gemini request failed" >&2
      exit 1
    fi
    echo ""
    echo "$RULE"
    echo "🔊 Sounds detected:"
    echo "$RESPONSE"
    echo "$RULE"
    ;;
  *)
    echo "❌ Unknown mode: $MODE" >&2
    echo " Use: transcribe, identify, or describe" >&2
    exit 1
    ;;
esac

echo ""
echo "Done! Browser is still open if you need to enter the answer."