clawdbot-workspace/audio-captcha/capture-and-analyze.sh
2026-01-28 23:00:58 -05:00

39 lines
1.4 KiB
Bash
Executable File

#!/bin/bash
# Audio Captcha Solver - Capture from Blackhole + Analyze with Gemini
# Usage: ./capture-and-analyze.sh [duration_seconds]
#
# Records audio from the "BlackHole 2ch" avfoundation input for
# duration_seconds (default 5) as a 16 kHz mono WAV, sends the recording to the
# Gemini generateContent endpoint, and prints the returned transcription on
# stdout.
#
# Environment:
#   GEMINI_API_KEY  (required) API key for the Gemini API. GEMINI_KEY is
#                   accepted as a fallback name.
#   CAPTURE_DIR     (optional) directory for recordings; defaults to
#                   $HOME/.clawdbot/workspace/audio-captcha/captures
#
# Requires: ffmpeg, base64, curl, jq.
# Exit status: 0 on success, 1 on any failure (message on stderr).
set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

DURATION=${1:-5}
# Only accept a positive integer or decimal number of seconds.
if [[ ! "$DURATION" =~ ^[0-9]+([.][0-9]+)?$ || ! "$DURATION" =~ [1-9] ]]; then
  die "Usage: ${0##*/} [duration_seconds]  (must be a positive number, got '${DURATION}')"
fi

# The key must come from the environment. Never commit it to this file: any key
# that was previously embedded here is exposed in history and should be revoked.
GEMINI_KEY=${GEMINI_API_KEY:-${GEMINI_KEY:-}}
[[ -n "$GEMINI_KEY" ]] || die "❌ GEMINI_API_KEY is not set."

for tool in ffmpeg base64 curl jq; do
  command -v "$tool" >/dev/null 2>&1 || die "❌ Missing required tool: ${tool}"
done

CAPTURE_DIR=${CAPTURE_DIR:-$HOME/.clawdbot/workspace/audio-captcha/captures}
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
AUDIO_FILE="${CAPTURE_DIR}/captcha-${TIMESTAMP}.wav"
API_URL='https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent'
PROMPT='Listen to this audio captcha and transcribe EXACTLY what is said. Return ONLY the text/numbers spoken, nothing else.'

mkdir -p -- "$CAPTURE_DIR"

echo "🎤 Recording ${DURATION}s from BlackHole..."
# Capture audio from BlackHole 2ch.
# -nostdin keeps ffmpeg from consuming the caller's stdin; -loglevel error
# silences the banner/progress output but still shows real failures.
rec_status=0
ffmpeg -nostdin -hide_banner -loglevel error -y \
  -f avfoundation -i ":BlackHole 2ch" \
  -t "$DURATION" -ar 16000 -ac 1 "$AUDIO_FILE" || rec_status=$?

# A failed run can leave an empty or partial file behind, so check both the
# exit status and that the file has content.
if (( rec_status != 0 )) || [[ ! -s "$AUDIO_FILE" ]]; then
  die "❌ Recording failed. Make sure BlackHole is set as output device."
fi
echo "✅ Captured: $AUDIO_FILE"

echo "📤 Sending to Gemini for analysis..."
# The request body is streamed over stdin instead of being placed on the
# command line: base64 audio easily exceeds the OS argument-size limit.
# Newlines are stripped because some base64 implementations wrap their output.
# The API key is passed through a curl config read from a file descriptor so
# it appears neither in the URL nor in the process argument list.
RESPONSE=$(
  base64 < "$AUDIO_FILE" \
    | tr -d '\n' \
    | jq -Rs --arg prompt "$PROMPT" '{
        contents: [{
          parts: [
            {text: $prompt},
            {inline_data: {mime_type: "audio/wav", data: .}}
          ]
        }]
      }' \
    | curl -sS \
        --config <(printf 'header = "x-goog-api-key: %s"\n' "$GEMINI_KEY") \
        -H "Content-Type: application/json" \
        --data-binary @- \
        "$API_URL"
) || die "❌ Request to Gemini failed."

# Surface API-level errors instead of printing "null".
if API_ERROR=$(jq -er '.error.message' <<<"$RESPONSE" 2>/dev/null); then
  die "❌ Gemini API error: ${API_ERROR}"
fi

TRANSCRIPT=$(jq -r '.candidates[0].content.parts[0].text // empty' <<<"$RESPONSE") \
  || die "❌ Could not parse Gemini response."
[[ -n "$TRANSCRIPT" ]] || die "❌ Gemini returned no transcription."

printf '%s\n' "$TRANSCRIPT"