.agents/scripts/speak.sh

95 lines
3.2 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# speak.sh - send text to a speech model through the local daemon and play
# the returned audio.
#
# Usage: speak.sh <text to speak>
# Env:   VOICE  voice name placed in the request payload (default: ash)
#
set -euo pipefail

VOICE="${VOICE:-ash}"
TEXT="$*" # all arguments, joined by spaces, form the text to speak
DAEMON="http://localhost:3850"

# Validate input first so a usage error creates no temp files; the message
# goes to stderr like any other diagnostic.
if [[ -z "$TEXT" ]]; then
  echo "Usage: speak.sh <text to speak>" >&2
  exit 1
fi

# Scratch files live inside a private directory. BSD/macOS mktemp only
# substitutes X's at the very end of the template, so a template such as
# "speak-XXXX.raw" would yield a fixed, predictable file name there; a
# directory template ending in X's behaves the same on GNU and BSD.
SPEAK_TMPDIR=$(mktemp -d "${TMPDIR:-/tmp}/speak-XXXXXX") || {
  echo "Error: could not create a temporary directory" >&2
  exit 1
}
TMPRAW="$SPEAK_TMPDIR/audio.raw"
TMPWAV="$SPEAK_TMPDIR/audio.wav"

# Remove the scratch directory on every exit path. Using a function keeps
# the path quoted when the trap fires instead of baking it into a string.
cleanup() { rm -rf -- "${SPEAK_TMPDIR:?}"; }
trap cleanup EXIT
# INNER_CMD holds the shell snippet that is later executed remotely (it is
# only stored here, not run). It expects SPEAK_TEXT, SPEAK_VOICE and
# OPENROUTER_API_KEY to be set in its environment, and it:
#   1. builds the JSON request with jq, so the text is embedded safely;
#   2. streams the SSE response from the chat completions endpoint;
#   3. prints every base64 audio chunk to stdout, back to back;
#   4. reports curl transport errors and API error messages on stderr.
# The snippet sticks to POSIX sh constructs because the shell that will run
# it is not visible from this script.
# `read -d ''` returns non-zero at end of input, hence the `|| true`.
read -r -d '' INNER_CMD << 'INNEREOF' || true
PAYLOAD=$(jq -n \
  --arg text "$SPEAK_TEXT" \
  --arg voice "$SPEAK_VOICE" \
  '{
    model: "openai/gpt-audio-mini",
    modalities: ["text", "audio"],
    audio: { voice: $voice, format: "pcm16" },
    stream: true,
    messages: [{ role: "user", content: $text }]
  }')
cr=$(printf '\r')
# -sS: no progress meter, but transport errors are still written to stderr.
curl -sSN https://openrouter.ai/api/v1/chat/completions \
  -H "Authorization: Bearer $OPENROUTER_API_KEY" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD" | \
while IFS= read -r line || [ -n "$line" ]; do
  # Drop a trailing CR, then the SSE "data: " prefix.
  line="${line%"$cr"}"
  line="${line#data: }"
  [ -z "$line" ] && continue
  [ "$line" = "[DONE]" ] && continue
  # Surface API errors (plain JSON body or in-stream event) on stderr.
  case "$line" in
    *'"error"'*)
      msg=$(printf '%s\n' "$line" | jq -r '.error.message // empty' 2>/dev/null)
      if [ -n "$msg" ]; then
        printf 'API error: %s\n' "$msg" >&2
      fi
      ;;
  esac
  # Extract the audio chunk if present; non-JSON lines yield nothing.
  chunk=$(printf '%s\n' "$line" | jq -r '.choices[0].delta.audio.data // empty' 2>/dev/null)
  if [ -n "$chunk" ]; then
    printf '%s' "$chunk"
  fi
done
INNEREOF
# Run the streaming command through the daemon's secret-exec endpoint.
# The URL names OPENROUTER_API_KEY and the command refers to
# $OPENROUTER_API_KEY, so the daemon is presumably the one that places the
# key in the command's environment (confirm against the daemon API).
# SPEAK_TEXT and SPEAK_VOICE are not sent as separate fields: they are
# prepended to the command as `export` statements, quoted with printf %q.
# NOTE(review): %q emits bash-style quoting (e.g. $'...'), which assumes the
# daemon runs the command with bash -- confirm.
INNER_WITH_VARS="export SPEAK_TEXT=$(printf '%q' "$TEXT"); export SPEAK_VOICE=$(printf '%q' "$VOICE"); $INNER_CMD"
EXEC_REQUEST=$(jq -n --arg cmd "$INNER_WITH_VARS" '{ command: $cmd }')

# -sS keeps the progress meter off but lets curl print transport errors.
# A failed request is reported explicitly (keeping curl's exit code) rather
# than letting `set -e` end the script without any message.
EXEC_RESPONSE=$(curl -sS "$DAEMON/api/secrets/OPENROUTER_API_KEY/exec" \
  -H "Content-Type: application/json" \
  -d "$EXEC_REQUEST") || {
  rc=$?
  echo "Error: request to daemon at $DAEMON failed (curl exit $rc)" >&2
  exit "$rc"
}

# The command's stdout field carries the concatenated base64 audio.
B64_AUDIO=$(printf '%s\n' "$EXEC_RESPONSE" | jq -r '.stdout // empty') || {
  echo "Error: could not parse the daemon response" >&2
  exit 1
}

if [[ -z "$B64_AUDIO" ]]; then
  # Nothing on stdout: show whatever the command wrote to stderr, if any.
  STDERR=$(printf '%s\n' "$EXEC_RESPONSE" | jq -r '.stderr // empty')
  echo "Error: No audio data received" >&2
  if [[ -n "$STDERR" ]]; then
    echo "stderr: $STDERR" >&2
  fi
  exit 1
fi
# Decode the base64 text into raw PCM16 samples.
# The decode is checked explicitly: with `set -e -o pipefail` and base64's
# stderr discarded, a malformed payload would otherwise end the script
# without printing anything.
if ! printf '%s\n' "$B64_AUDIO" | base64 -d > "$TMPRAW" 2>/dev/null; then
  echo "Error: Failed to decode audio data" >&2
  exit 1
fi

# `wc -c` is available everywhere, unlike `stat`, whose flags differ between
# GNU and BSD. BSD wc pads its output with spaces; arithmetic expansion
# reduces it to a bare integer.
RAWSIZE=$(wc -c < "$TMPRAW")
RAWSIZE=$((RAWSIZE))

# Reject payloads too short to be real audio.
if ((RAWSIZE < 100)); then
  echo "Error: Audio data too small ($RAWSIZE bytes)" >&2
  exit 1
fi
# Prepend a 44-byte WAV header (PCM, mono, 24 kHz, 16-bit little-endian)
# to the raw samples, then play the resulting file.

# Write the low 32 bits of $1 to stdout as four little-endian bytes.
le32() {
  local value=$1 shift_by escapes=''
  for shift_by in 0 8 16 24; do
    escapes+=$(printf '\\x%02x' $(((value >> shift_by) & 0xFF)))
  done
  # The format string is built above from hex escapes only.
  # shellcheck disable=SC2059
  printf "$escapes"
}

DATASIZE=$RAWSIZE
# RIFF size field = whole file minus the 8-byte "RIFF"+size prefix.
FILESIZE=$((DATASIZE + 36))
{
  printf 'RIFF'
  le32 "$FILESIZE"
  printf 'WAVEfmt '
  le32 16                   # fmt chunk size
  printf '\x01\x00\x01\x00' # PCM format, mono
  le32 24000                # sample rate in Hz
  le32 48000                # byte rate (24000 * 2)
  printf '\x02\x00\x10\x00' # block align 2, 16 bits per sample
  printf 'data'
  le32 "$DATASIZE"
  cat "$TMPRAW"
} > "$TMPWAV"

ffplay -nodisp -autoexit -loglevel quiet "$TMPWAV"