#!/usr/bin/env bash
#
# speak.sh - speak the given text aloud.
#
# The text is sent to OpenRouter's chat-completions endpoint (streamed PCM16
# audio) through the local secrets daemon, wrapped in a WAV header and played
# with ffplay.
#
# Usage: speak.sh <text to speak>
# Env:   VOICE - voice name placed in the request (default: ash)
set -euo pipefail

VOICE="${VOICE:-ash}"
# "$*" on purpose: all arguments are joined into a single utterance.
TEXT="$*"
DAEMON="http://localhost:3850"

# Validate input before touching the filesystem or the network.
if [[ -z "$TEXT" ]]; then
  echo "Usage: speak.sh <text to speak>" >&2
  exit 1
fi

# Fail early (before any API call is made) when a required tool is missing.
for tool in curl jq base64 ffplay; do
  if ! command -v "$tool" > /dev/null 2>&1; then
    echo "Error: required tool '$tool' not found in PATH" >&2
    exit 1
  fi
done

# Use a private scratch directory rather than templates with a suffix after
# the X's: BSD/macOS mktemp only substitutes *trailing* X's, so
# "speak-XXXX.raw" would yield a fixed, predictable file name there.
SPEAK_TMPDIR=$(mktemp -d "${TMPDIR:-/tmp}/speak-XXXXXX")
TMPRAW="$SPEAK_TMPDIR/audio.raw"
TMPWAV="$SPEAK_TMPDIR/audio.wav"

# Remove the scratch directory on every exit path. A function keeps the path
# quoted and evaluated at cleanup time.
cleanup() { rm -rf -- "$SPEAK_TMPDIR"; }
trap cleanup EXIT

# INNER_CMD is a shell snippet that is NOT executed by this script directly;
# it is only stored here as text. It expects SPEAK_TEXT, SPEAK_VOICE and
# OPENROUTER_API_KEY to be set where it runs. It builds the JSON request with
# jq (so the text is safely escaped), streams the SSE response, and prints the
# base64 audio chunks back-to-back on stdout with no separators.
#
# The quoted 'INNEREOF' delimiter keeps every $VAR literal. `read -d ''`
# returns non-zero because it reaches EOF without finding a NUL delimiter,
# hence the `|| true`.
read -r -d '' INNER_CMD << 'INNEREOF' || true
PAYLOAD=$(jq -n \
  --arg text "$SPEAK_TEXT" \
  --arg voice "$SPEAK_VOICE" \
  '{
    model: "openai/gpt-audio-mini",
    modalities: ["text", "audio"],
    audio: { voice: $voice, format: "pcm16" },
    stream: true,
    messages: [{ role: "user", content: $text }]
  }')

# One jq process handles the whole stream: each raw line has its SSE "data:"
# prefix removed and is parsed as JSON; lines that are not JSON (blank lines,
# comments, "[DONE]") or carry no audio chunk produce no output.
# -sS keeps curl quiet but still reports transport errors on stderr.
curl -sSN https://openrouter.ai/api/v1/chat/completions \
  -H "Authorization: Bearer $OPENROUTER_API_KEY" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD" |
  jq -Rrj '(ltrimstr("data:") | fromjson | .choices[0].delta.audio.data)? // empty'
INNEREOF

# Run INNER_CMD through the daemon's exec endpoint for OPENROUTER_API_KEY; the
# daemon is expected to inject that secret into the command's environment.
# SPEAK_TEXT and SPEAK_VOICE are handed over by prefixing the command with
# `export` statements. printf %q shell-quotes the values so arbitrary text
# cannot break out of the assignment.
# NOTE(review): %q emits bash quoting (it may use $'...'), so this assumes the
# daemon runs the command with bash - confirm.
INNER_WITH_VARS="export SPEAK_TEXT=$(printf '%q' "$TEXT"); export SPEAK_VOICE=$(printf '%q' "$VOICE"); $INNER_CMD"

# Build the request body as its own step so a jq failure stops the script
# instead of being masked inside curl's argument list.
EXEC_REQUEST=$(jq -n --arg cmd "$INNER_WITH_VARS" '{ command: $cmd }')

# -sS: no progress meter, but transport errors are still printed.
if ! EXEC_RESPONSE=$(curl -sS "$DAEMON/api/secrets/OPENROUTER_API_KEY/exec" \
  -H "Content-Type: application/json" \
  -d "$EXEC_REQUEST"); then
  echo "Error: request to the daemon at $DAEMON failed" >&2
  exit 1
fi

# Extract the concatenated base64 audio from the "stdout" field of the
# daemon's JSON response. printf is used instead of echo so the payload is
# passed through untouched.
if ! B64_AUDIO=$(printf '%s' "$EXEC_RESPONSE" | jq -r '.stdout // empty'); then
  echo "Error: could not parse the daemon response" >&2
  exit 1
fi

# No audio: report the failure on stderr, including whatever the executed
# command wrote to its own stderr, then stop.
if [[ -z "$B64_AUDIO" ]]; then
  STDERR=$(printf '%s' "$EXEC_RESPONSE" | jq -r '.stderr // empty')
  echo "Error: No audio data received" >&2
  if [[ -n "$STDERR" ]]; then
    echo "stderr: $STDERR" >&2
  fi
  exit 1
fi

# Decode base64 to raw PCM16. base64's own diagnostics are left visible and
# the failure is reported explicitly; silencing stderr here would make the
# script die without any message under `set -e` / `pipefail`.
if ! printf '%s' "$B64_AUDIO" | base64 -d > "$TMPRAW"; then
  echo "Error: audio data is not valid base64" >&2
  exit 1
fi

# wc -c is POSIX, so no GNU-vs-BSD stat fallback is needed. The arithmetic
# expansion strips the leading padding some wc implementations print.
RAWSIZE=$(($(wc -c < "$TMPRAW")))
if ((RAWSIZE < 100)); then
  echo "Error: Audio data too small ($RAWSIZE bytes)" >&2
  exit 1
fi

# Wrap raw PCM16 in a WAV header (24kHz, mono, 16-bit LE).

#######################################
# Write an integer as 4 little-endian bytes.
# Arguments: $1 - non-negative integer that fits in 32 bits
# Outputs:   exactly 4 bytes on stdout
#######################################
le32() {
  local n=$1 esc
  # Build the text "\xNN\xNN\xNN\xNN" first, then let printf turn it into bytes.
  printf -v esc '\\x%02x\\x%02x\\x%02x\\x%02x' \
    $((n & 0xFF)) $(((n >> 8) & 0xFF)) $(((n >> 16) & 0xFF)) $(((n >> 24) & 0xFF))
  # shellcheck disable=SC2059  # esc is the escape string generated just above
  printf "$esc"
}

#######################################
# Write the 44-byte WAV header for 24 kHz mono 16-bit PCM.
# Arguments: $1 - number of PCM data bytes that will follow the header
# Outputs:   the header on stdout
#######################################
wav_header() {
  local datasize=$1
  printf 'RIFF'
  le32 $((datasize + 36))   # RIFF size = whole file minus the 8-byte RIFF tag/size
  printf 'WAVEfmt '
  printf '\x10\x00\x00\x00' # fmt chunk size: 16
  printf '\x01\x00'         # audio format: PCM
  printf '\x01\x00'         # channels: mono
  printf '\xc0\x5d\x00\x00' # sample rate: 24000 Hz
  printf '\x80\xbb\x00\x00' # byte rate: 24000 * 2 = 48000
  printf '\x02\x00'         # block align: 2 bytes per frame
  printf '\x10\x00'         # bits per sample: 16
  printf 'data'
  le32 "$datasize"
}

DATASIZE=$RAWSIZE
{
  wav_header "$DATASIZE"
  cat "$TMPRAW"
} > "$TMPWAV"

# Play the finished WAV: no video window, quit when playback ends, and keep
# ffplay's own logging silent.
ffplay_opts=(-nodisp -autoexit -loglevel quiet)
ffplay "${ffplay_opts[@]}" "$TMPWAV"