#!/usr/bin/env bash
#
# speak.sh - speak a piece of text aloud.
#
# Usage: speak.sh <text...>
#   All arguments are joined with spaces and sent as one user message.
#
# Environment:
#   VOICE   voice name placed in the request payload (default: ash)
#
# Flow:
#   1. Build a small shell command that streams a chat completion from
#      OpenRouter and prints each base64 audio chunk on its own line.
#   2. Ask the local daemon ($DAEMON) to run that command through its
#      /api/secrets/OPENROUTER_API_KEY/exec endpoint, which is expected to
#      provide OPENROUTER_API_KEY to the command.
#   3. Decode the chunks to raw PCM16, wrap them in a WAV header and play the
#      result with ffplay.
set -euo pipefail

VOICE="${VOICE:-ash}"
TEXT="$*"
DAEMON="http://localhost:3850"

# Print a message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Write a 32-bit unsigned integer to stdout as 4 little-endian bytes.
# Arguments: $1 - the integer value
le32() {
  local n=$1 fmt
  printf -v fmt '\\x%02x\\x%02x\\x%02x\\x%02x' \
    $((n & 0xFF)) $(((n >> 8) & 0xFF)) $(((n >> 16) & 0xFF)) $(((n >> 24) & 0xFF))
  # fmt only ever contains generated \xNN escapes, never external data.
  # shellcheck disable=SC2059
  printf "$fmt"
}

if [[ -z "$TEXT" ]]; then
  die "Usage: speak.sh <text>"
fi

# Fail before any request is made if a required tool is missing.
for tool in jq curl base64 ffplay; do
  command -v "$tool" >/dev/null 2>&1 || die "Error: required tool not found: $tool"
done

# A private temp directory keeps the .raw/.wav names fixed while staying
# portable: the mktemp template ends in X's, which both GNU and BSD accept.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/speak-XXXXXX") || die "Error: mktemp failed"
TMPRAW="$WORKDIR/audio.raw"
TMPWAV="$WORKDIR/audio.wav"
cleanup() { rm -rf -- "$WORKDIR"; }
trap cleanup EXIT

# The inner command streams the SSE response and prints every base64 audio
# chunk it finds, one chunk per line. jq embeds the text safely into the JSON
# payload. The heredoc delimiter is quoted, so nothing in here is expanded by
# this script; it is kept to POSIX sh constructs.
# read returns non-zero at end of input when no NUL delimiter is found, hence
# the "|| true".
read -r -d '' INNER_CMD <<'INNEREOF' || true
PAYLOAD=$(jq -n \
  --arg text "$SPEAK_TEXT" \
  --arg voice "$SPEAK_VOICE" \
  '{
    model: "openai/gpt-audio-mini",
    modalities: ["text", "audio"],
    audio: { voice: $voice, format: "pcm16" },
    stream: true,
    messages: [{ role: "user", content: $text }]
  }')

curl -fsSN https://openrouter.ai/api/v1/chat/completions \
  -H "Authorization: Bearer $OPENROUTER_API_KEY" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD" |
  while IFS= read -r line; do
    # Strip "data: " prefix from SSE
    line="${line#data: }"
    [ -z "$line" ] && continue
    [ "$line" = "[DONE]" ] && continue
    # Extract audio data chunk if present; lines that are not JSON are
    # skipped on purpose (jq errors are discarded).
    chunk=$(printf '%s\n' "$line" | jq -r '.choices[0].delta.audio.data // empty' 2>/dev/null)
    if [ -n "$chunk" ]; then
      printf '%s\n' "$chunk"
    fi
  done
INNEREOF

# SPEAK_TEXT and SPEAK_VOICE are handed to the inner command by prepending
# shell-quoted export statements to it.
# NOTE(review): printf %q produces bash-style quoting; this assumes the daemon
# runs the command with a bash-compatible shell - confirm.
INNER_WITH_VARS="export SPEAK_TEXT=$(printf '%q' "$TEXT"); export SPEAK_VOICE=$(printf '%q' "$VOICE"); $INNER_CMD"

# Execute the streaming command via the daemon's exec endpoint.
EXEC_RESPONSE=$(
  curl -sS "$DAEMON/api/secrets/OPENROUTER_API_KEY/exec" \
    -H "Content-Type: application/json" \
    -d "$(jq -n --arg cmd "$INNER_WITH_VARS" '{ command: $cmd }')"
) || die "Error: request to daemon at $DAEMON failed"

# Extract the base64 audio chunks (one per line) from the command's stdout.
B64_AUDIO=$(jq -r '.stdout // empty' <<<"$EXEC_RESPONSE") ||
  die "Error: daemon response is not valid JSON"

if [[ -z "$B64_AUDIO" ]]; then
  STDERR=$(jq -r '.stderr // empty' <<<"$EXEC_RESPONSE")
  printf '%s\n' "Error: No audio data received" >&2
  if [[ -n "$STDERR" ]]; then
    printf '%s\n' "stderr: $STDERR" >&2
  fi
  exit 1
fi

# Decode base64 to raw PCM16. Each chunk is decoded on its own: a chunk may
# end in '=' padding, and padding in the middle of a concatenated stream is
# not guaranteed to decode correctly.
: >"$TMPRAW"
while IFS= read -r chunk; do
  [[ -n "$chunk" ]] || continue
  printf '%s' "$chunk" | base64 -d >>"$TMPRAW" ||
    die "Error: Failed to decode base64 audio data"
done <<<"$B64_AUDIO"

# wc -c is portable; the arithmetic expansion strips the padding some
# implementations put around the number.
RAWSIZE=$(wc -c <"$TMPRAW")
RAWSIZE=$((RAWSIZE))

if ((RAWSIZE < 100)); then
  die "Error: Audio data too small ($RAWSIZE bytes)"
fi

# Wrap raw PCM16 in a WAV header (24kHz, mono, 16-bit LE).
# The header is 44 bytes; the RIFF size field excludes its first 8 bytes,
# hence data size + 36.
DATASIZE=$RAWSIZE
FILESIZE=$((DATASIZE + 36))

{
  printf 'RIFF'
  le32 "$FILESIZE"
  printf 'WAVEfmt '
  printf '\x10\x00\x00\x00' # fmt chunk size: 16
  printf '\x01\x00'         # PCM format
  printf '\x01\x00'         # mono
  printf '\xc0\x5d\x00\x00' # 24000 Hz sample rate
  printf '\x80\xbb\x00\x00' # byte rate (24000 * 2)
  printf '\x02\x00'         # block align
  printf '\x10\x00'         # 16 bits per sample
  printf 'data'
  le32 "$DATASIZE"
  cat "$TMPRAW"
} >"$TMPWAV"

ffplay -nodisp -autoexit -loglevel quiet "$TMPWAV"