.agents/scripts/speak.sh

#!/usr/bin/env bash
#
# speak.sh - speak a piece of text aloud.
#
# The text is sent to the openai/gpt-audio-mini model via OpenRouter (the
# request is executed by a local daemon so that the API key never has to be
# present in this process), the returned PCM16 audio is wrapped in a WAV
# container and played with ffplay.
#
# Usage:       speak.sh <text to speak>
# Environment: VOICE - voice name passed to the model (default: ash)
# Requires:    curl, jq, base64, ffplay, and the daemon listening on $DAEMON
set -euo pipefail

VOICE="${VOICE:-ash}"
# All arguments are joined with spaces into a single utterance.
TEXT="$*"
DAEMON="http://localhost:3850"

# Keep both scratch files in one private directory. A template with trailing
# X's works with GNU and BSD mktemp alike; a template with a suffix after the
# X's (speak-XXXX.raw) is not randomised by BSD mktemp, which would yield a
# fixed, predictable file name in /tmp.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/speak-XXXXXX")
TMPRAW="$WORKDIR/audio.raw"
TMPWAV="$WORKDIR/audio.wav"

# Remove the scratch directory on every exit path.
cleanup() {
  rm -rf -- "$WORKDIR"
}
trap cleanup EXIT

# Nothing to say: print usage on stderr (it is a diagnostic, not output)
# and fail.
if [[ -z "$TEXT" ]]; then
  echo "Usage: speak.sh <text to speak>" >&2
  exit 1
fi

# INNER_CMD is the script that gets executed on our behalf with the API key
# in its environment. It builds the chat-completion request with jq (so
# arbitrary text is safely JSON-encoded), streams the SSE response, and prints
# every base64 audio chunk back-to-back on stdout.
#
# The heredoc delimiter is quoted, so nothing below is expanded here:
# $SPEAK_TEXT, $SPEAK_VOICE and $OPENROUTER_API_KEY are resolved only when the
# command is actually run.
#
# `read -d ''` reaches end-of-input without ever seeing a NUL delimiter and
# therefore returns non-zero; `|| true` keeps `set -e` from aborting on that.
read -r -d '' INNER_CMD << 'INNEREOF' || true
PAYLOAD=$(jq -n \
  --arg text "$SPEAK_TEXT" \
  --arg voice "$SPEAK_VOICE" \
  '{
    model: "openai/gpt-audio-mini",
    modalities: ["text", "audio"],
    audio: { voice: $voice, format: "pcm16" },
    stream: true,
    messages: [{ role: "user", content: $text }]
  }')

# -S and --fail make curl report transport and HTTP errors on stderr instead
# of failing silently, so a failed request leaves a diagnostic behind.
curl -sSN --fail https://openrouter.ai/api/v1/chat/completions \
  -H "Authorization: Bearer $OPENROUTER_API_KEY" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD" | \
while IFS= read -r line; do
  # Strip "data: " prefix from SSE
  line="${line#data: }"
  [ -z "$line" ] && continue
  [ "$line" = "[DONE]" ] && continue
  # Extract audio data chunk if present; lines that are not JSON yield nothing
  chunk=$(printf '%s\n' "$line" | jq -r '.choices[0].delta.audio.data // empty' 2>/dev/null)
  # Plain `if` so that a final line without audio does not leave the loop
  # (and thus the whole command) with a non-zero status.
  if [ -n "$chunk" ]; then
    printf '%s' "$chunk"
  fi
done
INNEREOF

# Execute the streaming command via the daemon's secret-exec endpoint, which
# injects OPENROUTER_API_KEY into the command's environment.
# SPEAK_TEXT and SPEAK_VOICE are not secrets: they are prepended to the
# command itself as shell-quoted `export` statements.
# NOTE(review): printf %q may emit bash-only $'...' quoting for text with
# control characters - assumes the daemon runs the command with bash; confirm.
INNER_WITH_VARS="export SPEAK_TEXT=$(printf '%q' "$TEXT"); export SPEAK_VOICE=$(printf '%q' "$VOICE"); $INNER_CMD"

# Build the request body in its own assignment so a jq failure stops the
# script instead of being masked inside curl's argument list.
EXEC_REQUEST=$(jq -n --arg cmd "$INNER_WITH_VARS" '{ command: $cmd }')

# -S keeps curl's own error text; the explicit check turns an unreachable
# daemon into a readable message rather than a silent `set -e` exit.
if ! EXEC_RESPONSE=$(curl -sS "$DAEMON/api/secrets/OPENROUTER_API_KEY/exec" \
  -H "Content-Type: application/json" \
  -d "$EXEC_REQUEST"); then
  echo "Error: Request to daemon at $DAEMON failed" >&2
  exit 1
fi

# Extract the concatenated base64 audio from the command's captured stdout
if ! B64_AUDIO=$(jq -r '.stdout // empty' <<< "$EXEC_RESPONSE" 2>/dev/null); then
  echo "Error: Daemon returned a response that is not valid JSON" >&2
  exit 1
fi

if [ -z "$B64_AUDIO" ]; then
  STDERR=$(jq -r '.stderr // empty' <<< "$EXEC_RESPONSE")
  echo "Error: No audio data received" >&2
  if [ -n "$STDERR" ]; then
    echo "stderr: $STDERR" >&2
  fi
  exit 1
fi

# Decode base64 to raw PCM16.
# base64's own diagnostics are left visible and the failure is reported
# explicitly; otherwise pipefail + `set -e` would end the script without a
# word of explanation.
# NOTE(review): the input is a back-to-back concatenation of separately
# encoded chunks, so '=' padding may occur mid-stream - confirm that the
# base64 implementation in use decodes past it.
if ! printf '%s\n' "$B64_AUDIO" | base64 -d > "$TMPRAW"; then
  echo "Error: Failed to decode base64 audio data" >&2
  exit 1
fi

# wc -c is POSIX and behaves the same on GNU and BSD; the arithmetic
# expansion strips the padding some implementations put around the count.
RAWSIZE=$(($(wc -c < "$TMPRAW")))
# Anything under 100 bytes is treated as a failed response.
if [ "$RAWSIZE" -lt 100 ]; then
  echo "Error: Audio data too small ($RAWSIZE bytes)" >&2
  exit 1
fi

# Wrap raw PCM16 in a WAV header (24kHz, mono, 16-bit LE)

#######################################
# Write a 32-bit unsigned integer to stdout as four little-endian bytes.
# Arguments: $1 - value to encode
# Outputs:   exactly 4 bytes on stdout
#######################################
le32() {
  local value=$1
  local escapes
  # Build the literal text "\xNN\xNN\xNN\xNN" (least significant byte first)...
  printf -v escapes '\\x%02x\\x%02x\\x%02x\\x%02x' \
    "$((value & 0xFF))" \
    "$(((value >> 8) & 0xFF))" \
    "$(((value >> 16) & 0xFF))" \
    "$(((value >> 24) & 0xFF))"
  # ...then let printf turn those escapes into raw bytes.
  # shellcheck disable=SC2059 # escapes holds only \xNN sequences built above
  printf "$escapes"
}

#######################################
# Write the 44-byte WAV header for 24 kHz mono 16-bit PCM audio.
# Arguments: $1 - size of the PCM payload in bytes
# Outputs:   exactly 44 bytes on stdout
#######################################
wav_header() {
  local data_bytes=$1
  printf 'RIFF'
  le32 "$((data_bytes + 36))"  # RIFF chunk size: the 44-byte header + data, minus the 8-byte RIFF preamble
  printf 'WAVEfmt '
  printf '\x10\x00\x00\x00'  # chunk size 16
  printf '\x01\x00'          # PCM format
  printf '\x01\x00'          # mono
  printf '\xc0\x5d\x00\x00'  # 24000 Hz sample rate
  printf '\x80\xbb\x00\x00'  # byte rate (24000 * 2)
  printf '\x02\x00'          # block align
  printf '\x10\x00'          # 16 bits per sample
  printf 'data'
  le32 "$data_bytes"         # data chunk size
}

DATASIZE=$RAWSIZE
{
  wav_header "$DATASIZE"
  cat "$TMPRAW"
} > "$TMPWAV"

# Play the assembled WAV: no video window, exit when playback ends, and keep
# ffplay's own logging silent.
player_opts=(-nodisp -autoexit -loglevel quiet)
ffplay "${player_opts[@]}" "$TMPWAV"