.agents/skills/ai-video-director/scripts/generate-voiceover.sh

#!/usr/bin/env bash
# Generate a voiceover with Gemini TTS (gemini-2.5-flash-preview-tts).
# Outputs a WAV file from text input (raw PCM if neither ffmpeg nor python3
# is installed). Requires curl, jq and base64.
#
# Usage:
#   generate-voiceover.sh <text> <output.wav> [voice]
#
# Voices: Kore, Zephyr, Puck, Charon, Fenrir, Leda, Orus, Aoede,
#         Enceladus, Achernar, Alnilam, Schedar, and 18 more.
# Control pace/tone via natural language in the text itself.
#
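# Env:
#   GOOGLE_AI_API_KEY  required; if unset or empty, it is loaded from
#                      $AI_STUDIO_DIR/.env.local when that file exists.
#   AI_STUDIO_DIR      project directory containing .env.local
#                      (default: /mnt/work/dev/ai-studio-videos).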
set -euo pipefail

# Print a message to stderr and exit with the given status (default 1).
die() {
  printf '%s\n' "$1" >&2
  exit "${2:-1}"
}

# Validate arguments up front: under `set -u` a missing positional would
# otherwise abort with a bare "unbound variable" message.
if (( $# < 2 )); then
  die "Usage: ${0##*/} <text> <output.wav> [voice]" 2
fi

TEXT="$1"
OUTPUT="$2"
VOICE="${3:-Orus}"

[[ -n "$TEXT" ]] || die "Error: <text> must not be empty" 2
[[ -n "$OUTPUT" ]] || die "Error: <output.wav> must not be empty" 2

# curl, jq and base64 are hard requirements; fail early with a clear message.
for TOOL in curl jq base64; do
  command -v "$TOOL" >/dev/null 2>&1 \
    || die "Error: required tool '$TOOL' not found in PATH"
done

PROJECT_DIR="${AI_STUDIO_DIR:-/mnt/work/dev/ai-studio-videos}"

# Only fall back to .env.local when the key is not already in the environment.
if [[ -z "${GOOGLE_AI_API_KEY:-}" && -f "$PROJECT_DIR/.env.local" ]]; then
  # shellcheck disable=SC1091  # path is only known at runtime
  source "$PROJECT_DIR/.env.local"
fi

[[ -n "${GOOGLE_AI_API_KEY:-}" ]] \
  || die "Error: GOOGLE_AI_API_KEY is not set (export it or define it in $PROJECT_DIR/.env.local)"

# Private scratch directory. Deliberately NOT named TMPDIR: that variable is
# the standard temp-dir override and overwriting it would leak into child
# processes when it is exported.
WORK_DIR=$(mktemp -d) || die "Error: mktemp failed"
trap 'rm -rf -- "$WORK_DIR"' EXIT

REQUEST_FILE="$WORK_DIR/request.json"
HEADER_FILE="$WORK_DIR/headers.txt"
RESPONSE_FILE="$WORK_DIR/response.json"
AUDIO_FILE="$WORK_DIR/audio.b64"
PCM_FILE="$WORK_DIR/raw.pcm"

mkdir -p -- "$(dirname -- "$OUTPUT")"

# Build the request; jq handles all JSON escaping of the text and voice name.
jq -n \
  --arg text "$TEXT" \
  --arg voice "$VOICE" \
  '{
    contents: [{
      parts: [{text: $text}]
    }],
    generationConfig: {
      responseModalities: ["AUDIO"],
      speechConfig: {
        voiceConfig: {
          prebuiltVoiceConfig: {
            voiceName: $voice
          }
        }
      }
    }
  }' > "$REQUEST_FILE"

# Show at most 80 characters of the text so long scripts do not flood the log.
PREVIEW="${TEXT:0:80}"
if (( ${#TEXT} > 80 )); then
  PREVIEW+="..."
fi

echo "Generating voiceover..."
printf '  Voice: %s\n' "$VOICE"
printf '  Text: %s\n' "$PREVIEW"

# Pass the API key through a header file (inside the mode-0700 scratch dir)
# instead of argv, so it does not show up in process listings.
printf 'x-goog-api-key: %s\n' "$GOOGLE_AI_API_KEY" > "$HEADER_FILE"

# -sS: no progress meter, but still report transport errors. The body is
# written to a file because it can hold several MB of base64 audio.
# --fail is intentionally not used: HTTP error responses carry a JSON body
# whose message is reported below.
curl -sS -X POST \
  "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \
  -H @"$HEADER_FILE" \
  -H "Content-Type: application/json" \
  -d @"$REQUEST_FILE" \
  -o "$RESPONSE_FILE" \
  || die "Error: request to the Gemini API failed (curl exit $?)"

# Guard against non-JSON bodies (proxy error pages, empty replies) so the
# user sees the payload instead of a jq parse error.
if ! jq -e 'type == "object"' "$RESPONSE_FILE" >/dev/null 2>&1; then
  printf 'Error: unexpected non-JSON response from API:\n' >&2
  head -c 500 -- "$RESPONSE_FILE" >&2 || true
  printf '\n' >&2
  exit 1
fi

# API-level error reported in the response body.
ERROR=$(jq -r '.error.message? // empty' "$RESPONSE_FILE")
if [[ -n "$ERROR" ]]; then
  die "API Error: $ERROR"
fi

# Extract audio data (base64-encoded PCM, 24kHz mono 16-bit).
jq -r '(.candidates[0].content.parts // [])[]
       | select(.inlineData)
       | (.inlineData.data // empty)
       | select(. != "")' "$RESPONSE_FILE" > "$AUDIO_FILE" \
  || die "Error: could not parse audio data from API response"

if [[ ! -s "$AUDIO_FILE" ]]; then
  echo "No audio returned." >&2
  jq '.' "$RESPONSE_FILE" >&2
  exit 1
fi

# Decode base64 to raw PCM.
base64 -d < "$AUDIO_FILE" > "$PCM_FILE" \
  || die "Error: failed to base64-decode audio payload"

# Convert raw PCM to WAV (24kHz, mono, 16-bit signed LE),
# using ffmpeg if available, otherwise python3.
if command -v ffmpeg >/dev/null 2>&1; then
  # -loglevel error keeps the output quiet without hiding real failures;
  # -nostdin stops ffmpeg from reading the caller's stdin.
  ffmpeg -nostdin -loglevel error \
    -f s16le -ar 24000 -ac 1 -i "$PCM_FILE" \
    -y "$OUTPUT" \
    || die "Error: ffmpeg failed to write $OUTPUT"
elif command -v python3 >/dev/null 2>&1; then
  # Paths are passed as argv (never interpolated into the Python source), so
  # quotes or other special characters in them cannot break or inject code.
  python3 - "$PCM_FILE" "$OUTPUT" <<'PY' || die "Error: python3 failed to write $OUTPUT"
import sys
import wave

src, dst = sys.argv[1], sys.argv[2]
with open(src, "rb") as f:
    pcm = f.read()
with wave.open(dst, "wb") as w:
    w.setnchannels(1)
    w.setsampwidth(2)
    w.setframerate(24000)
    w.writeframes(pcm)
PY
else
  # Best-effort fallback: just save the raw PCM (no WAV header).
  cp -- "$PCM_FILE" "$OUTPUT"
  echo "Warning: no ffmpeg or python3 - saved raw PCM" >&2
fi

echo "Saved: $OUTPUT ($(du -h -- "$OUTPUT" | cut -f1))"