.agents/skills/ai-video-director/scripts/generate-voiceover.sh

101 lines
2.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Generate a voiceover with Gemini TTS (gemini-2.5-flash-preview-tts).
# Outputs a WAV file from text input.
#
# Usage:
# generate-voiceover.sh <text> <output.wav> [voice]   (voice defaults to Orus)
#
# Voices: Kore, Zephyr, Puck, Charon, Fenrir, Leda, Orus, Aoede,
# Enceladus, Achernar, Alnilam, Schedar, and 18 more.
# Control pace/tone via natural language in the text itself.
#
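# Env: GOOGLE_AI_API_KEY must be set. If it is unset or empty, the script
# sources $AI_STUDIO_DIR/.env.local when that file exists
# (AI_STUDIO_DIR defaults to /mnt/work/dev/ai-studio-videos).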
set -euo pipefail

# Require the two mandatory positional arguments up front. Without this check
# `set -u` aborts on "$1"/"$2" with a bare "unbound variable" message and no
# hint about how the script is meant to be called.
if (( $# < 2 )); then
  echo "Usage: ${0##*/} <text> <output.wav> [voice]" >&2
  exit 1
fi

TEXT="$1"          # text to synthesize
OUTPUT="$2"        # destination audio file
VOICE="${3:-Orus}" # prebuilt voice name; Orus when not given

# Directory that may hold .env.local; override with AI_STUDIO_DIR.
PROJECT_DIR="${AI_STUDIO_DIR:-/mnt/work/dev/ai-studio-videos}"

# Fall back to .env.local only when the key is not already in the environment.
if [[ -z "${GOOGLE_AI_API_KEY:-}" && -f "$PROJECT_DIR/.env.local" ]]; then
  # shellcheck source=/dev/null
  source "$PROJECT_DIR/.env.local"
fi

# Fail early and clearly if the key is still missing, rather than later in
# the middle of the API request.
if [[ -z "${GOOGLE_AI_API_KEY:-}" ]]; then
  echo "GOOGLE_AI_API_KEY is not set (checked environment and $PROJECT_DIR/.env.local)" >&2
  exit 1
fi

# Scratch directory for the request body and decoded audio; the EXIT trap
# removes it on every exit path.
# NOTE: TMPDIR is also the conventional temp-location variable. If the caller
# exported it, child processes will see this scratch directory instead. The
# name is kept because later steps of this script reference "$TMPDIR".
TMPDIR=$(mktemp -d)
trap 'rm -rf -- "$TMPDIR"' EXIT

# Make sure the directory that will receive the output file exists.
mkdir -p -- "$(dirname -- "$OUTPUT")"
# Assemble the generateContent request body and store it in the scratch dir.
# TEXT and VOICE are handed to jq as --arg values, so jq performs the JSON
# string escaping.
jq --null-input \
  --arg speaker "$VOICE" \
  --arg prompt "$TEXT" \
  '{
    contents: [{parts: [{text: $prompt}]}],
    generationConfig: {
      responseModalities: ["AUDIO"],
      speechConfig: {
        voiceConfig: {prebuiltVoiceConfig: {voiceName: $speaker}}
      }
    }
  }' >"$TMPDIR/request.json"
# The request below needs the key; abort with a clear message if it is absent.
: "${GOOGLE_AI_API_KEY:?GOOGLE_AI_API_KEY must be set}"

# Progress output: the chosen voice and a preview of the text, truncated to
# 80 characters with a trailing "..." when the text is longer.
text_preview="${TEXT:0:80}"
if (( ${#TEXT} > 80 )); then
  text_preview+="..."
fi
echo "Generating voiceover..."
echo " Voice: $VOICE"
echo " Text: $text_preview"

# POST the request to the Gemini TTS endpoint and capture the JSON response.
#   -sS            no progress meter, but transport errors are still printed
#   -H @<(...)     the API-key header is read from a file descriptor so the
#                  key never appears in curl's argv (visible via `ps`);
#                  requires curl >= 7.55.0
#   --data-binary  send the request file byte-for-byte
if ! RESPONSE=$(
  curl -sS -X POST \
    "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \
    -H @<(printf 'x-goog-api-key: %s\n' "$GOOGLE_AI_API_KEY") \
    -H "Content-Type: application/json" \
    --data-binary @"$TMPDIR/request.json"
); then
  # curl has already printed the reason on stderr (because of -S).
  echo "Request to the Gemini API failed." >&2
  exit 1
fi
# Stop if the response body carries an API error message.
ERROR=$(jq -r '.error.message // empty' <<<"$RESPONSE")
if [[ "$ERROR" != "" ]]; then
  echo "API Error: $ERROR" >&2
  exit 1
fi

# Pull the inlineData payload out of the first candidate's parts
# (base64-encoded PCM, 24kHz mono 16-bit). A missing parts list yields
# an empty result instead of a jq error.
AUDIO=$(
  jq -r '(.candidates[0].content.parts // [])[] | select(.inlineData) | .inlineData.data // empty' \
    <<<"$RESPONSE"
)
if [[ "$AUDIO" == "" ]]; then
  # Nothing to decode: report it and dump the full response for debugging.
  {
    echo "No audio returned."
    jq '.' <<<"$RESPONSE"
  } >&2
  exit 1
fi
# Decode the base64 payload into raw PCM samples in the scratch directory.
printf '%s\n' "$AUDIO" | base64 -d >"$TMPDIR/raw.pcm"

# Wrap the raw PCM (24kHz, mono, 16-bit signed LE) in a WAV container.
# Preference order: ffmpeg, then python3; with neither, keep the raw PCM.
if command -v ffmpeg &>/dev/null; then
  # -nostdin keeps ffmpeg from consuming the script's stdin.
  # -loglevel error silences the banner/progress but still reports real
  # failures on stderr, so a failed conversion is not silent under `set -e`.
  ffmpeg -nostdin -hide_banner -loglevel error \
    -f s16le -ar 24000 -ac 1 -i "$TMPDIR/raw.pcm" \
    -y "$OUTPUT"
elif command -v python3 &>/dev/null; then
  # Paths are passed via argv instead of being pasted into the Python source,
  # so quotes or other special characters in OUTPUT cannot break (or inject
  # into) the program text.
  python3 -c '
import sys
import wave

pcm_path, wav_path = sys.argv[1], sys.argv[2]
with open(pcm_path, "rb") as src:
    pcm = src.read()
with wave.open(wav_path, "wb") as dst:
    dst.setnchannels(1)      # mono
    dst.setsampwidth(2)      # 16-bit samples
    dst.setframerate(24000)  # 24 kHz
    dst.writeframes(pcm)
' "$TMPDIR/raw.pcm" "$OUTPUT"
else
  # Best-effort fallback: no converter available, so save the headerless PCM.
  cp -- "$TMPDIR/raw.pcm" "$OUTPUT"
  echo "Warning: no ffmpeg or python3 - saved raw PCM" >&2
fi

# Report the output path and its human-readable size.
echo "Saved: $OUTPUT ($(du -h -- "$OUTPUT" | cut -f1))"