101 lines
2.7 KiB
Bash
Executable File
101 lines
2.7 KiB
Bash
Executable File
#!/usr/bin/env bash
# Generate a voiceover with Gemini TTS (gemini-2.5-flash-preview-tts).
# Outputs a WAV file from text input.
#
# Usage:
#   generate-voiceover.sh <text> <output.wav> [voice]
#
# Arguments:
#   text        Text to speak. Control pace/tone via natural language in the
#               text itself.
#   output.wav  Destination file; parent directories are created as needed.
#   voice       Prebuilt voice name (default: Orus).
#
# Voices: Kore, Zephyr, Puck, Charon, Fenrir, Leda, Orus, Aoede,
#   Enceladus, Achernar, Alnilam, Schedar, and 18 more.
#
# Env:
#   GOOGLE_AI_API_KEY  Must be set (or sourced from $AI_STUDIO_DIR/.env.local).
#   AI_STUDIO_DIR      Project directory holding .env.local
#                      (default: /mnt/work/dev/ai-studio-videos).
#
# Requires: jq, curl (>= 7.55.0 for "-H @file"), base64, and ffmpeg or python3
#   to wrap the PCM in a WAV container (raw PCM is saved if neither exists).
#
# Exit status: 0 on success, 1 on any runtime/API error, 2 on usage error.

set -euo pipefail

readonly TTS_MODEL="gemini-2.5-flash-preview-tts"
readonly TTS_ENDPOINT="https://generativelanguage.googleapis.com/v1beta/models/${TTS_MODEL}:generateContent"
# The API returns base64-encoded PCM: 24kHz, mono, 16-bit signed little-endian.
readonly PCM_SAMPLE_RATE=24000
readonly DEFAULT_VOICE="Orus"

PROJECT_DIR="${AI_STUDIO_DIR:-/mnt/work/dev/ai-studio-videos}"

# Scratch directory; deliberately NOT named TMPDIR so the well-known
# environment variable is not overwritten for child processes.
work_dir=""

# Print a message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print usage to stderr and exit with status 2.
usage() {
  printf 'Usage: %s <text> <output.wav> [voice]\n' "${0##*/}" >&2
  exit 2
}

# Remove the scratch directory (installed as the EXIT trap).
cleanup() {
  if [[ -n "${work_dir:-}" ]]; then
    rm -rf -- "$work_dir"
  fi
}

# Abort with a clear message when a required command is missing.
require_cmd() {
  command -v "$1" >/dev/null 2>&1 || die "Missing required command: $1"
}

# Ensure GOOGLE_AI_API_KEY is available, sourcing .env.local if necessary.
load_api_key() {
  if [[ -z "${GOOGLE_AI_API_KEY:-}" && -f "$PROJECT_DIR/.env.local" ]]; then
    # shellcheck disable=SC1091 -- path is only known at runtime
    source "$PROJECT_DIR/.env.local"
  fi
  [[ -n "${GOOGLE_AI_API_KEY:-}" ]] \
    || die "GOOGLE_AI_API_KEY is not set (export it or define it in $PROJECT_DIR/.env.local)"
}

# Write the generateContent request body.
# Arguments: $1 - text, $2 - voice name, $3 - destination JSON file
build_request() {
  local text=$1 voice=$2 request_file=$3
  # jq --arg handles all JSON escaping of the user-supplied strings.
  jq -n \
    --arg text "$text" \
    --arg voice "$voice" \
    '{
      contents: [{
        parts: [{text: $text}]
      }],
      generationConfig: {
        responseModalities: ["AUDIO"],
        speechConfig: {
          voiceConfig: {
            prebuiltVoiceConfig: {
              voiceName: $voice
            }
          }
        }
      }
    }' > "$request_file" || die "Failed to build request JSON"
}

# POST the request and store the raw response body.
# Arguments: $1 - request JSON file, $2 - response file to write
call_api() {
  local request_file=$1 response_file=$2
  local header_file="$work_dir/headers"

  # Pass the key through a file (printf is a builtin) instead of argv so it
  # never shows up in the process list. work_dir is mode 0700 via mktemp -d.
  printf 'x-goog-api-key: %s\n' "$GOOGLE_AI_API_KEY" > "$header_file"

  # -S keeps transport errors visible despite -s. No --fail: HTTP error
  # responses carry a JSON body whose message is reported by the caller.
  curl -sS -X POST "$TTS_ENDPOINT" \
    -H @"$header_file" \
    -H "Content-Type: application/json" \
    --data-binary @"$request_file" \
    -o "$response_file" \
    || die "Request to Gemini API failed (curl exit status $?)"
}

# Exit with the API's own message if the response describes an error.
# Arguments: $1 - response file
check_api_error() {
  local response_file=$1 api_error
  api_error=$(jq -r '.error.message // empty' "$response_file") \
    || die "API returned a response that is not the expected JSON"
  if [[ -n "$api_error" ]]; then
    echo "API Error: $api_error" >&2
    exit 1
  fi
}

# Extract the base64 audio part(s) from the response and decode them to PCM.
# Arguments: $1 - response file, $2 - raw PCM file to write
decode_audio() {
  local response_file=$1 raw_file=$2
  local parts_file="$work_dir/audio.b64" chunk

  # One base64 string per line, one line per inlineData part.
  jq -r '(.candidates[0].content.parts // [])[]
         | select(.inlineData)
         | .inlineData.data // empty
         | select(type == "string" and length > 0)' \
    "$response_file" > "$parts_file" \
    || die "Could not extract audio from API response"

  if [[ ! -s "$parts_file" ]]; then
    echo "No audio returned." >&2
    jq '.' "$response_file" >&2
    exit 1
  fi

  # Decode each part on its own so padding at the end of one chunk cannot
  # corrupt the decoding of the next.
  : > "$raw_file"
  while IFS= read -r chunk; do
    printf '%s\n' "$chunk" | base64 -d >> "$raw_file" \
      || die "Audio payload is not valid base64"
  done < "$parts_file"
}

# Wrap raw PCM in a WAV container using ffmpeg if available, otherwise python3.
# Falls back to copying the raw PCM (with a warning) when neither exists.
# Arguments: $1 - raw PCM file, $2 - output path
pcm_to_wav() {
  local raw_file=$1 output=$2

  if command -v ffmpeg >/dev/null 2>&1; then
    # -loglevel error keeps the run quiet but still surfaces real failures;
    # -nostdin stops ffmpeg from consuming the caller's stdin.
    ffmpeg -nostdin -loglevel error \
      -f s16le -ar "$PCM_SAMPLE_RATE" -ac 1 -i "$raw_file" \
      -y "$output" \
      || die "ffmpeg failed to write $output"
  elif command -v python3 >/dev/null 2>&1; then
    # Paths travel via argv, never interpolated into the Python source, so
    # quotes or other special characters in them are harmless.
    python3 - "$raw_file" "$output" "$PCM_SAMPLE_RATE" <<'PY' || die "python3 failed to write $output"
import sys
import wave

raw_path, out_path, rate = sys.argv[1], sys.argv[2], int(sys.argv[3])
with open(raw_path, 'rb') as f:
    pcm = f.read()
with wave.open(out_path, 'wb') as w:
    w.setnchannels(1)
    w.setsampwidth(2)
    w.setframerate(rate)
    w.writeframes(pcm)
PY
  else
    # fallback: just save raw PCM
    cp "$raw_file" "$output"
    echo "Warning: no ffmpeg or python3 - saved raw PCM" >&2
  fi
}

main() {
  (( $# >= 2 )) || usage

  local text=$1 output=$2 voice=${3:-$DEFAULT_VOICE}
  [[ -n "$text" ]] || die "Text must not be empty"
  [[ -n "$output" ]] || die "Output path must not be empty"

  # Keep a leading '-' from being parsed as an option by mkdir/ffmpeg/cp/du.
  if [[ "$output" == -* ]]; then
    output="./$output"
  fi

  require_cmd jq
  require_cmd curl
  require_cmd base64
  load_api_key

  work_dir=$(mktemp -d) || die "mktemp failed"
  trap cleanup EXIT

  mkdir -p "$(dirname "$output")"

  local request_file="$work_dir/request.json"
  local response_file="$work_dir/response.json"
  local raw_file="$work_dir/raw.pcm"

  build_request "$text" "$voice" "$request_file"

  local preview=${text:0:80}
  if (( ${#text} > 80 )); then
    preview+='...'
  fi
  echo "Generating voiceover..."
  echo " Voice: $voice"
  echo " Text: $preview"

  call_api "$request_file" "$response_file"
  check_api_error "$response_file"
  decode_audio "$response_file" "$raw_file"
  pcm_to_wav "$raw_file" "$output"

  echo "Saved: $output ($(du -h "$output" | cut -f1))"
}

main "$@"