beatmatchr/backend/services/media_ingest.py
2025-11-10 16:11:36 -05:00

221 lines
7.2 KiB
Python

from __future__ import annotations

import io
import json
import logging
import os
import subprocess
import tempfile
import uuid
from datetime import datetime
from pathlib import Path
from typing import Dict, List
from urllib.parse import urlsplit

import requests
from PIL import Image

from ..db import db_session
from ..models import Project, SourceClip
from . import storage

logger = logging.getLogger(__name__)
def resolve_media_urls_from_input(url: str) -> List[str]:
    """Resolve direct media URLs for a user-provided URL using yt-dlp.

    yt-dlp is run with ``--dump-json`` and each line of its stdout is parsed
    as a JSON document. For every document the ``url`` field is preferred,
    falling back to ``webpage_url``.

    Args:
        url: The URL supplied by the user.

    Returns:
        The list of resolved URLs. Falls back to ``[url]`` when yt-dlp is not
        installed, exits with a non-zero status, or yields no usable entries.
    """
    try:
        process = subprocess.run(
            [
                "yt-dlp",
                "--dump-json",
                "--skip-download",
                # End of options: a user-supplied value that starts with "-"
                # must be treated as a URL, never as a yt-dlp flag.
                "--",
                url,
            ],
            check=False,
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        logger.warning("yt-dlp not installed; returning provided URL directly")
        return [url]

    if process.returncode != 0:
        logger.error("yt-dlp failed for %s: %s", url, process.stderr.strip())
        return [url]

    media_urls: List[str] = []
    for line in process.stdout.strip().splitlines():
        try:
            payload = json.loads(line)
        except json.JSONDecodeError:
            # Non-JSON noise on stdout is skipped rather than treated as fatal.
            continue
        if not isinstance(payload, dict):
            # Valid JSON that is not an object (e.g. "null", a number) has no
            # fields to read; skip it instead of crashing on .get().
            continue
        url_field = payload.get("url") or payload.get("webpage_url")
        if url_field:
            media_urls.append(url_field)

    # Nothing usable came back: let the caller try the original URL as-is.
    return media_urls or [url]
def download_media_file(media_url: str) -> str:
    """Download a media file to a temporary local path and return that path.

    The temporary file is created with ``delete=False``; the caller is
    responsible for removing it once it is no longer needed.

    Args:
        media_url: URL to fetch with an HTTP GET request.

    Returns:
        Path of the temporary file containing the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
    """
    # Take the extension from the URL *path* only. Using the whole URL would
    # drag the query string into the suffix (".mp4?token=...") or produce
    # garbage for URLs whose query contains dots or slashes.
    suffix = Path(urlsplit(media_url).path).suffix or ".mp4"

    # The context manager releases the streamed connection even on failure.
    with requests.get(media_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        try:
            with temp_file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        temp_file.write(chunk)
        except Exception:
            # Do not leave a partially written download behind.
            if os.path.exists(temp_file.name):
                os.remove(temp_file.name)
            raise
    return temp_file.name
def _parse_frame_rate(raw: str | None) -> float | None:
    """Convert an ffprobe rational such as ``"30000/1001"`` to a float, or None."""
    if not raw:
        return None
    numerator, _, denominator = str(raw).partition("/")
    try:
        # A bare number without "/" is treated as "<n>/1".
        return float(numerator) / float(denominator or "1")
    except (ValueError, ZeroDivisionError):
        return None


def _parse_duration(raw: object) -> float | None:
    """Convert ffprobe's duration field to seconds, or None when unusable."""
    if not raw:
        return None
    try:
        return float(raw)
    except (TypeError, ValueError):
        # Non-numeric placeholders such as "N/A" mean the duration is unknown.
        return None


def extract_video_metadata(local_path: str) -> Dict[str, float | int | None]:
    """Extract video metadata using ffprobe.

    Args:
        local_path: Path of the media file to inspect.

    Returns:
        A dict with the keys ``duration_seconds``, ``width``, ``height`` and
        ``fps``. ``width``/``height`` are passed through from the first video
        stream as reported by ffprobe; any value that is missing or cannot be
        parsed is ``None``.

    Raises:
        RuntimeError: If ffprobe exits with a non-zero status.
    """
    command = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_entries",
        "stream=width,height,r_frame_rate:format=duration",
        "-of",
        "json",
        local_path,
    ]
    result = subprocess.run(command, capture_output=True, text=True, check=False)
    if result.returncode != 0:
        raise RuntimeError(f"ffprobe failed: {result.stderr}")

    payload = json.loads(result.stdout)
    # "streams" may be absent or empty when the file has no video stream.
    stream = (payload.get("streams") or [{}])[0]
    format_info = payload.get("format") or {}

    return {
        "duration_seconds": _parse_duration(format_info.get("duration")),
        "width": stream.get("width"),
        "height": stream.get("height"),
        "fps": _parse_frame_rate(stream.get("r_frame_rate")),
    }
def generate_thumbnail(local_video_path: str, time_seconds: float = 0.5) -> bytes:
    """Generate a JPEG thumbnail for a video clip using ffmpeg.

    A single frame is grabbed with ffmpeg, then scaled with Pillow to a width
    of 480 pixels while keeping the aspect ratio.

    Args:
        local_video_path: Path of the video file to read.
        time_seconds: Position passed to ffmpeg's ``-ss`` option.

    Returns:
        The encoded JPEG image as bytes.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    # Reserve a unique output path. The file already exists once this block
    # exits, which is why ffmpeg must be told to overwrite it (see "-y").
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_image:
        temp_image_path = temp_image.name

    try:
        command = [
            "ffmpeg",
            # Never read from the parent's stdin (no interactive prompts).
            "-nostdin",
            # Overwrite the pre-created temp file instead of asking/aborting.
            "-y",
            "-ss",
            str(time_seconds),
            "-i",
            local_video_path,
            "-frames:v",
            "1",
            "-q:v",
            "2",
            temp_image_path,
        ]
        result = subprocess.run(command, capture_output=True, check=False)
        if result.returncode != 0:
            # stderr is captured as bytes; decode so the message is readable.
            stderr_text = result.stderr.decode("utf-8", errors="replace")
            raise RuntimeError(f"ffmpeg thumbnail generation failed: {stderr_text}")

        with Image.open(temp_image_path) as img:
            width = 480
            ratio = width / float(img.width)
            # Guard against a zero height for extremely wide frames.
            height = max(1, int(img.height * ratio))
            resized = img.resize((width, height))

        # Encode in memory; no second temporary file is needed.
        buffer = io.BytesIO()
        resized.save(buffer, format="JPEG", quality=90)
        return buffer.getvalue()
    finally:
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)
def ingest_single_media_url(project_id: str, input_url: str, origin: str = "url") -> List[Dict]:
    """Ingest media from a URL and persist SourceClip entries.

    The input URL is resolved to one or more media URLs. Each one is
    downloaded, probed for metadata, thumbnailed, uploaded through
    ``storage`` and recorded as a ``SourceClip`` row.

    Args:
        project_id: Id of the project the clips are attached to.
        input_url: URL supplied by the user; stored as ``original_url``.
        origin: Value stored in the clip's ``origin`` field.

    Returns:
        One dict per created clip with its id, paths and metadata.

    Raises:
        ValueError: If no project with ``project_id`` exists.
    """
    summary_fields = (
        "id",
        "project_id",
        "storage_path",
        "thumbnail_path",
        "duration_seconds",
        "width",
        "height",
        "fps",
        "origin",
        "original_url",
    )
    resolved_urls = resolve_media_urls_from_input(input_url)
    summaries: List[Dict] = []

    with db_session() as session:
        # The project row is only needed as an existence check.
        if session.query(Project).filter_by(id=project_id).one_or_none() is None:
            raise ValueError(f"Project {project_id} does not exist")

        for media_url in resolved_urls:
            downloaded = download_media_file(media_url)
            try:
                probe = extract_video_metadata(downloaded)
                thumb_data = generate_thumbnail(downloaded)

                new_id = str(uuid.uuid4())
                ext = Path(downloaded).suffix or ".mp4"
                video_key = f"videos/{project_id}/{new_id}{ext}"
                thumb_key = f"thumbnails/{project_id}/{new_id}.jpg"

                with open(downloaded, "rb") as video_file:
                    stored_video = storage.upload_file(video_file, video_key)
                stored_thumb = storage.upload_bytes(
                    thumb_data, thumb_key, content_type="image/jpeg"
                )

                timestamp = datetime.utcnow()
                record = SourceClip(
                    id=new_id,
                    project_id=project_id,
                    origin=origin,
                    original_url=input_url,
                    storage_path=stored_video,
                    thumbnail_path=stored_thumb,
                    duration_seconds=probe.get("duration_seconds"),
                    width=probe.get("width"),
                    height=probe.get("height"),
                    fps=probe.get("fps"),
                    created_at=timestamp,
                    updated_at=timestamp,
                )
                session.add(record)
                session.flush()

                summaries.append(
                    {field: getattr(record, field) for field in summary_fields}
                )
            finally:
                # The local download is always discarded, success or failure.
                if os.path.exists(downloaded):
                    os.remove(downloaded)

        # NOTE(review): a single commit after all clips are processed is
        # assumed here — confirm against the original file's indentation.
        session.commit()

    return summaries