Search Nukepedia's 2300+ free Nuke tools catalog. Includes: - scrape.py: build catalog from nukepedia.com (rate-limited) - search.py: query by name, category, rating, author - Pre-scraped catalog with 2341 tools Categories: gizmos, python, plugins, toolsets, blink, hiero, etc. Support Nukepedia: https://nukepedia.com/donate
354 lines · 12 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Scrape Nukepedia.com to build a searchable catalog of VFX tools.
|
|
|
|
Usage:
|
|
python scrape.py [--full] [--resume] [--output PATH]
|
|
|
|
Options:
|
|
--full Fetch detail pages for each tool (slower, ~2400 requests)
|
|
--resume Resume an interrupted scrape
|
|
--output Output path (default: ../data/nukepedia-catalog.json)
|
|
|
|
Rate limited to 1 req/sec. Please support Nukepedia: https://nukepedia.com/donate
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field, asdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
try:
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
print("Missing dependencies. Install with:")
|
|
print(" pip install requests beautifulsoup4")
|
|
sys.exit(1)
|
|
|
|
# Scraper configuration.
BASE_URL = "https://nukepedia.com"
# Identify the scraper honestly to the server (polite-scraping etiquette).
USER_AGENT = "gizmosearch/0.1 (VFX tool catalog builder; respects robots.txt)"
# Minimum delay between HTTP requests, in seconds (1 req/sec, per the module docstring).
REQUEST_DELAY = 1.0
# Attempts per URL before giving up in _fetch().
MAX_RETRIES = 3
# Checkpoint file used by --resume; removed after a successful complete run.
PROGRESS_FILE = ".gizmosearch_progress.json"
|
|
|
|
# (category, [subcategories]) pairs mirroring nukepedia.com's /tools/ URL tree.
# An empty subcategory list means the category is scraped at its top level.
CATEGORIES = [
    ("gizmos", ["deep", "image", "particles", "draw", "time", "channel",
                "colour", "filter", "keyer", "merge", "transform", "3d",
                "stereo", "metadata", "other"]),
    ("python", ["import-export", "render", "flipbook", "misc", "3d",
                "nodegraph", "ui", "deep"]),
    ("plugins", ["image", "time", "draw", "channel", "colour", "filter",
                 "keyer", "merge", "transform", "3d", "other"]),
    ("toolsets", ["deep", "image", "particles", "draw", "time", "channel",
                  "colour", "filter", "keyer", "merge", "transform", "3d",
                  "stereo", "metadata", "other"]),
    ("blink", ["deep", "image", "particles", "draw", "time", "channel",
               "colour", "filter", "keyer", "merge", "transform", "3d",
               "stereo", "metadata", "other"]),
    ("miscellaneous", []),
    ("hiero", ["python", "softeffects"]),
    ("presets", []),
    ("tcl-scripts", []),
]
|
|
|
|
|
|
@dataclass
class Tool:
    """One Nukepedia tool, as scraped from a listing card (and optionally its detail page)."""

    id: str                             # URL slug: last path segment of the tool page
    name: str                           # display name (data-name attribute on the card)
    category: str                       # top-level category, e.g. "gizmos"
    subcategory: str                    # subcategory, or "general" when the category has none
    author: str = ""                    # data-author attribute, may be empty
    description: str = ""               # card excerpt, or detail-page text with --full
    rating: Optional[float] = None      # average rating; None when absent or unparsable
    rating_count: int = 0               # vote count, when shown on the card
    downloads: int = 0                  # data-downloads attribute
    nuke_versions: str = ""             # compatibility text, filled from the detail page
    platforms: list = field(default_factory=list)  # e.g. ["linux", "mac", "windows"]
    license: Optional[str] = None       # license text from the detail page, if any
    url: str = ""                       # absolute URL of the tool page
    last_updated: Optional[str] = None  # raw data-date string from the card
    scraped_at: str = ""                # ISO-8601 UTC timestamp of when this record was scraped
|
|
|
|
|
|
@dataclass
class Catalog:
    """Top-level document written to the output JSON file."""

    version: str             # catalog schema version
    scraped_at: str          # ISO-8601 UTC timestamp of the scrape run
    tool_count: int          # len(tools), denormalized for quick inspection
    nukepedia_support: dict  # donation/support links (see get_support_info)
    tools: list              # list of asdict(Tool) records
|
|
|
|
|
|
def get_support_info():
    """Return the Nukepedia support/donation links embedded in every catalog."""
    return dict(
        message=(
            "Nukepedia is a free, community-run resource serving VFX artists "
            "since 2008. Please support them!"
        ),
        donate_url="https://nukepedia.com/donate",
        prouser_url="https://nukepedia.com/prouser",
        website="https://nukepedia.com",
        contribute_url="https://nukepedia.com/my-uploads/new/",
    )
|
|
|
|
|
|
class Scraper:
    """Crawls nukepedia.com category listings (and optionally tool detail
    pages) into a list of Tool records, at a polite 1 request/second.

    Supports checkpoint/resume via PROGRESS_FILE so an interrupted scrape
    does not have to refetch completed pages.
    """

    def __init__(self, full_scrape=False, resume=False):
        self.session = requests.Session()
        self.session.headers["User-Agent"] = USER_AGENT
        self.full_scrape = full_scrape
        self.tools = []
        # URLs (category listings and detail pages) already processed;
        # consulted on --resume so they are not refetched.
        self.completed_urls = set()

        if resume:
            self._load_progress()

    def _load_progress(self):
        """Restore tools and completed URLs from a previous interrupted run."""
        try:
            with open(PROGRESS_FILE, encoding="utf-8") as f:
                data = json.load(f)
            self.tools = [Tool(**t) for t in data.get("tools", [])]
            self.completed_urls = set(data.get("completed_urls", []))
            print(f" Resumed: {len(self.tools)} tools, {len(self.completed_urls)} pages")
        except FileNotFoundError:
            # No checkpoint exists: start fresh.
            pass

    def _save_progress(self):
        """Checkpoint the current state so --resume can pick up from here."""
        with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
            json.dump({
                "tools": [asdict(t) for t in self.tools],
                "completed_urls": list(self.completed_urls),
            }, f)

    def _cleanup_progress(self):
        """Delete the checkpoint after a successful complete run."""
        Path(PROGRESS_FILE).unlink(missing_ok=True)

    @staticmethod
    def _safe_int(value, default=0) -> int:
        """Parse an int from a data attribute, tolerating None, '' and '1,234'.

        The listing markup is not under our control; a malformed attribute
        must never crash the scrape.
        """
        try:
            return int(str(value).replace(",", ""))
        except (TypeError, ValueError):
            return default

    def _fetch(self, url: str) -> Optional[str]:
        """GET *url* with up to MAX_RETRIES attempts.

        Returns the response body, or None on a non-retryable HTTP status or
        after retries are exhausted. HTTP 429 backs off exponentially;
        transport errors back off linearly.
        """
        for attempt in range(MAX_RETRIES):
            try:
                resp = self.session.get(url, timeout=30, allow_redirects=True)
            except requests.RequestException as e:
                if attempt < MAX_RETRIES - 1:
                    time.sleep(REQUEST_DELAY * (attempt + 1))
                    continue
                print(f" Error fetching {url}: {e}")
                return None

            if resp.status_code == 200:
                return resp.text
            if resp.status_code == 429:
                # Server asked us to slow down: exponential backoff, then retry.
                wait = (2 ** attempt) * REQUEST_DELAY
                print(f" Rate limited, waiting {wait}s...")
                time.sleep(wait)
            else:
                # Other statuses (404, 500, ...) are not retried.
                print(f" HTTP {resp.status_code}: {url}")
                return None
        return None

    def scrape(self) -> Catalog:
        """Crawl every category (and detail pages with --full); return the Catalog."""
        print("\n GizmoSearch")
        print(" ===========\n")
        print(" Building a catalog of free VFX tools from nukepedia.com")
        print(" Please support Nukepedia: https://nukepedia.com/donate\n")

        for cat_name, subcats in CATEGORIES:
            # Categories without subcategories are scraped at their top level.
            for subcat in (subcats or [""]):
                self._scrape_category(cat_name, subcat)

        if self.full_scrape:
            self._scrape_detail_pages()

        self._cleanup_progress()

        now = datetime.now(timezone.utc).isoformat()
        catalog = Catalog(
            version="1.0",
            scraped_at=now,
            tool_count=len(self.tools),
            nukepedia_support=get_support_info(),
            tools=[asdict(t) for t in self.tools],
        )

        print(f"\n Scraped {len(self.tools)} tools")
        print(" Support Nukepedia: https://nukepedia.com/donate\n")

        return catalog

    def _scrape_category(self, category: str, subcategory: str):
        """Scrape every listing page of one category (or category/subcategory)."""
        if subcategory:
            url = f"{BASE_URL}/tools/{category}/{subcategory}"
            display = f"{category}/{subcategory}"
        else:
            url = f"{BASE_URL}/tools/{category}"
            display = category

        if url in self.completed_urls:
            print(f" {display} (cached)")
            return

        print(f" {display}...", end="", flush=True)

        page = 1
        count = 0
        fetched_any = False  # did at least one page fetch succeed?
        while True:
            page_url = f"{url}?page={page}" if page > 1 else url
            html = self._fetch(page_url)
            if not html:
                break
            fetched_any = True

            soup = BeautifulSoup(html, "html.parser")
            tools = self._parse_tool_cards(soup, category, subcategory or "general")
            if not tools:
                break

            self.tools.extend(tools)
            count += len(tools)

            # Stop when there is no enabled "next" pagination link.
            next_link = soup.select_one(".pagination a.next, a[rel='next']")
            if not next_link or "disabled" in next_link.get("class", []):
                break

            page += 1
            time.sleep(REQUEST_DELAY)

        # Only checkpoint the category as completed if we actually reached it.
        # (Previously a failed first fetch was still marked completed, so a
        # --resume run would silently skip — i.e. lose — that category.)
        if fetched_any:
            self.completed_urls.add(url)
            self._save_progress()
        print(f" {count} tools")
        time.sleep(REQUEST_DELAY)

    def _parse_tool_cards(self, soup: BeautifulSoup, category: str, subcategory: str) -> list:
        """Parse tool cards from a listing page.

        Nukepedia renders each tool as an <a class="tool-card"> carrying
        data-name / data-author / data-downloads / data-date / data-rating
        attributes; description and vote count live in the card body.
        """
        tools = []
        # One timestamp per page is precise enough for a per-record scrape time.
        scraped_at = datetime.now(timezone.utc).isoformat()

        for card in soup.select("a.tool-card[data-name]"):
            href = card.get("href", "")
            # Only keep links into the tool tree; skip nav/ad cards.
            if not href.startswith("/tools/"):
                continue

            tool_id = href.rstrip("/").split("/")[-1]

            rating = None
            rating_str = card.get("data-rating", "")
            if rating_str:
                try:
                    rating = float(rating_str)
                except ValueError:
                    pass  # malformed attribute: leave the tool unrated

            # Description from the card body, if present.
            desc_el = card.select_one(".description, .excerpt, p")
            description = desc_el.get_text(strip=True) if desc_el else ""

            # Vote count, when shown (e.g. "12 votes").
            rating_count = 0
            rating_el = card.select_one(".rating-count, .votes")
            if rating_el:
                match = re.search(r"(\d+)", rating_el.get_text())
                if match:
                    rating_count = int(match.group(1))

            tools.append(Tool(
                id=tool_id,
                name=card.get("data-name", tool_id),
                category=category,
                subcategory=subcategory,
                author=card.get("data-author", ""),
                description=description,
                rating=rating,
                rating_count=rating_count,
                # data-downloads may be "", "1234" or "1,234"; int() on the raw
                # value used to raise ValueError and abort the whole scrape.
                downloads=self._safe_int(card.get("data-downloads", 0)),
                url=f"{BASE_URL}{href}",
                # Listing pages carry no platform info; assume all platforms.
                platforms=["linux", "mac", "windows"],
                last_updated=card.get("data-date", "") or None,
                scraped_at=scraped_at,
            ))

        return tools

    def _scrape_detail_pages(self):
        """Fetch each tool's detail page to fill in missing descriptions (--full)."""
        pending = [(i, t) for i, t in enumerate(self.tools) if not t.description]
        if not pending:
            return

        print(f"\n Fetching {len(pending)} detail pages...")

        for idx, (i, tool) in enumerate(pending):
            if tool.url in self.completed_urls:
                continue  # already fetched by a previous (resumed) run

            html = self._fetch(tool.url)
            if html:
                self._parse_detail_page(html, self.tools[i])
                self.completed_urls.add(tool.url)

            # Periodic checkpoint: --resume loses at most 50 pages of work.
            if (idx + 1) % 50 == 0:
                self._save_progress()
                print(f" {idx + 1}/{len(pending)}")

            time.sleep(REQUEST_DELAY)

    def _parse_detail_page(self, html: str, tool: Tool):
        """Extract description, Nuke versions and license from a detail page into *tool*."""
        soup = BeautifulSoup(html, "html.parser")

        desc_el = soup.select_one(".tool-description, .description, #description, .content p")
        if desc_el:
            # Collapse all whitespace and cap at 500 chars to keep the catalog small.
            tool.description = " ".join(desc_el.get_text().split())[:500]

        ver_el = soup.select_one(".nuke-version, .compatibility, [class*='version']")
        if ver_el:
            tool.nuke_versions = ver_el.get_text(strip=True)

        lic_el = soup.select_one(".license, [class*='license']")
        if lic_el:
            tool.license = lic_el.get_text(strip=True)

        tool.scraped_at = datetime.now(timezone.utc).isoformat()
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the scraper, write the catalog JSON."""
    parser = argparse.ArgumentParser(description="Scrape Nukepedia for VFX tools")
    parser.add_argument("--full", action="store_true", help="Fetch detail pages")
    parser.add_argument("--resume", action="store_true", help="Resume interrupted scrape")
    parser.add_argument("--output", default=None, help="Output path")
    args = parser.parse_args()

    # Default output lives next to the repo: ../data/nukepedia-catalog.json
    output = Path(args.output) if args.output else Path(__file__).parent.parent / "data" / "nukepedia-catalog.json"
    output.parent.mkdir(parents=True, exist_ok=True)

    scraper = Scraper(full_scrape=args.full, resume=args.resume)
    catalog = scraper.scrape()

    # Explicit utf-8 + ensure_ascii=False: tool/author names can contain
    # non-ASCII, which would crash on Windows' default cp1252 encoding and
    # otherwise be written as unreadable \uXXXX escapes.
    with open(output, "w", encoding="utf-8") as f:
        json.dump(asdict(catalog), f, indent=2, ensure_ascii=False)

    print(f" Saved to {output}")


if __name__ == "__main__":
    main()
|