Nicholai c1ea14975e feat(skills): add nukepedia-tools skill
Search Nukepedia's catalog of 2300+ free Nuke tools. Includes:
- scrape.py: build catalog from nukepedia.com (rate-limited)
- search.py: query by name, category, rating, author
- Pre-scraped catalog with 2341 tools

Categories: gizmos, python, plugins, toolsets, blink, hiero, etc.

Support Nukepedia: https://nukepedia.com/donate
2026-01-24 23:05:06 -07:00


#!/usr/bin/env python3
"""
Scrape Nukepedia.com to build a searchable catalog of VFX tools.

Usage:
    python scrape.py [--full] [--resume] [--output PATH]

Options:
    --full      Fetch detail pages for each tool (slower, ~2400 requests)
    --resume    Resume an interrupted scrape
    --output    Output path (default: ../data/nukepedia-catalog.json)

Rate limited to 1 req/sec. Please support Nukepedia: https://nukepedia.com/donate
"""
import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    print("Missing dependencies. Install with:")
    print(" pip install requests beautifulsoup4")
    sys.exit(1)
BASE_URL = "https://nukepedia.com"
USER_AGENT = "gizmosearch/0.1 (VFX tool catalog builder; respects robots.txt)"
REQUEST_DELAY = 1.0
MAX_RETRIES = 3
PROGRESS_FILE = ".gizmosearch_progress.json"
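# With the defaults above, an HTTP 429 backs off (2 ** attempt) * REQUEST_DELAY
# seconds: 1s, then 2s, then 4s before giving up (see Scraper._fetch).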

CATEGORIES = [
    ("gizmos", ["deep", "image", "particles", "draw", "time", "channel",
                "colour", "filter", "keyer", "merge", "transform", "3d",
                "stereo", "metadata", "other"]),
    ("python", ["import-export", "render", "flipbook", "misc", "3d",
                "nodegraph", "ui", "deep"]),
    ("plugins", ["image", "time", "draw", "channel", "colour", "filter",
                 "keyer", "merge", "transform", "3d", "other"]),
    ("toolsets", ["deep", "image", "particles", "draw", "time", "channel",
                  "colour", "filter", "keyer", "merge", "transform", "3d",
                  "stereo", "metadata", "other"]),
    ("blink", ["deep", "image", "particles", "draw", "time", "channel",
               "colour", "filter", "keyer", "merge", "transform", "3d",
               "stereo", "metadata", "other"]),
    ("miscellaneous", []),
    ("hiero", ["python", "softeffects"]),
    ("presets", []),
    ("tcl-scripts", []),
]
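# Each (category, subcategories) pair maps to listing URLs of the form
#   {BASE_URL}/tools/<category>/<subcategory>, e.g. https://nukepedia.com/tools/gizmos/deep
# Categories with an empty subcategory list are scraped from /tools/<category>
# directly; pagination appends ?page=N (see Scraper._scrape_category).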

@dataclass
class Tool:
    id: str
    name: str
    category: str
    subcategory: str
    author: str = ""
    description: str = ""
    rating: Optional[float] = None
    rating_count: int = 0
    downloads: int = 0
    nuke_versions: str = ""
    platforms: list = field(default_factory=list)
    license: Optional[str] = None
    url: str = ""
    last_updated: Optional[str] = None
    scraped_at: str = ""

@dataclass
class Catalog:
    version: str
    scraped_at: str
    tool_count: int
    nukepedia_support: dict
    tools: list


def get_support_info():
    return {
        "message": "Nukepedia is a free, community-run resource serving VFX artists since 2008. Please support them!",
        "donate_url": "https://nukepedia.com/donate",
        "prouser_url": "https://nukepedia.com/prouser",
        "website": "https://nukepedia.com",
        "contribute_url": "https://nukepedia.com/my-uploads/new/"
    }

class Scraper:
    def __init__(self, full_scrape=False, resume=False):
        self.session = requests.Session()
        self.session.headers["User-Agent"] = USER_AGENT
        self.full_scrape = full_scrape
        self.tools = []
        self.completed_urls = set()
        if resume:
            self._load_progress()

    def _load_progress(self):
        try:
            with open(PROGRESS_FILE) as f:
                data = json.load(f)
            self.tools = [Tool(**t) for t in data.get("tools", [])]
            self.completed_urls = set(data.get("completed_urls", []))
            print(f" Resumed: {len(self.tools)} tools, {len(self.completed_urls)} pages")
        except FileNotFoundError:
            pass

    def _save_progress(self):
        with open(PROGRESS_FILE, "w") as f:
            json.dump({
                "tools": [asdict(t) for t in self.tools],
                "completed_urls": list(self.completed_urls)
            }, f)

    def _cleanup_progress(self):
        Path(PROGRESS_FILE).unlink(missing_ok=True)
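
    # Progress file shape (written by _save_progress above):
    #   {"tools": [<asdict(Tool)>, ...], "completed_urls": ["<listing/detail URL>", ...]}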

    def _fetch(self, url: str) -> Optional[str]:
        """GET a URL, retrying with exponential backoff when rate limited."""
        for attempt in range(MAX_RETRIES):
            try:
                resp = self.session.get(url, timeout=30, allow_redirects=True)
                if resp.status_code == 200:
                    return resp.text
                elif resp.status_code == 429:
                    # back off exponentially: 1s, 2s, 4s with the module defaults
                    wait = (2 ** attempt) * REQUEST_DELAY
                    print(f" Rate limited, waiting {wait}s...")
                    time.sleep(wait)
                else:
                    print(f" HTTP {resp.status_code}: {url}")
                    return None
            except requests.RequestException as e:
                if attempt < MAX_RETRIES - 1:
                    time.sleep(REQUEST_DELAY * (attempt + 1))
                else:
                    print(f" Error fetching {url}: {e}")
                    return None
        return None

    def scrape(self) -> Catalog:
        print("\n GizmoSearch")
        print(" ===========\n")
        print(" Building a catalog of free VFX tools from nukepedia.com")
        print(" Please support Nukepedia: https://nukepedia.com/donate\n")
        for cat_name, subcats in CATEGORIES:
            if subcats:
                for subcat in subcats:
                    self._scrape_category(cat_name, subcat)
            else:
                self._scrape_category(cat_name, "")
        if self.full_scrape:
            self._scrape_detail_pages()
        self._cleanup_progress()
        now = datetime.now(timezone.utc).isoformat()
        catalog = Catalog(
            version="1.0",
            scraped_at=now,
            tool_count=len(self.tools),
            nukepedia_support=get_support_info(),
            tools=[asdict(t) for t in self.tools]
        )
        print(f"\n Scraped {len(self.tools)} tools")
        print(" Support Nukepedia: https://nukepedia.com/donate\n")
        return catalog
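
    # Rough volume: the category tree above yields ~70 listing requests (more with
    # pagination); a --full run adds one detail request per tool, i.e. ~2400
    # requests at 1 req/sec, on the order of 40 minutes end to end.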

    def _scrape_category(self, category: str, subcategory: str):
        if subcategory:
            url = f"{BASE_URL}/tools/{category}/{subcategory}"
            display = f"{category}/{subcategory}"
        else:
            url = f"{BASE_URL}/tools/{category}"
            display = category
        if url in self.completed_urls:
            print(f" {display} (cached)")
            return
        print(f" {display}...", end="", flush=True)
        page = 1
        count = 0
        while True:
            page_url = f"{url}?page={page}" if page > 1 else url
            html = self._fetch(page_url)
            if not html:
                break
            soup = BeautifulSoup(html, "html.parser")
            tools = self._parse_tool_cards(soup, category, subcategory or "general")
            if not tools:
                break
            self.tools.extend(tools)
            count += len(tools)
            # check for next page
            next_link = soup.select_one(".pagination a.next, a[rel='next']")
            if not next_link or "disabled" in next_link.get("class", []):
                break
            page += 1
            time.sleep(REQUEST_DELAY)
        self.completed_urls.add(url)
        self._save_progress()
        print(f" {count} tools")
        time.sleep(REQUEST_DELAY)

    def _parse_tool_cards(self, soup: BeautifulSoup, category: str, subcategory: str) -> list:
        """Parse tool cards from a listing page. Nukepedia uses <a class="tool-card"> with data attributes."""
        tools = []
        # find all tool cards - they're <a> tags with class "tool-card" and data attributes
        cards = soup.select("a.tool-card[data-name]")
        for card in cards:
            href = card.get("href", "")
            if not href or not href.startswith("/tools/"):
                continue
            url = f"{BASE_URL}{href}"
            tool_id = href.rstrip("/").split("/")[-1]
            name = card.get("data-name", tool_id)
            author = card.get("data-author", "")
            downloads = int(card.get("data-downloads", 0) or 0)
            date_str = card.get("data-date", "")
            rating_str = card.get("data-rating", "")
            rating = None
            if rating_str:
                try:
                    rating = float(rating_str)
                except ValueError:
                    pass
            last_updated = date_str or None
            # get description from the card content
            desc_el = card.select_one(".description, .excerpt, p")
            description = desc_el.get_text(strip=True) if desc_el else ""
            # get rating count if available
            rating_count = 0
            rating_el = card.select_one(".rating-count, .votes")
            if rating_el:
                match = re.search(r"(\d+)", rating_el.get_text())
                if match:
                    rating_count = int(match.group(1))
            tools.append(Tool(
                id=tool_id,
                name=name,
                category=category,
                subcategory=subcategory,
                author=author,
                description=description,
                rating=rating,
                rating_count=rating_count,
                downloads=downloads,
                url=url,
                platforms=["linux", "mac", "windows"],
                last_updated=last_updated,
                scraped_at=datetime.now(timezone.utc).isoformat()
            ))
        return tools
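
    # Illustrative card markup this parser assumes (inferred from the selectors
    # above, not a verified copy of Nukepedia's HTML):
    #   <a class="tool-card" href="/tools/gizmos/keyer/sometool" data-name="SomeTool"
    #      data-author="jane.doe" data-downloads="1234" data-rating="4.5"
    #      data-date="2024-06-01">
    #     <p class="description">Short blurb...</p>
    #   </a>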

    def _scrape_detail_pages(self):
        tools_needing_details = [(i, t) for i, t in enumerate(self.tools) if not t.description]
        if not tools_needing_details:
            return
        print(f"\n Fetching {len(tools_needing_details)} detail pages...")
        for idx, (i, tool) in enumerate(tools_needing_details):
            if tool.url in self.completed_urls:
                continue
            html = self._fetch(tool.url)
            if html:
                self._parse_detail_page(html, self.tools[i])
            self.completed_urls.add(tool.url)
            if (idx + 1) % 50 == 0:
                self._save_progress()
                print(f" {idx + 1}/{len(tools_needing_details)}")
            time.sleep(REQUEST_DELAY)

    def _parse_detail_page(self, html: str, tool: Tool):
        soup = BeautifulSoup(html, "html.parser")
        # description
        desc_el = soup.select_one(".tool-description, .description, #description, .content p")
        if desc_el:
            tool.description = " ".join(desc_el.get_text().split())[:500]
        # nuke versions
        ver_el = soup.select_one(".nuke-version, .compatibility, [class*='version']")
        if ver_el:
            tool.nuke_versions = ver_el.get_text(strip=True)
        # license
        lic_el = soup.select_one(".license, [class*='license']")
        if lic_el:
            tool.license = lic_el.get_text(strip=True)
        tool.scraped_at = datetime.now(timezone.utc).isoformat()

def main():
    parser = argparse.ArgumentParser(description="Scrape Nukepedia for VFX tools")
    parser.add_argument("--full", action="store_true", help="Fetch detail pages")
    parser.add_argument("--resume", action="store_true", help="Resume interrupted scrape")
    parser.add_argument("--output", default=None, help="Output path")
    args = parser.parse_args()
    output = Path(args.output) if args.output else Path(__file__).parent.parent / "data" / "nukepedia-catalog.json"
    output.parent.mkdir(parents=True, exist_ok=True)
    scraper = Scraper(full_scrape=args.full, resume=args.resume)
    catalog = scraper.scrape()
    with open(output, "w") as f:
        json.dump(asdict(catalog), f, indent=2)
    print(f" Saved to {output}")


if __name__ == "__main__":
    main()
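
# Consuming the output (an illustrative sketch; field names follow the Tool and
# Catalog dataclasses above):
#
#   import json
#   with open("../data/nukepedia-catalog.json") as f:
#       catalog = json.load(f)
#   keyers = [t for t in catalog["tools"]
#             if t["subcategory"] == "keyer" and (t["rating"] or 0) >= 4.0]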