#!/usr/bin/env python3
"""
Scrape Nukepedia.com to build a searchable catalog of VFX tools.

Usage:
    python scrape.py [--full] [--resume] [--output PATH]

Options:
    --full     Fetch detail pages for each tool (slower, ~2400 requests)
    --resume   Resume an interrupted scrape
    --output   Output path (default: ../data/nukepedia-catalog.json)

Rate limited to 1 req/sec. Please support Nukepedia:
https://nukepedia.com/donate
"""

import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    print("Missing dependencies. Install with:")
    print("    pip install requests beautifulsoup4")
    sys.exit(1)

BASE_URL = "https://nukepedia.com"
USER_AGENT = "gizmosearch/0.1 (VFX tool catalog builder; respects robots.txt)"
REQUEST_DELAY = 1.0
MAX_RETRIES = 3
PROGRESS_FILE = ".gizmosearch_progress.json"

CATEGORIES = [
    ("gizmos", ["deep", "image", "particles", "draw", "time", "channel",
                "colour", "filter", "keyer", "merge", "transform", "3d",
                "stereo", "metadata", "other"]),
    ("python", ["import-export", "render", "flipbook", "misc", "3d",
                "nodegraph", "ui", "deep"]),
    ("plugins", ["image", "time", "draw", "channel", "colour", "filter",
                 "keyer", "merge", "transform", "3d", "other"]),
    ("toolsets", ["deep", "image", "particles", "draw", "time", "channel",
                  "colour", "filter", "keyer", "merge", "transform", "3d",
                  "stereo", "metadata", "other"]),
    ("blink", ["deep", "image", "particles", "draw", "time", "channel",
               "colour", "filter", "keyer", "merge", "transform", "3d",
               "stereo", "metadata", "other"]),
    ("miscellaneous", []),
    ("hiero", ["python", "softeffects"]),
    ("presets", []),
    ("tcl-scripts", []),
]


@dataclass
class Tool:
    id: str
    name: str
    category: str
    subcategory: str
    author: str = ""
    description: str = ""
    rating: Optional[float] = None
    rating_count: int = 0
    downloads: int = 0
    nuke_versions: str = ""
    platforms: list = field(default_factory=list)
    license: Optional[str] = None
    url: str = ""
    last_updated: Optional[str] = None
    scraped_at: str = ""


@dataclass
class Catalog:
    version: str
    scraped_at: str
    tool_count: int
    nukepedia_support: dict
    tools: list


def get_support_info():
    return {
        "message": ("Nukepedia is a free, community-run resource serving "
                    "VFX artists since 2008. Please support them!"),
        "donate_url": "https://nukepedia.com/donate",
        "prouser_url": "https://nukepedia.com/prouser",
        "website": "https://nukepedia.com",
        "contribute_url": "https://nukepedia.com/my-uploads/new/",
    }


class Scraper:
    def __init__(self, full_scrape=False, resume=False):
        self.session = requests.Session()
        self.session.headers["User-Agent"] = USER_AGENT
        self.full_scrape = full_scrape
        self.tools = []
        self.completed_urls = set()
        if resume:
            self._load_progress()

    def _load_progress(self):
        try:
            with open(PROGRESS_FILE) as f:
                data = json.load(f)
            self.tools = [Tool(**t) for t in data.get("tools", [])]
            self.completed_urls = set(data.get("completed_urls", []))
            print(f" Resumed: {len(self.tools)} tools, "
                  f"{len(self.completed_urls)} pages")
        except FileNotFoundError:
            pass

    def _save_progress(self):
        with open(PROGRESS_FILE, "w") as f:
            json.dump({
                "tools": [asdict(t) for t in self.tools],
                "completed_urls": list(self.completed_urls),
            }, f)

    def _cleanup_progress(self):
        Path(PROGRESS_FILE).unlink(missing_ok=True)

    def _fetch(self, url: str) -> Optional[str]:
        for attempt in range(MAX_RETRIES):
            try:
                resp = self.session.get(url, timeout=30, allow_redirects=True)
                if resp.status_code == 200:
                    return resp.text
                elif resp.status_code == 429:
                    # Back off exponentially when rate limited.
                    wait = (2 ** attempt) * REQUEST_DELAY
                    print(f" Rate limited, waiting {wait}s...")
                    time.sleep(wait)
                else:
                    print(f" HTTP {resp.status_code}: {url}")
                    return None
            except requests.RequestException as e:
                if attempt < MAX_RETRIES - 1:
                    time.sleep(REQUEST_DELAY * (attempt + 1))
                else:
                    print(f" Error fetching {url}: {e}")
                    return None
        return None

    def scrape(self) -> Catalog:
        print("\n GizmoSearch")
        print(" ===========\n")
        print(" Building a catalog of free VFX tools from nukepedia.com")
        print(" Please support Nukepedia: https://nukepedia.com/donate\n")

        for cat_name, subcats in CATEGORIES:
            if subcats:
                for subcat in subcats:
                    self._scrape_category(cat_name, subcat)
            else:
                self._scrape_category(cat_name, "")

        if self.full_scrape:
            self._scrape_detail_pages()

        self._cleanup_progress()

        now = datetime.now(timezone.utc).isoformat()
        catalog = Catalog(
            version="1.0",
            scraped_at=now,
            tool_count=len(self.tools),
            nukepedia_support=get_support_info(),
            tools=[asdict(t) for t in self.tools],
        )
        print(f"\n Scraped {len(self.tools)} tools")
        print(" Support Nukepedia: https://nukepedia.com/donate\n")
        return catalog

    def _scrape_category(self, category: str, subcategory: str):
        if subcategory:
            url = f"{BASE_URL}/tools/{category}/{subcategory}"
            display = f"{category}/{subcategory}"
        else:
            url = f"{BASE_URL}/tools/{category}"
            display = category

        if url in self.completed_urls:
            print(f" {display} (cached)")
            return

        print(f" {display}...", end="", flush=True)
        page = 1
        count = 0
        while True:
            page_url = f"{url}?page={page}" if page > 1 else url
            html = self._fetch(page_url)
            if not html:
                break
            soup = BeautifulSoup(html, "html.parser")
            tools = self._parse_tool_cards(soup, category, subcategory or "general")
            if not tools:
                break
            self.tools.extend(tools)
            count += len(tools)

            # Check for a next page; stop if the link is missing or disabled.
            next_link = soup.select_one(".pagination a.next, a[rel='next']")
            if not next_link or "disabled" in next_link.get("class", []):
                break
            page += 1
            time.sleep(REQUEST_DELAY)

        self.completed_urls.add(url)
        self._save_progress()
        print(f" {count} tools")
        time.sleep(REQUEST_DELAY)

    def _parse_tool_cards(self, soup: BeautifulSoup, category: str,
                          subcategory: str) -> list:
        """Parse tool cards from a listing page.

        Nukepedia renders each tool as an <a> tag with data attributes.
        """
        tools = []
        # Find all tool cards: <a> elements with class "tool-card" and
        # a data-name attribute.
        cards = soup.select("a.tool-card[data-name]")
        for card in cards:
            href = card.get("href", "")
            if not href or not href.startswith("/tools/"):
                continue
            url = f"{BASE_URL}{href}"
            tool_id = href.rstrip("/").split("/")[-1]
            name = card.get("data-name", tool_id)
            author = card.get("data-author", "")
            downloads = int(card.get("data-downloads", 0) or 0)
            date_str = card.get("data-date", "")
            rating_str = card.get("data-rating", "")

            rating = None
            if rating_str:
                try:
                    rating = float(rating_str)
                except ValueError:
                    pass

            last_updated = date_str or None

            # Get the description from the card content.
            desc_el = card.select_one(".description, .excerpt, p")
            description = desc_el.get_text(strip=True) if desc_el else ""

            # Get the rating count if available.
            rating_count = 0
            rating_el = card.select_one(".rating-count, .votes")
            if rating_el:
                match = re.search(r"(\d+)", rating_el.get_text())
                if match:
                    rating_count = int(match.group(1))

            tools.append(Tool(
                id=tool_id,
                name=name,
                category=category,
                subcategory=subcategory,
                author=author,
                description=description,
                rating=rating,
                rating_count=rating_count,
                downloads=downloads,
                url=url,
                platforms=["linux", "mac", "windows"],
                last_updated=last_updated,
                scraped_at=datetime.now(timezone.utc).isoformat(),
            ))
        return tools

    def _scrape_detail_pages(self):
        tools_needing_details = [(i, t) for i, t in enumerate(self.tools)
                                 if not t.description]
        if not tools_needing_details:
            return
        print(f"\n Fetching {len(tools_needing_details)} detail pages...")
        for idx, (i, tool) in enumerate(tools_needing_details):
            if tool.url in self.completed_urls:
                continue
            html = self._fetch(tool.url)
            if html:
                self._parse_detail_page(html, self.tools[i])
            self.completed_urls.add(tool.url)
            if (idx + 1) % 50 == 0:
                self._save_progress()
                print(f" {idx + 1}/{len(tools_needing_details)}")
            time.sleep(REQUEST_DELAY)

    def _parse_detail_page(self, html: str, tool: Tool):
        soup = BeautifulSoup(html, "html.parser")

        # Description
        desc_el = soup.select_one(
            ".tool-description, .description, #description, .content p")
        if desc_el:
            tool.description = " ".join(desc_el.get_text().split())[:500]

        # Nuke versions
        ver_el = soup.select_one(
            ".nuke-version, .compatibility, [class*='version']")
        if ver_el:
            tool.nuke_versions = ver_el.get_text(strip=True)

        # License
        lic_el = soup.select_one(".license, [class*='license']")
        if lic_el:
            tool.license = lic_el.get_text(strip=True)

        tool.scraped_at = datetime.now(timezone.utc).isoformat()


def main():
    parser = argparse.ArgumentParser(description="Scrape Nukepedia for VFX tools")
    parser.add_argument("--full", action="store_true", help="Fetch detail pages")
    parser.add_argument("--resume", action="store_true",
                        help="Resume interrupted scrape")
    parser.add_argument("--output", default=None, help="Output path")
    args = parser.parse_args()

    output = (Path(args.output) if args.output
              else Path(__file__).parent.parent / "data" / "nukepedia-catalog.json")
    output.parent.mkdir(parents=True, exist_ok=True)

    scraper = Scraper(full_scrape=args.full, resume=args.resume)
    catalog = scraper.scrape()

    with open(output, "w") as f:
        json.dump(asdict(catalog), f, indent=2)
    print(f" Saved to {output}")


if __name__ == "__main__":
    main()
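
# ---------------------------------------------------------------------------
# Illustrative reference (assumptions, not data fetched from nukepedia.com).
#
# _parse_tool_cards() assumes each listing entry looks roughly like the
# hypothetical card below; if the live markup differs, the CSS selectors in
# that method are the place to adjust:
#
#   <a class="tool-card" href="/tools/gizmos/filter/exampleblur"
#      data-name="ExampleBlur" data-author="Jane Artist"
#      data-downloads="1234" data-date="2024-01-15" data-rating="4.5">
#     <p class="description">A hypothetical blur gizmo.</p>
#     <span class="rating-count">12 votes</span>
#   </a>
#
# A successful run writes JSON shaped by the Catalog/Tool dataclasses above;
# a minimal sketch with hypothetical values:
#
#   {
#     "version": "1.0",
#     "scraped_at": "2025-01-01T00:00:00+00:00",
#     "tool_count": 1,
#     "nukepedia_support": {"donate_url": "https://nukepedia.com/donate", ...},
#     "tools": [
#       {"id": "exampleblur", "name": "ExampleBlur", "category": "gizmos",
#        "subcategory": "filter", "author": "Jane Artist", "rating": 4.5,
#        "rating_count": 12, "downloads": 1234, ...}
#     ]
#   }
# ---------------------------------------------------------------------------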