#!/bin/bash
#
# Quick validation script for the Reonomy scraper update
#
# Checks that reonomy-scraper.js exists, parses under Node.js, and contains
# the expected extraction functions and rate-limiting configuration.

# Fail fast: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

# Resolve the directory this script lives in so it can be run from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
readonly SCRAPER_FILE="$SCRIPT_DIR/reonomy-scraper.js"

# ANSI color codes for log output
readonly GREEN='\033[0;32m'
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'  # reset / no color
# Logging helpers: colorized, emoji-tagged one-liners.
# Info/success go to stdout; warnings and errors go to stderr so that
# piping this script's stdout does not hide diagnostics.
log_info() { echo -e "${BLUE}ℹ️ $1${NC}"; }
log_success() { echo -e "${GREEN}✅ $1${NC}"; }
log_error() { echo -e "${RED}❌ $1${NC}" >&2; }
log_warning() { echo -e "${YELLOW}⚠️ $1${NC}" >&2; }
# Banner
echo "🔍 Reonomy Scraper Validation"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo

# Nothing else can be validated without the scraper file itself.
[ -f "$SCRAPER_FILE" ] || {
  log_error "Scraper file not found: $SCRAPER_FILE"
  exit 1
}
log_success "Scraper file found"
# Validate JavaScript syntax via Node's parser (parse only, no execution).
log_info "Checking Node.js syntax..."
if ! node --check "$SCRAPER_FILE" 2>/dev/null; then
  log_error "Syntax errors found"
  # Rerun without suppressing stderr so the actual errors are shown.
  node --check "$SCRAPER_FILE"
  exit 1
fi
log_success "Syntax is valid"
# Check for new functions
#
# require_function NAME — succeed if NAME appears anywhere in the scraper
# source; otherwise log a fatal error and exit. (A plain substring grep:
# it does not verify NAME is actually defined as a function.)
require_function() {
  local fn=$1
  if grep -q "$fn" "$SCRAPER_FILE"; then
    log_success "$fn function found"
  else
    log_error "$fn function missing"
    exit 1
  fi
}

log_info "Checking for new extraction functions..."

# Every extraction entry point is mandatory; any miss aborts validation.
for fn in extractPropertyContactInfo extractOwnerContactInfo extractLinksFromPage; do
  require_function "$fn"
done
# Check for rate limiting configuration
#
# Rate-limiting settings are recommended but not fatal if absent, so a
# miss only warns. Each label's first word is the constant to grep for;
# the full label is used verbatim in the log messages.
log_info "Checking rate limiting configuration..."

for label in "MAX_PROPERTIES limit" "MAX_OWNERS limit" "PAGE_DELAY_MS"; do
  key=${label%% *}  # constant name = first word (whole label if no space)
  if grep -q "$key" "$SCRAPER_FILE"; then
    log_success "$label configured"
  else
    log_warning "$label not found"
  fi
done
# Check for email/phone extraction patterns
#
# Each entry is a grep pattern for selector/regex text expected to appear
# literally in the scraper source.
# NOTE(review): the last email/phone entries mix escaped and unescaped
# brackets and use \d/\s (not supported by grep BRE) — they look garbled;
# verify them against the actual literals in reonomy-scraper.js.
log_info "Checking contact extraction patterns..."

email_patterns=(
  'a\[href\^="mailto:"\]'
  '\.email'
  '\[a-zA-Z0-9._%+-]+@\[a-zA-Z0-9.-]+\.\[a-zA-Z\]{2,\}'
)

phone_patterns=(
  'a\[href\^="tel:"\]'
  '\.phone'
  '\(\?\d{3}\)\)?\[-.\s\]?\(\d{3}\)\[-.\s\]?\(\d{4}\)'
)

# Report the first matching pattern of each kind. Previously a total miss
# produced no output at all, silently defeating the check — now it warns.
email_found=false
for pattern in "${email_patterns[@]}"; do
  if grep -q -- "$pattern" "$SCRAPER_FILE"; then
    log_success "Email extraction pattern found: $pattern"
    email_found=true
    break
  fi
done
[ "$email_found" = true ] || log_warning "No email extraction pattern found"

phone_found=false
for pattern in "${phone_patterns[@]}"; do
  if grep -q -- "$pattern" "$SCRAPER_FILE"; then
    log_success "Phone extraction pattern found: $pattern"
    phone_found=true
    break
  fi
done
[ "$phone_found" = true ] || log_warning "No phone extraction pattern found"
# Check main scraper loop
#
# scan_stage MARKER LABEL — look for a marker comment in the scraper
# source; a miss is non-fatal because the comment wording may differ.
scan_stage() {
  local marker=$1 label=$2
  if grep -q "$marker" "$SCRAPER_FILE"; then
    log_success "$label page scraping logic found"
  else
    log_warning "$label page scraping comment not found (may be present with different wording)"
  fi
}

log_info "Checking main scraper loop..."

scan_stage "visit each property page" "Property"
scan_stage "visit each owner page" "Owner"
# Show configuration values
# Print the scraper's limit and delay declarations, indented one space.
log_info "Current configuration:"
echo
for cfg_re in '^(const|let).*=.*//.*limit' '^(const|let).*=.*PAGE_DELAY_MS'; do
  # || true: a regex with no matches is fine — we only display what exists.
  grep -E "$cfg_re" "$SCRAPER_FILE" | sed 's/^/ /' || true
done
echo
# Check dependencies
# Node.js is required; package.json and puppeteer are only recommended.
log_info "Checking dependencies..."

if ! command -v node > /dev/null 2>&1; then
  log_error "Node.js not found"
  exit 1
fi
NODE_VERSION=$(node --version)
log_success "Node.js installed: $NODE_VERSION"

if [ -f "$SCRIPT_DIR/package.json" ]; then
  log_success "package.json found"
else
  log_warning "package.json not found (npm install may be needed)"
fi

if [ -d "$SCRIPT_DIR/node_modules/puppeteer" ]; then
  log_success "puppeteer installed"
else
  log_warning "puppeteer not found - run: npm install puppeteer"
fi
# Closing summary and usage hints.
echo
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
log_success "All validation checks passed!"
echo

log_info "To run the scraper:"
cat <<EOF
 cd $SCRIPT_DIR
 ./scrape-reonomy.sh --location 'New York, NY'

EOF

log_info "Or with credentials:"
# Quoted delimiter: credential placeholders are printed literally.
cat <<'EOF'
 export REONOMY_EMAIL='your@email.com'
 export REONOMY_PASSWORD='yourpassword'
 node reonomy-scraper.js

EOF