#!/bin/bash
#
# Quick validation script for the Reonomy scraper update
#
# Checks that reonomy-scraper.js exists, parses under Node.js, and contains
# the expected extraction functions and rate-limiting configuration.

# Fail fast: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

# Resolve the directory this script lives in so it can be run from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
readonly SCRAPER_FILE="$SCRIPT_DIR/reonomy-scraper.js"

# ANSI color codes for log output
readonly GREEN='\033[0;32m'
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'  # reset / no color
# Logging helpers: colorized, emoji-tagged one-liners.
# Info/success go to stdout; warnings and errors go to stderr so that
# piping this script's stdout does not hide diagnostics.
log_info() { echo -e "${BLUE}ℹ️ $1${NC}"; }
log_success() { echo -e "${GREEN}✅ $1${NC}"; }
log_error() { echo -e "${RED}❌ $1${NC}" >&2; }
log_warning() { echo -e "${YELLOW}⚠️ $1${NC}" >&2; }
# Banner
echo "🔍 Reonomy Scraper Validation"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo

# Nothing else can be validated without the scraper file itself.
[ -f "$SCRAPER_FILE" ] || {
  log_error "Scraper file not found: $SCRAPER_FILE"
  exit 1
}
log_success "Scraper file found"
# Validate JavaScript syntax via Node's parser (parse only, no execution).
log_info "Checking Node.js syntax..."
if ! node --check "$SCRAPER_FILE" 2>/dev/null; then
  log_error "Syntax errors found"
  # Rerun without suppressing stderr so the actual errors are shown.
  node --check "$SCRAPER_FILE"
  exit 1
fi
log_success "Syntax is valid"
# Check for new functions
#
# require_function NAME — succeed if NAME appears anywhere in the scraper
# source; otherwise log a fatal error and exit. (A plain substring grep:
# it does not verify NAME is actually defined as a function.)
require_function() {
  local fn=$1
  if grep -q "$fn" "$SCRAPER_FILE"; then
    log_success "$fn function found"
  else
    log_error "$fn function missing"
    exit 1
  fi
}

log_info "Checking for new extraction functions..."

# Every extraction entry point is mandatory; any miss aborts validation.
for fn in extractPropertyContactInfo extractOwnerContactInfo extractLinksFromPage; do
  require_function "$fn"
done
# Check for rate limiting configuration
#
# Rate-limiting settings are recommended but not fatal if absent, so a
# miss only warns. Each label's first word is the constant to grep for;
# the full label is used verbatim in the log messages.
log_info "Checking rate limiting configuration..."

for label in "MAX_PROPERTIES limit" "MAX_OWNERS limit" "PAGE_DELAY_MS"; do
  key=${label%% *}  # constant name = first word (whole label if no space)
  if grep -q "$key" "$SCRAPER_FILE"; then
    log_success "$label configured"
  else
    log_warning "$label not found"
  fi
done
# Check for email/phone extraction patterns
#
# Each entry is a grep pattern for selector/regex text expected to appear
# literally in the scraper source.
# NOTE(review): the last email/phone entries mix escaped and unescaped
# brackets and use \d/\s (not supported by grep BRE) — they look garbled;
# verify them against the actual literals in reonomy-scraper.js.
log_info "Checking contact extraction patterns..."

email_patterns=(
  'a\[href\^="mailto:"\]'
  '\.email'
  '\[a-zA-Z0-9._%+-]+@\[a-zA-Z0-9.-]+\.\[a-zA-Z\]{2,\}'
)

phone_patterns=(
  'a\[href\^="tel:"\]'
  '\.phone'
  '\(\?\d{3}\)\)?\[-.\s\]?\(\d{3}\)\[-.\s\]?\(\d{4}\)'
)

# Report the first matching pattern of each kind. Previously a total miss
# produced no output at all, silently defeating the check — now it warns.
email_found=false
for pattern in "${email_patterns[@]}"; do
  if grep -q -- "$pattern" "$SCRAPER_FILE"; then
    log_success "Email extraction pattern found: $pattern"
    email_found=true
    break
  fi
done
[ "$email_found" = true ] || log_warning "No email extraction pattern found"

phone_found=false
for pattern in "${phone_patterns[@]}"; do
  if grep -q -- "$pattern" "$SCRAPER_FILE"; then
    log_success "Phone extraction pattern found: $pattern"
    phone_found=true
    break
  fi
done
[ "$phone_found" = true ] || log_warning "No phone extraction pattern found"
# Check main scraper loop
#
# scan_stage MARKER LABEL — look for a marker comment in the scraper
# source; a miss is non-fatal because the comment wording may differ.
scan_stage() {
  local marker=$1 label=$2
  if grep -q "$marker" "$SCRAPER_FILE"; then
    log_success "$label page scraping logic found"
  else
    log_warning "$label page scraping comment not found (may be present with different wording)"
  fi
}

log_info "Checking main scraper loop..."

scan_stage "visit each property page" "Property"
scan_stage "visit each owner page" "Owner"
# Show configuration values
# Print the scraper's limit and delay declarations, indented one space.
log_info "Current configuration:"
echo
for cfg_re in '^(const|let).*=.*//.*limit' '^(const|let).*=.*PAGE_DELAY_MS'; do
  # || true: a regex with no matches is fine — we only display what exists.
  grep -E "$cfg_re" "$SCRAPER_FILE" | sed 's/^/ /' || true
done
echo
# Check dependencies
# Node.js is required; package.json and puppeteer are only recommended.
log_info "Checking dependencies..."

if ! command -v node > /dev/null 2>&1; then
  log_error "Node.js not found"
  exit 1
fi
NODE_VERSION=$(node --version)
log_success "Node.js installed: $NODE_VERSION"

if [ -f "$SCRIPT_DIR/package.json" ]; then
  log_success "package.json found"
else
  log_warning "package.json not found (npm install may be needed)"
fi

if [ -d "$SCRIPT_DIR/node_modules/puppeteer" ]; then
  log_success "puppeteer installed"
else
  log_warning "puppeteer not found - run: npm install puppeteer"
fi
# Closing summary and usage hints.
echo
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
log_success "All validation checks passed!"
echo

log_info "To run the scraper:"
cat <<EOF
 cd $SCRIPT_DIR
 ./scrape-reonomy.sh --location 'New York, NY'

EOF

log_info "Or with credentials:"
# Quoted delimiter: credential placeholders are printed literally.
cat <<'EOF'
 export REONOMY_EMAIL='your@email.com'
 export REONOMY_PASSWORD='yourpassword'
 node reonomy-scraper.js

EOF