clawdbot-workspace/test-reonomy-scraper.sh

177 lines
4.6 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
#
# Quick validation script for the Reonomy scraper update.
#
# Verifies that reonomy-scraper.js (expected to sit next to this script)
# is syntactically valid and contains the expected extraction functions,
# rate-limit settings and contact patterns. Exits non-zero on the first
# fatal problem; soft issues are reported as warnings.
#
# set -e alone misses unset variables and mid-pipeline failures;
# use the full strict mode.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRAPER_FILE="$SCRIPT_DIR/reonomy-scraper.js"
readonly SCRIPT_DIR SCRAPER_FILE
# ANSI color codes for human-readable output.
readonly GREEN='\033[0;32m'
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'
# Logging helpers: each prints a single colorized line to stdout.
# NOTE(review): log_info's leading space before $1 looks like a stripped
# emoji (the upstream file carried invisible Unicode); preserved as-is.
log_info() { echo -e "${BLUE} $1${NC}"; }
log_success() { echo -e "${GREEN}$1${NC}"; }
log_error() { echo -e "${RED}$1${NC}"; }
log_warning() { echo -e "${YELLOW}⚠️ $1${NC}"; }
echo "🔍 Reonomy Scraper Validation"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo
# Abort immediately unless the scraper source is present.
if [[ ! -f "$SCRAPER_FILE" ]]; then
  log_error "Scraper file not found: $SCRAPER_FILE"
  exit 1
fi
log_success "Scraper file found"
# Validate the JS with node's parser; on failure, rerun without the
# stderr suppression so the actual error is visible, then bail out.
log_info "Checking Node.js syntax..."
if ! node --check "$SCRAPER_FILE" 2>/dev/null; then
  log_error "Syntax errors found"
  node --check "$SCRAPER_FILE"
  exit 1
fi
log_success "Syntax is valid"
# Check for new functions
log_info "Checking for new extraction functions..."
# Each extraction helper is mandatory; a missing one is a fatal error.
for fn in extractPropertyContactInfo extractOwnerContactInfo extractLinksFromPage; do
  if grep -q "$fn" "$SCRAPER_FILE"; then
    log_success "$fn function found"
  else
    log_error "$fn function missing"
    exit 1
  fi
done
# Check for rate limiting configuration
log_info "Checking rate limiting configuration..."
# Report whether a named setting appears in the scraper source.
# Missing settings are soft failures (warning only).
check_setting() {
  local name=$1 label=$2
  if grep -q "$name" "$SCRAPER_FILE"; then
    log_success "$label configured"
  else
    log_warning "$label not found"
  fi
}
check_setting MAX_PROPERTIES "MAX_PROPERTIES limit"
check_setting MAX_OWNERS "MAX_OWNERS limit"
check_setting PAGE_DELAY_MS "PAGE_DELAY_MS"
# Check for email/phone extraction patterns
log_info "Checking contact extraction patterns..."
# Each entry is an ERE fragment expected to appear somewhere in the
# scraper source (a mailto:/tel: selector, a CSS-class selector, or an
# inline regex literal). The first match per group wins.
# NOTE(review): the previous patterns were mangled by escaping (unbalanced
# \[...\] pairs) and relied on \d, which plain grep does not support, so
# they could never match; rewritten as portable EREs.
email_patterns=(
  'mailto:'
  '\.email'
  '[a-zA-Z0-9._%+-]\+@'
)
phone_patterns=(
  'tel:'
  '\.phone'
  '[0-9]{3}.*[0-9]{3}.*[0-9]{4}'
)
email_found=0
for pattern in "${email_patterns[@]}"; do
  if grep -qE -- "$pattern" "$SCRAPER_FILE"; then
    log_success "Email extraction pattern found: $pattern"
    email_found=1
    break
  fi
done
# Previously a complete miss was silent; surface it as a warning.
[ "$email_found" -eq 1 ] || log_warning "No email extraction pattern found"
phone_found=0
for pattern in "${phone_patterns[@]}"; do
  if grep -qE -- "$pattern" "$SCRAPER_FILE"; then
    log_success "Phone extraction pattern found: $pattern"
    phone_found=1
    break
  fi
done
[ "$phone_found" -eq 1 ] || log_warning "No phone extraction pattern found"
# Check main scraper loop
log_info "Checking main scraper loop..."
# Look for the descriptive comments that mark each scraping loop; absence
# is only a warning since the wording in the source may differ.
check_loop_marker() {
  local needle=$1 what=$2
  if grep -q "$needle" "$SCRAPER_FILE"; then
    log_success "$what page scraping logic found"
  else
    log_warning "$what page scraping comment not found (may be present with different wording)"
  fi
}
check_loop_marker "visit each property page" "Property"
check_loop_marker "visit each owner page" "Owner"
# Show configuration values
log_info "Current configuration:"
echo
# Best-effort display of limit/delay declarations; a miss is not an error,
# so grep's non-zero status is deliberately swallowed.
{ grep -E "^(const|let).*=.*//.*limit" "$SCRAPER_FILE" || true; } | sed 's/^/ /'
{ grep -E "^(const|let).*=.*PAGE_DELAY_MS" "$SCRAPER_FILE" || true; } | sed 's/^/ /'
echo
# Check dependencies
log_info "Checking dependencies..."
# node is mandatory; everything else only produces a warning.
if ! command -v node >/dev/null 2>&1; then
  log_error "Node.js not found"
  exit 1
fi
log_success "Node.js installed: $(node --version)"
if [[ -f "$SCRIPT_DIR/package.json" ]]; then
  log_success "package.json found"
else
  log_warning "package.json not found (npm install may be needed)"
fi
if [[ -d "$SCRIPT_DIR/node_modules/puppeteer" ]]; then
  log_success "puppeteer installed"
else
  log_warning "puppeteer not found - run: npm install puppeteer"
fi
echo
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# NOTE(review): warnings above are non-fatal by design, so this summary
# line prints even when some optional checks produced warnings.
log_success "All validation checks passed!"
echo
log_info "To run the scraper:"
cat <<EOF
 cd $SCRIPT_DIR
 ./scrape-reonomy.sh --location 'New York, NY'
EOF
echo
log_info "Or with credentials:"
cat <<'EOF'
 export REONOMY_EMAIL='your@email.com'
 export REONOMY_PASSWORD='yourpassword'
 node reonomy-scraper.js
EOF
echo