clawdbot-workspace/reonomy-scraper-v5.js

368 lines
11 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v5 - LONGER WAITS + DEBUG
*
* Improvements:
* - Increased page load wait (10000ms instead of 2000ms)
* - Debug output for each page
* - Multiple wait strategies
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// Credentials are read from the environment only. They must never be committed
// to source control, so there is deliberately no in-code fallback.
// NOTE(review): credentials that were previously hard-coded in this file should
// be treated as leaked and rotated.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
  // Fail fast with an actionable message instead of typing `undefined` into the login form.
  throw new Error('Set the REONOMY_EMAIL and REONOMY_PASSWORD environment variables before running the scraper.');
}
// Search text typed into the Reonomy search box; `||` is intentional so an empty
// variable also falls back to the default.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// Headless only when HEADLESS is exactly the string 'true'; anything else shows the browser.
const HEADLESS = process.env.HEADLESS === 'true';
// Upper bound on how many search results are visited per run.
const MAX_PROPERTIES = 20;
const PAGE_LOAD_DELAY_MS = 10000; // Increased from 2000 to 10000
// NOTE(review): not referenced by any code visible in this file.
const MAX_WAIT_SECONDS = 45; // Maximum wait per property
// Extra per-page diagnostics are logged only when DEBUG is exactly 'true'.
const DEBUG = process.env.DEBUG === 'true';
// Output files (written next to this script)
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v5.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v5.log');
/**
 * Print a message to stdout and append a timestamped copy to the log file.
 *
 * The console copy is the bare message; the file copy is prefixed with an
 * ISO-8601 timestamp in square brackets and terminated with a newline.
 *
 * @param {string} message - Text to report.
 */
function log(message) {
  const stamp = new Date().toISOString();
  const entry = `[${stamp}] ${message}\n`;
  console.log(message);
  // Synchronous append so the entry is on disk before the function returns.
  fs.appendFileSync(LOG_FILE, entry);
}
/**
 * Return a promise that resolves (with no value) after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Log a short diagnostic snapshot of the current page when DEBUG is enabled.
 *
 * Reports the page URL, title, body text length, and how many `mailto:` and
 * `tel:` links are present, followed by up to two sample links of each kind.
 * Does nothing (and does not touch the page) when DEBUG is off.
 *
 * @param {object} page - Puppeteer page to inspect.
 * @param {string} label - Heading used for this snapshot in the log.
 * @returns {Promise<void>}
 */
async function debugLog(page, label) {
  if (DEBUG) {
    // The callback is evaluated by the page, so it must stay self-contained.
    const snapshot = await page.evaluate(() => {
      const mailAnchors = Array.from(document.querySelectorAll('a[href^="mailto:"]'));
      const telAnchors = Array.from(document.querySelectorAll('a[href^="tel:"]'));
      const firstHrefs = (anchors) => anchors.slice(0, 3).map((anchor) => anchor.href);
      return {
        url: window.location.href,
        title: document.title,
        bodyTextLength: document.body.innerText.length,
        emailCount: mailAnchors.length,
        phoneCount: telAnchors.length,
        mailtoLinks: firstHrefs(mailAnchors),
        telLinks: firstHrefs(telAnchors)
      };
    });

    const { url, title, bodyTextLength, emailCount, phoneCount, mailtoLinks, telLinks } = snapshot;
    const report = [
      `🔍 [DEBUG] ${label}:`,
      ` URL: ${url}`,
      ` Title: ${title}`,
      ` Body Text Length: ${bodyTextLength}`,
      ` Email Links: ${emailCount}`,
      ` Phone Links: ${phoneCount}`
    ];
    if (emailCount > 0) {
      report.push(` 📧 Emails: ${mailtoLinks.slice(0, 2).join(', ')}`);
    }
    if (phoneCount > 0) {
      report.push(` 📞 Phones: ${telLinks.slice(0, 2).join(', ')}`);
    }
    for (const line of report) {
      log(line);
    }
  }
}
/**
 * Navigate to a property's ownership page and scrape contact details from it.
 *
 * After navigation the function waits PAGE_LOAD_DELAY_MS plus a fixed 5 seconds
 * before reading the DOM, then collects:
 *   - emails from `mailto:` links (query string stripped, de-duplicated),
 *   - phones from `tel:` links (percent-decoded, de-duplicated),
 *   - the first line of body text that looks like "123 Street, City, ST 12345",
 *   - the text following an "Owns N properties" phrase as an owner name.
 *
 * @param {object} page - Puppeteer page used for navigation and extraction.
 * @param {string} propertyUrl - URL of the ownership page to load.
 * @returns {Promise<{emails: string[], phones: string[], address: string,
 *   owners: string[], pageTitle: string, pageHtmlSample: string}>}
 *   `pageHtmlSample` holds the page's body text only when it is shorter than
 *   500 characters (useful for spotting error or empty pages); otherwise ''.
 */
async function extractContactInfo(page, propertyUrl) {
  log(` 🔗 Navigating to ownership page...`);
  await page.goto(propertyUrl, {
    waitUntil: 'networkidle2',
    timeout: 60000
  });
  log(` ⏳ Waiting ${PAGE_LOAD_DELAY_MS}ms for content to load...`);
  await sleep(PAGE_LOAD_DELAY_MS);
  // Additional wait for dynamic content
  log(` ⏳ Waiting additional 5s for dynamic content...`);
  await sleep(5000);

  // The callback is evaluated by the page, so every helper it needs is defined inside it.
  const contactInfo = await page.evaluate(() => {
    const bodyText = document.body.innerText;

    // Collect the unique values behind links of one URI scheme (mailto:/tel:).
    const collectLinkValues = (selector, scheme, minLength) => {
      const unique = new Set();
      document.querySelectorAll(selector).forEach((anchor) => {
        // Drop the scheme and any "?subject=..." style query string.
        const raw = anchor.href.replace(scheme, '').split('?')[0];
        let value = raw;
        try {
          // hrefs are percent-encoded (e.g. "%20" for spaces in phone numbers).
          value = decodeURIComponent(raw);
        } catch {
          // Malformed escape sequence: keep the raw text rather than lose the contact.
        }
        value = value.trim();
        if (value.length > minLength) {
          unique.add(value);
        }
      });
      return [...unique];
    };

    const info = {
      emails: collectLinkValues('a[href^="mailto:"]', 'mailto:', 5),
      phones: collectLinkValues('a[href^="tel:"]', 'tel:', 7),
      address: '',
      owners: [],
      pageTitle: document.title,
      pageHtmlSample: ''
    };

    // Extract property address. The `m` flag lets `^` match at the start of any
    // line; without it the address would have to be the very first text on the
    // page. The street part is kept on a single line to avoid cross-line matches.
    const addressMatch = bodyText.match(/^(\d+[^,\n]+),\s*([A-Za-z\s]+),\s*([A-Z]{2})\s*(\d{5})/m);
    if (addressMatch) {
      info.address = addressMatch[0];
    }

    // Look for owner names following an "Owns N properties" phrase.
    const ownerMatch = bodyText.match(/Owns\s+(\d+)\s+properties?\s+([A-Za-z\s,]+)/i);
    const ownerName = ownerMatch ? ownerMatch[2].trim() : '';
    if (ownerName) {
      info.owners.push(ownerName);
    }

    // Keep the text of very short pages for debugging.
    if (bodyText.length < 500) {
      info.pageHtmlSample = bodyText;
    }
    return info;
  });

  log(` 📧 Emails: ${contactInfo.emails.length} found: ${contactInfo.emails.join(', ') || 'none'}`);
  log(` 📞 Phones: ${contactInfo.phones.length} found: ${contactInfo.phones.join(', ') || 'none'}`);
  log(` 📄 Page Title: ${contactInfo.pageTitle}`);
  return contactInfo;
}
/**
 * Extract the distinct property IDs linked from the search results page.
 *
 * Every anchor whose href contains "/property/" is inspected; the ID is the run
 * of hex digits and dashes that follows "property/". Several anchors may point
 * at the same property, so each ID is returned once, in first-seen order, with
 * the URL of the first anchor that referenced it.
 *
 * @param {object} page - Puppeteer page currently showing search results.
 * @returns {Promise<Array<{id: string, url: string}>>}
 */
async function extractPropertyIds(page) {
  // The callback is evaluated by the page, so it must stay self-contained.
  return page.evaluate(() => {
    const seenIds = new Set();
    const properties = [];
    for (const link of document.querySelectorAll('a[href*="/property/"]')) {
      const match = link.href.match(/property\/([a-f0-9-]+)/);
      if (!match) {
        continue;
      }
      const id = match[1];
      if (seenIds.has(id)) {
        // Duplicate link to an already-recorded property.
        continue;
      }
      seenIds.add(id);
      properties.push({ id, url: link.href });
    }
    return properties;
  });
}
/**
 * Step 1: log into Reonomy with the configured credentials.
 *
 * @param {object} page - Puppeteer page to drive.
 * @throws {Error} If the page is still on a login/auth URL after submitting.
 */
async function loginToReonomy(page) {
  log('📍 Step 1: Logging into Reonomy...');
  await page.goto('https://app.reonomy.com/#!/account', {
    waitUntil: 'domcontentloaded',
    timeout: 60000
  });
  await sleep(2000);
  await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
  await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
  await page.click('button[type="submit"]');
  log('⏳ Waiting for login...');
  await sleep(10000);
  // Check if logged in: a URL that still mentions login/auth means we were bounced back.
  const currentUrl = page.url();
  if (currentUrl.includes('login') || currentUrl.includes('auth')) {
    throw new Error('Login failed. Please check credentials.');
  }
  log('✅ Successfully logged in!');
}

/**
 * Steps 2-3: open the search page, search for SEARCH_LOCATION and return the
 * search ID found in the resulting URL.
 *
 * @param {object} page - Puppeteer page to drive.
 * @returns {Promise<string>} The search ID.
 * @throws {Error} If no search ID can be read from the page URL.
 */
async function runLocationSearch(page) {
  log(`\n📍 Step 2: Navigating to search...`);
  await page.goto('https://app.reonomy.com/#!/search', {
    waitUntil: 'networkidle2',
    timeout: 60000
  });
  await sleep(3000);

  log(`📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);
  let searchInput;
  try {
    searchInput = await page.waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
      timeout: 10000
    });
  } catch {
    // Preferred selectors did not appear in time; fall back to any text input.
    searchInput = await page.waitForSelector('input[type="text"]', { timeout: 5000 });
  }
  if (searchInput) {
    // Triple-click selects any existing text so typing replaces it.
    await searchInput.click({ clickCount: 3 });
    await searchInput.type(SEARCH_LOCATION, { delay: 100 });
    await sleep(1000);
    await page.keyboard.press('Enter');
    log('⏳ Searching...');
    await sleep(5000);
  }

  // Extract search ID from URL
  const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
  if (!urlMatch) {
    throw new Error('Could not extract search ID from URL');
  }
  const searchId = urlMatch[1];
  log(`✅ Search ID: ${searchId}`);
  return searchId;
}

/**
 * Visit one property's ownership page and build its lead record.
 *
 * @param {object} page - Puppeteer page to drive.
 * @param {{id: string, url: string}} prop - Property taken from the search results.
 * @param {number} index - Zero-based position of the property in this run.
 * @param {number} total - Number of properties being scraped in this run.
 * @param {string} searchId - Search ID used to build the ownership URL.
 * @returns {Promise<object>} The lead record for this property.
 */
async function scrapeProperty(page, prop, index, total, searchId) {
  // Calculate wait time: start small, increase for later properties
  const extraWaitMs = Math.min(index * 1000, 10000); // Up to 10s extra wait
  const totalWaitMs = PAGE_LOAD_DELAY_MS + extraWaitMs;
  const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;

  log(`\n[${index + 1}/${total}] Property ID: ${prop.id}`);
  log(` 🕐 Wait time: ${(totalWaitMs / 1000).toFixed(1)}s (base: ${PAGE_LOAD_DELAY_MS / 1000}s + ${extraWaitMs / 1000}s extra)`);
  log(` 🔗 Ownership URL: ${ownershipUrl}`);
  log(` 📥 Navigating...`);
  await page.goto(ownershipUrl, {
    waitUntil: 'networkidle2',
    timeout: 60000
  });
  // Debug log after navigation
  await debugLog(page, `Property ${index + 1}`);
  log(` ⏳ Base wait ${PAGE_LOAD_DELAY_MS}ms...`);
  await sleep(PAGE_LOAD_DELAY_MS);
  log(` ⏳ Additional wait ${extraWaitMs}ms...`);
  await sleep(extraWaitMs);

  // extractContactInfo navigates to the URL it is given, so it must receive the
  // ownership URL; handing it prop.url would leave the ownership page and
  // scrape whatever page the search-result link points at instead.
  const contactInfo = await extractContactInfo(page, ownershipUrl);
  return {
    scrapeDate: new Date().toISOString().split('T')[0],
    propertyId: prop.id,
    propertyUrl: prop.url,
    ownershipUrl: ownershipUrl,
    address: contactInfo.address || '',
    emails: contactInfo.emails,
    phones: contactInfo.phones,
    owners: contactInfo.owners,
    pageTitle: contactInfo.pageTitle,
    searchLocation: SEARCH_LOCATION,
    searchId: searchId
  };
}

/**
 * Write the collected leads to OUTPUT_FILE as pretty-printed JSON, or log a
 * warning when there is nothing to save.
 *
 * @param {object[]} leads - Lead records collected in this run.
 * @param {string} searchId - Search ID the leads belong to.
 */
function saveLeads(leads, searchId) {
  if (leads.length === 0) {
    log('\n⚠ No leads scraped.');
    return;
  }
  log(`\n✅ Total leads scraped: ${leads.length}`);
  const outputData = {
    scrapeDate: new Date().toISOString(),
    location: SEARCH_LOCATION,
    searchId: searchId,
    leadCount: leads.length,
    leads: leads
  };
  fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
  log(`💾 Saved to: ${OUTPUT_FILE}`);
}

/**
 * Main scraper: log in, run the location search, visit up to MAX_PROPERTIES
 * ownership pages and save the extracted leads to OUTPUT_FILE.
 *
 * A property that fails to load or parse is logged and skipped so the leads
 * already collected are still saved. Failures before the per-property loop
 * (login, search, no results) abort the run; an error screenshot is attempted
 * and the error is rethrown. The browser is always closed.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} If login, search or property discovery fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v5 (LONGER WAITS + DEBUG)...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  const leads = [];
  let page = null;
  try {
    // Page setup happens inside the try so the browser is closed even if it fails.
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    await loginToReonomy(page);
    const searchId = await runLocationSearch(page);

    // Extract property IDs from search results
    log('\n📍 Step 4: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found. The page structure may have changed.');
      throw new Error('No properties found on search page');
    }

    // Limit to MAX_PROPERTIES
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 5: Scraping ${propertiesToScrape.length} properties with extended waits...`);
    for (const [index, prop] of propertiesToScrape.entries()) {
      try {
        leads.push(await scrapeProperty(page, prop, index, propertiesToScrape.length, searchId));
      } catch (error) {
        // One bad property must not discard the leads collected so far.
        log(` ❌ Skipping property ${prop.id}: ${error.message}`);
      }
      // Rate limiting between properties
      const rateLimitDelay = 5000; // 5 seconds between properties
      log(` ⏸ Rate limit delay: ${rateLimitDelay}ms...`);
      await sleep(rateLimitDelay);
    }

    saveLeads(leads, searchId);
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v5-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v5-error.png');
      } catch (screenshotError) {
        // Best effort only: report the failure but keep the original error as the cause of the abort.
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Entry point: run the scraper once and map its outcome to the process exit
// code (0 on success, 1 on any failure).
async function runCli() {
  try {
    const result = await scrapeLeads();
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    // Console only: the output path hint is not written to the log file.
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
}
runCli();