clawdbot-workspace/reonomy-scraper-v3.js

316 lines
9.7 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v3 - Corrected URL Pattern & Selectors
*
* Based on DOM analysis:
* - Correct URL: /search/{search-id}/property/{property-id}/ownership
* - Email selector: a[href^="mailto:"]
* - Phone selector: a[href^="tel:"]
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// Login credentials are read from the environment, falling back to the literals below.
// NOTE(review): the fallbacks embed what look like real account credentials in source
// control — consider removing them and requiring the env vars instead.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
// Text typed into the search box; overridable through REONOMY_LOCATION.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// True only when HEADLESS is exactly the string 'true'; unset or any other value is false.
const HEADLESS = process.env.HEADLESS === 'true';
const MAX_PROPERTIES = 10; // Number of properties to scrape
const PAGE_DELAY_MS = 3000; // Rate limiting delay
// Output files
// Both files are placed in the same directory as this script (__dirname).
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v3.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v3.log');
/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * The console receives the bare message; the log file receives
 * "[<ISO-8601 timestamp>] <message>" followed by a newline.
 *
 * @param {*} message - Value to log (interpolated into the file entry).
 */
function log(message) {
  const entry = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  // Synchronous append, so the entry is written before log() returns.
  fs.appendFileSync(LOG_FILE, entry);
}
/**
 * Pause helper: returns a promise that is fulfilled (with no value) once the
 * timer scheduled for `ms` milliseconds fires.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Extract contact info from the ownership page currently loaded in `page`.
 *
 * All DOM work happens inside a single `page.evaluate` callback, so every
 * helper it needs is defined inside that callback.
 *
 * - Emails come from `a[href^="mailto:"]` links; the `?subject=...` query is
 *   dropped, comma-separated recipients are split, percent-encoding is decoded.
 * - Phones come from `a[href^="tel:"]` links, decoded the same way.
 * - Emails and phones are de-duplicated, keeping first-seen order.
 * - The address is the first line of the page text that looks like
 *   "<number> <street>, <city>, <ST> <zip>".
 * - The owner is the text following "Owns <n> properties".
 *
 * @param {object} page - Puppeteer page; only `page.evaluate` is used.
 * @returns {Promise<{emails: string[], phones: string[], owners: string[], address: string, propertyDetails: object}>}
 */
async function extractContactInfo(page) {
  return page.evaluate(() => {
    const info = {
      emails: [],
      phones: [],
      owners: [],
      address: '',
      propertyDetails: {},
    };

    // Decode percent-escapes; fall back to the raw text if the escape sequence is malformed.
    const decode = (raw) => {
      try {
        return decodeURIComponent(raw).trim();
      } catch {
        return raw.trim();
      }
    };

    // Extract emails
    const emails = new Set();
    document.querySelectorAll('a[href^="mailto:"]').forEach((a) => {
      // Drop the scheme and any "?subject=..." style query before splitting recipients.
      const recipients = a.href.replace(/^mailto:/i, '').split('?')[0];
      for (const part of recipients.split(',')) {
        const email = decode(part);
        if (email.length > 5) {
          emails.add(email);
        }
      }
    });
    info.emails = [...emails];

    // Extract phones
    const phones = new Set();
    document.querySelectorAll('a[href^="tel:"]').forEach((a) => {
      const phone = decode(a.href.replace(/^tel:/i, '').split('?')[0]);
      if (phone.length > 7) {
        phones.add(phone);
      }
    });
    info.phones = [...phones];

    const pageText = document.body.innerText;

    // Extract property address. The `m` flag lets `^` match at the start of any
    // line instead of only at the very first character of the page text, and the
    // street part is kept on one line so a match cannot start on an unrelated
    // numeric line.
    const addressMatch = pageText.match(/^(\d+[^,\n]+),\s*([A-Za-z\s]+),\s*([A-Z]{2})\s*(\d{5})/m);
    if (addressMatch) {
      info.address = addressMatch[0];
    }

    // Look for owner names (from page structure discovered)
    const ownerMatch = pageText.match(/Owns\s+(\d+)\s+properties?\s+([A-Za-z\s,]+)/i);
    const ownerName = ownerMatch ? ownerMatch[2].trim() : '';
    if (ownerName) {
      info.owners.push(ownerName);
    }

    return info;
  });
}
/**
 * Main scraper.
 *
 * Launches a browser, logs into Reonomy, searches for SEARCH_LOCATION, clicks up
 * to MAX_PROPERTIES property buttons on the results page, visits each property's
 * ownership page and collects contact info. When at least one lead was collected
 * the results are written to OUTPUT_FILE as JSON.
 *
 * On any error a full-page screenshot is attempted (best effort) and the error is
 * re-thrown. The browser is always closed.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   collected and the output path (returned even if nothing was written).
 * @throws {Error} If login appears to have failed, the search ID cannot be read
 *   from the URL, or any Puppeteer call fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v3...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080'],
  });
  const leads = [];
  // Declared outside the try so the catch block can screenshot it; created inside
  // the try so a failing newPage() still reaches the finally that closes the browser.
  let page;
  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });
    await sleep(2000);
    // Wait for the form instead of relying only on the fixed delay above.
    await page.waitForSelector('input[type="email"]', { timeout: 30000 });
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await sleep(10000);

    // Check if logged in
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Navigate to search
    log(`\n📍 Step 2: Navigating to search...`);
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    await sleep(3000);

    // Perform search
    log(`📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);
    const searchInput = await page
      .waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
        timeout: 10000,
      })
      // Fall back to any text input when the placeholder-based selector is not found.
      .catch(() => page.waitForSelector('input[type="text"]', { timeout: 5000 }));
    if (searchInput) {
      // Triple click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      await sleep(5000);
    }

    // Extract search ID from URL
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // STEP: We need to find property IDs from the search results page
    // The properties are dynamically loaded, so we need to inspect how they're loaded
    log('\n📍 Step 4: Finding property IDs...');
    log('⚠️ Properties are dynamically loaded - checking DOM structure...');

    // Check if properties are visible.
    // The callback runs in the browser, where Node-side constants do not exist,
    // so the limit is passed in as an argument instead of being closed over.
    const propertyButtons = await page.evaluate((limit) => {
      const buttons = [];
      document.querySelectorAll('button').forEach((b) => {
        const text = b.textContent.trim();
        // Look for property patterns in button text
        const propertyMatch = text.match(/^(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
        if (propertyMatch) {
          // Groups: 1 = street, 2 = city, 3 = state, 4 = zip.
          buttons.push({
            text: text,
            address: propertyMatch[0],
            city: propertyMatch[2].trim(),
            state: propertyMatch[3],
            zip: propertyMatch[4],
            hasAddress: true,
          });
        }
      });
      return buttons.slice(0, limit);
    }, MAX_PROPERTIES);

    if (propertyButtons.length === 0) {
      log('⚠️ No property buttons found. Properties may be loaded differently.');
      log('💡 Trying alternative: Click on "Recently Viewed Properties" section...');
      // Try to find property links directly
      await sleep(2000);
    } else {
      log(`✅ Found ${propertyButtons.length} property buttons`);
      const total = Math.min(propertyButtons.length, MAX_PROPERTIES);
      // For each property button, we need to click it and get the property ID from the URL
      for (let i = 0; i < total; i++) {
        const prop = propertyButtons[i];
        log(`\n[${i + 1}/${total}] ${prop.address || prop.text.substring(0, 40)}...`);

        // Click property button (matched by the first 20 characters of its text)
        await page.evaluate((prop) => {
          const buttons = Array.from(document.querySelectorAll('button'));
          const target = buttons.find(
            (b) =>
              b.textContent.includes(prop.address?.substring(0, 20)) ||
              b.textContent.includes(prop.text?.substring(0, 20))
          );
          if (target) {
            target.click();
          }
        }, prop);
        await sleep(3000);

        // Extract property ID from URL
        const newUrl = page.url();
        const propIdMatch = newUrl.match(/property\/([a-f0-9-]+)/);
        if (propIdMatch) {
          const propertyId = propIdMatch[1];
          // Navigate to ownership page for contact info
          const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${propertyId}/ownership`;
          log(` 🔍 Navigating to ownership page...`);
          await page.goto(ownershipUrl, {
            waitUntil: 'networkidle2',
            timeout: 30000,
          });
          await sleep(2000);

          // Extract contact info
          const contactInfo = await extractContactInfo(page);
          log(` 📧 Emails: ${contactInfo.emails.length} found: ${contactInfo.emails.join(', ') || 'none'}`);
          log(` 📞 Phones: ${contactInfo.phones.length} found: ${contactInfo.phones.join(', ') || 'none'}`);
          const lead = {
            scrapeDate: new Date().toISOString().split('T')[0],
            propertyAddress: contactInfo.address || prop.address || '',
            city: prop.city || '',
            state: prop.state || '',
            zip: prop.zip || '',
            emails: contactInfo.emails,
            phones: contactInfo.phones,
            owners: contactInfo.owners,
            propertyUrl: `https://app.reonomy.com/#!/property/${propertyId}`,
            ownershipUrl: ownershipUrl,
          };
          leads.push(lead);

          // Rate limiting (skipped after the last property)
          if (i < total - 1) {
            await sleep(PAGE_DELAY_MS);
          }
        } else {
          log(' ⚠️ Could not extract property ID from URL');
        }

        // Go back to search results
        await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
          waitUntil: 'networkidle2',
          timeout: 30000,
        });
        await sleep(2000);
      }
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      const outputData = {
        scrapeDate: new Date().toISOString(),
        location: SEARCH_LOCATION,
        searchId: searchId,
        leadCount: leads.length,
        leads: leads,
      };
      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // The screenshot is best effort: a failure here must not mask the original error.
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v3-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v3-error.png');
      } catch (screenshotError) {
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Run
// Entry point: run the scraper, report the outcome, and exit with 0 on success
// or 1 when the scraper (or the success reporting itself) throws.
(async () => {
  try {
    const result = await scrapeLeads();
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
})();