clawdbot-workspace/reonomy-scraper-v4-final.js

284 lines
8.0 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v4 - FINAL VERSION
*
* Key discoveries from browser inspection:
* 1. Search for location → Get search-id from URL
* 2. Extract all property IDs from search results
* 3. Navigate to ownership view for each property:
* /search/{search-id}/property/{property-id}/ownership
* 4. Extract emails/phones from mailto:/tel: links
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// Credentials and search location come from the environment; when a variable
// is unset or empty the literal after `||` is used instead.
// NOTE(review): the fallbacks below look like real account credentials
// committed to source — consider removing them and failing fast when the
// environment variables are missing.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// True only when HEADLESS is exactly the string 'true'; any other value,
// including unset, leaves it false.
const HEADLESS = process.env.HEADLESS === 'true';
const MAX_PROPERTIES = 20; // Number of properties to scrape
const PAGE_DELAY_MS = 3000; // Rate limiting delay between ownership pages
// Output files
// Both files are written next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v4.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v4.log');
/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * The console output is the bare message; the file entry is prefixed with an
 * ISO-8601 timestamp in square brackets and terminated with a newline.
 *
 * @param {string} message - Text to record.
 */
function log(message) {
  // Build the file entry first so the timestamp reflects the moment of the call.
  const entry = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, entry);
}
/**
 * Pause for a fixed duration.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that fulfils once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Extract contact info from the ownership page currently loaded in `page`.
 *
 * Emails and phone numbers are read from `mailto:` / `tel:` anchors. Each
 * value has its scheme and any `?query` suffix removed, is percent-decoded and
 * trimmed, and duplicates are dropped while keeping first-seen order. Emails
 * must be longer than 5 characters and phones longer than 7 to be kept.
 *
 * The address is the first line of the page text shaped like
 * "<number> <street>, <city>, <ST> <5-digit zip>", or '' when none is found.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; this file passes a Puppeteer page.
 * @param {string} propertyUrl - Unused; kept so existing callers keep working.
 * @returns {Promise<{emails: string[], phones: string[], address: string}>}
 */
async function extractContactInfo(page, propertyUrl) {
  // The callback is executed by page.evaluate against the page's `document`,
  // so it is kept self-contained and references no Node-side helpers.
  return page.evaluate(() => {
    const collectLinkTargets = (selector, schemePattern, minLength) => {
      const unique = new Set();
      document.querySelectorAll(selector).forEach((anchor) => {
        // Drop the scheme and any "?subject=..."-style parameters.
        let value = anchor.href.replace(schemePattern, '').split('?')[0];
        try {
          value = decodeURIComponent(value);
        } catch (decodeError) {
          // Malformed percent-escape: keep the raw text rather than lose the contact.
        }
        value = value.trim();
        if (value.length > minLength) {
          unique.add(value);
        }
      });
      return [...unique];
    };

    const pageText = document.body ? document.body.innerText || '' : '';
    // The `m` flag lets `^` match at the start of any line, not only at the
    // start of the whole page text; the street part may not span lines.
    const addressMatch = pageText.match(/^(\d+[^,\n]+),\s*([A-Za-z\s]+),\s*([A-Z]{2})\s*(\d{5})/m);

    return {
      emails: collectLinkTargets('a[href^="mailto:"]', /^mailto:/i, 5),
      phones: collectLinkTargets('a[href^="tel:"]', /^tel:/i, 7),
      address: addressMatch ? addressMatch[0] : '',
    };
  });
}
/**
 * Extract property IDs from the search results page currently loaded in `page`.
 *
 * Looks at every anchor whose href contains "/property/" and captures the run
 * of hex digits and dashes that follows "property/". A property that is linked
 * more than once is returned a single time, with the URL of its first link, so
 * callers that cap the list do not spend their budget on repeats.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; this file passes a Puppeteer page.
 * @returns {Promise<Array<{id: string, url: string}>>} Unique properties in document order.
 */
async function extractPropertyIds(page) {
  // The callback is executed by page.evaluate against the page's `document`,
  // so it is kept self-contained and references no Node-side helpers.
  return page.evaluate(() => {
    const seenIds = new Set();
    const properties = [];
    document.querySelectorAll('a[href*="/property/"]').forEach((link) => {
      const match = link.href.match(/property\/([a-f0-9-]+)/);
      if (!match || seenIds.has(match[1])) {
        return;
      }
      seenIds.add(match[1]);
      properties.push({ id: match[1], url: link.href });
    });
    return properties;
  });
}
/**
 * Main scraper: log in, run a location search, then visit the ownership view
 * of up to MAX_PROPERTIES results and collect their contact details.
 *
 * When at least one lead was collected the results are written to OUTPUT_FILE
 * as JSON. A property whose ownership page fails to load or parse is logged
 * and skipped, so leads gathered before it are not lost.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   collected and the path of the output file.
 * @throws {Error} If the post-login URL still contains "login" or "auth", the
 *   search ID cannot be read from the URL, or the search page yields no
 *   property links. A screenshot is attempted before the error is rethrown.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v4 (FINAL VERSION)...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  // Declared outside the try so the catch block can screenshot it, but created
  // inside so a failure here still reaches the finally that closes the browser.
  let page;
  const leads = [];
  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    // Fixed wait; the URL check below is the only login verification.
    await sleep(10000);

    // Check if logged in
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Navigate to search
    log(`\n📍 Step 2: Navigating to search...`);
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);

    // Perform search
    log(`📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);
    // Try the specific placeholders first, then fall back to any text input.
    const searchInput = await page.waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
      timeout: 10000
    }).catch(() => {
      return page.waitForSelector('input[type="text"]', { timeout: 5000 });
    });
    if (searchInput) {
      // Triple-click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      await sleep(5000);
    }

    // Extract search ID from URL
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs from search results
    log('\n📍 Step 4: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found. The page structure may have changed.');
      throw new Error('No properties found on search page');
    }

    // Limit to MAX_PROPERTIES
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);

    // For each property, visit ownership page and extract contact info
    log(`\n📍 Step 5: Scraping ${propertiesToScrape.length} properties...`);
    for (const [i, prop] of propertiesToScrape.entries()) {
      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);
      // Build ownership URL
      const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;
      try {
        log(` 🔗 Navigating to ownership page...`);
        await page.goto(ownershipUrl, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });
        await sleep(2000);

        // Extract contact info
        const contactInfo = await extractContactInfo(page, prop.url);
        log(` 📧 Emails: ${contactInfo.emails.length} - ${contactInfo.emails.join(', ') || 'none'}`);
        log(` 📞 Phones: ${contactInfo.phones.length} - ${contactInfo.phones.join(', ') || 'none'}`);
        leads.push({
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: prop.url,
          ownershipUrl: ownershipUrl,
          address: contactInfo.address || '',
          emails: contactInfo.emails,
          phones: contactInfo.phones,
          searchLocation: SEARCH_LOCATION,
          searchId: searchId
        });
      } catch (propertyError) {
        // One bad page must not discard the leads already collected.
        log(` ⚠️ Skipping property ${prop.id}: ${propertyError.message}`);
      }

      // Rate limiting (applies after failures too)
      if (i < propertiesToScrape.length - 1) {
        await sleep(PAGE_DELAY_MS);
      }
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      const outputData = {
        scrapeDate: new Date().toISOString(),
        location: SEARCH_LOCATION,
        searchId: searchId,
        leadCount: leads.length,
        leads: leads
      };
      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    if (page) {
      // Best-effort diagnostics: a screenshot failure is reported, never rethrown.
      try {
        await page.screenshot({ path: '/tmp/reonomy-v4-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v4-error.png');
      } catch (screenshotError) {
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    // A throw from finally would replace any error already propagating, so a
    // failed close is logged instead.
    try {
      await browser.close();
      log('\n🔚 Browser closed');
    } catch (closeError) {
      log(`\n⚠️ Failed to close browser: ${closeError.message}`);
    }
  }
}
// Run: start the scrape and turn its outcome into a process exit code
// (0 on success, 1 on any failure).
(async () => {
  try {
    const outcome = await scrapeLeads();
    log(`\n🎉 Success! ${outcome.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${outcome.outputFile}`);
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
})();