clawdbot-workspace/reonomy-scraper-working.js

324 lines
8.5 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Lead Scraper - Working JSON Fallback Version
*
* Extracts property and owner leads from Reonomy dashboard/search
* and saves to JSON (no Google Sheets dependency).
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// Credentials and the search target are read from the environment; when a
// variable is unset or empty the literal on the right-hand side is used.
// FIXME(security): the fallback email/password look like real account
// credentials committed to source control — confirm, rotate them, and require
// the environment variables instead of shipping defaults.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'New York, NY';
// True only when HEADLESS is exactly the string 'true'; any other value
// (including unset) leaves this false.
const HEADLESS = process.env.HEADLESS === 'true';
// Output file
// Both the leads JSON and the log file live next to this script (__dirname).
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper.log');
/**
 * Write a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * The console output is the raw message; only the file entry carries the
 * ISO-8601 timestamp prefix and a trailing newline.
 *
 * @param {string} message - Text to log.
 */
function log(message) {
  const entry = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  // Synchronous append: the entry is on disk before the call returns.
  fs.appendFileSync(LOG_FILE, entry);
}
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<undefined>} Promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Save leads to JSON file.
 *
 * Wraps the leads in an envelope (scrape timestamp, count, search location)
 * and writes it, pretty-printed, to OUTPUT_FILE. Serialisation and write
 * failures are logged rather than thrown.
 *
 * @param {Array<object>} leads - Lead records to persist.
 * @returns {string|null} OUTPUT_FILE on success, null if serialising or writing failed.
 */
function saveLeads(leads) {
  const envelope = {
    scrapeDate: new Date().toISOString(),
    leadCount: leads.length,
    location: SEARCH_LOCATION,
    leads
  };

  let savedPath = null;
  try {
    const serialized = JSON.stringify(envelope, null, 2);
    fs.writeFileSync(OUTPUT_FILE, serialized);
    log(`💾 Saved ${leads.length} leads to ${OUTPUT_FILE}`);
    savedPath = OUTPUT_FILE;
  } catch (error) {
    log(`❌ Error saving to JSON: ${error.message}`);
  }
  return savedPath;
}
/**
 * Extract properties from page.
 *
 * Collects every anchor whose href contains "/property/" and whose visible
 * text begins with a "<street>, <city>, <ST> <zip>" address, then converts
 * each one into a lead record. Whatever text follows the address is treated
 * as "<property type> <square footage>".
 *
 * @param {object} page - Puppeteer page currently showing search results.
 * @returns {Promise<Array<object>>} One lead per matching link; owner-related
 *   fields are left as empty strings.
 */
async function extractProperties(page) {
  log('🔍 Extracting property data...');

  // This callback is executed in the browser context, so it must not close
  // over anything defined in this Node module.
  const scraped = await page.evaluate(() => {
    const addressPattern = /^(\d+.+),\s*([A-Za-z\s]+),\s*([A-Z]{2})\s*(\d{5})/;
    const collected = [];
    for (const anchor of Array.from(document.querySelectorAll('a[href*="/property/"]'))) {
      const label = (anchor.innerText || anchor.textContent || '').trim();
      const hit = label.match(addressPattern);
      if (hit) {
        const [matched, street, city, state, zip] = hit;
        collected.push({
          fullText: label,
          address: street.trim(),
          city: city.trim(),
          state: state.trim(),
          zip: zip.trim(),
          url: anchor.href,
          // Everything after the address: property type and size, if present.
          remainingText: label.substring(matched.length).trim()
        });
      }
    }
    return collected;
  });

  const scrapeDate = new Date().toISOString().split('T')[0];
  const leads = scraped.map((item) => {
    const sizeHit = item.remainingText.match(/(\d+\.?\d*)\s*k?\s*SF/i);
    const squareFootage = sizeHit ? sizeHit[0] : '';
    // Removing the size token leaves the property type (or an empty string).
    const propertyType = item.remainingText.replace(squareFootage, '').trim();
    return {
      scrapeDate,
      ownerName: '',
      propertyAddress: item.address,
      city: item.city,
      state: item.state,
      zip: item.zip,
      propertyType,
      squareFootage,
      ownerLocation: '',
      propertyCount: '',
      propertyUrl: item.url,
      ownerUrl: '',
      email: '',
      phone: ''
    };
  });

  log(`✅ Extracted ${leads.length} properties`);
  return leads;
}
/**
 * Split an owner location line into city and state.
 *
 * When the last comma-separated part is a two-letter upper-case code it is
 * taken as the state, and the city is the trailing run of capitalised words
 * in the part before it (this drops any leading prefix text). Otherwise a
 * plain two-part "a, b" string is returned as city "a" / state "b".
 *
 * @param {string} location - Raw location text, possibly empty.
 * @returns {{city: string, state: string}} Empty strings where nothing could be parsed.
 */
function parseOwnerCityState(location) {
  if (!location.includes(',')) {
    return { city: '', state: '' };
  }
  const parts = location.split(',').map(p => p.trim());
  const lastPart = parts[parts.length - 1];
  if (/^[A-Z]{2}$/.test(lastPart)) {
    const cityWithPrefix = parts[parts.length - 2];
    const cityMatch = cityWithPrefix.match(/(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)$/);
    return { city: cityMatch ? cityMatch[1] : '', state: lastPart };
  }
  if (parts.length === 2) {
    return { city: parts[0], state: parts[1] };
  }
  return { city: '', state: '' };
}

/**
 * Extract owners from page.
 *
 * Collects every anchor whose href contains "/person/" and whose text has at
 * least two non-empty lines. The first line is the owner name; the location
 * is the first LATER line containing a comma, and the property count is the
 * number preceding "propert…" anywhere in the link text.
 *
 * @param {object} page - Puppeteer page currently showing search results.
 * @returns {Promise<Array<object>>} One lead per owner link; property-related
 *   fields are left as empty strings.
 */
async function extractOwners(page) {
  log('🔍 Extracting owner data...');

  // This callback is executed in the browser context, so it must not close
  // over anything defined in this Node module.
  const owners = await page.evaluate(() => {
    const results = [];
    const ownerLinks = Array.from(document.querySelectorAll('a[href*="/person/"]'));
    ownerLinks.forEach(link => {
      const text = (link.innerText || link.textContent || '').trim();
      const lines = text.split('\n').map(l => l.trim()).filter(l => l);
      if (lines.length >= 2) {
        const ownerName = lines[0];
        // Search only the lines after the name: a name such as
        // "Acme Holdings, LLC" contains a comma and would otherwise be
        // picked up as the location.
        const location = lines.slice(1).find(l => l.includes(',')) || '';
        const propertyCountMatch = text.match(/(\d+)\s*propert/i);
        const propertyCount = propertyCountMatch ? propertyCountMatch[1] : '';
        results.push({
          ownerName,
          location,
          propertyCount,
          url: link.href,
          fullText: text
        });
      }
    });
    return results;
  });

  const scrapeDate = new Date().toISOString().split('T')[0];
  const leads = [];
  for (const owner of owners) {
    const { city, state } = parseOwnerCityState(owner.location);
    leads.push({
      scrapeDate,
      ownerName: owner.ownerName,
      propertyAddress: '',
      city,
      state,
      zip: '',
      propertyType: '',
      squareFootage: '',
      ownerLocation: owner.location,
      propertyCount: owner.propertyCount,
      propertyUrl: '',
      ownerUrl: owner.url,
      email: '',
      phone: ''
    });
  }

  log(`✅ Extracted ${leads.length} owners`);
  return leads;
}
/**
 * Main scraper.
 *
 * Launches a browser, logs into Reonomy with the configured credentials,
 * searches for SEARCH_LOCATION, extracts property and owner leads from the
 * resulting page and, when any were found, saves them via saveLeads().
 * When nothing is found a debugging screenshot is taken instead.
 *
 * On any failure the error is logged, a best-effort error screenshot is
 * attempted, and the original error is rethrown. The browser is closed in
 * every case.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   extracted and the configured output path (OUTPUT_FILE). The path is
 *   returned even when no file was written; check leadCount.
 * @throws {Error} Any navigation, login or extraction failure.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Lead Scraper (JSON Fallback Mode)...\n');

  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });

  // Declared out here so the catch block can screenshot it, but created
  // inside the try so that a failure in newPage()/setViewport() still reaches
  // `finally` and the launched browser is not leaked.
  let page = null;

  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Login
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Logging in...');
    await sleep(8000);

    // Still being on a login/auth URL after the wait is treated as a failed login.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);
    log('✅ On search page');

    // Search
    log(`\n📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);
    // Prefer an input whose placeholder mentions address/Search; if none
    // appears in time, fall back to the first text input.
    const searchInput = await page.waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
      timeout: 10000
    }).catch(() => {
      return page.waitForSelector('input[type="text"]', { timeout: 5000 });
    });
    if (searchInput) {
      // Triple-click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      await sleep(5000);
    }

    // Extract leads
    log('\n📍 Step 4: Extracting lead data...');
    const allLeads = [];
    const properties = await extractProperties(page);
    allLeads.push(...properties);
    const owners = await extractOwners(page);
    allLeads.push(...owners);
    log(`\n✅ Total leads extracted: ${allLeads.length}`);

    // Path actually written by saveLeads(), or null when nothing was saved.
    let savedPath = null;
    if (allLeads.length === 0) {
      log('\n⚠ No leads found. Taking screenshot for debugging...');
      await page.screenshot({ path: '/tmp/reonomy-no-leads.png', fullPage: true });
      log('📸 Screenshot saved: /tmp/reonomy-no-leads.png');
    } else {
      // Save to JSON
      log('\n📍 Step 5: Saving leads to JSON file...');
      savedPath = saveLeads(allLeads);
    }

    log('\n✅ Scraping complete!');
    // Only claim the file was written when saveLeads() reported success;
    // it logs its own error message when the write fails.
    if (savedPath) {
      log(`💾 Leads saved to: ${OUTPUT_FILE}`);
    }
    log(`📝 Log file: ${LOG_FILE}`);
    return { leadCount: allLeads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-error.png');
      } catch (screenshotError) {
        // Best effort: report the screenshot failure but keep the original
        // error as the one that propagates.
        log(`⚠ Could not capture error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Run
// Entry point: run the scraper once and exit 0 on success, 1 on failure.
scrapeLeads()
  .then(result => {
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    // scrapeLeads() only writes the output file when at least one lead was
    // extracted, so do not point at a file that may be missing or stale.
    if (result.leadCount > 0) {
      console.log(`\n💾 View your leads at: ${result.outputFile}`);
    } else {
      console.log('\nNo leads were found, so no leads file was written.');
    }
    process.exit(0);
  })
  .catch(error => {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  });