#!/usr/bin/env node

/**
 * Reonomy Lead Scraper v2
 *
 * Improved scraper with better data extraction from dashboard
 * and search results.
 */

const { execSync } = require('child_process');
const fs = require('fs');
const path = require('path');

const puppeteer = require('puppeteer');

// Configuration from environment variables
|
||
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
|
||
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
|
||
const SHEET_ID = process.env.REONOMY_SHEET_ID;
|
||
const SHEET_TITLE = process.env.REONOMY_SHEET_TITLE || 'Reonomy Leads';
|
||
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'New York, NY';
|
||
const HEADLESS = process.env.HEADLESS === 'true';
|
||
|
||
// Validate credentials
|
||
if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
|
||
console.error('❌ Error: REONOMY_EMAIL and REONOMY_PASSWORD environment variables are required.');
|
||
console.error(' Set them like: REONOMY_EMAIL="..." REONOMY_PASSWORD="..." node reonomy-scraper-v2.js');
|
||
process.exit(1);
|
||
}
|
||
|
||
// Log file
|
||
const LOG_FILE = path.join(__dirname, 'reonomy-scraper.log');
|
||
|
||
function log(message) {
|
||
const timestamp = new Date().toISOString();
|
||
const logMessage = `[${timestamp}] ${message}\n`;
|
||
console.log(message);
|
||
fs.appendFileSync(LOG_FILE, logMessage);
|
||
}
|
||
|
||
function sleep(ms) {
|
||
return new Promise(resolve => setTimeout(resolve, ms));
|
||
}
|
||
|
||
/**
|
||
* Execute gog CLI command
|
||
*/
|
||
function gogCommand(command) {
|
||
try {
|
||
let fullCommand = `gog ${command}`;
|
||
const account = process.env.GOG_ACCOUNT;
|
||
if (account) {
|
||
fullCommand = `gog --account "${account}" ${command}`;
|
||
}
|
||
|
||
const output = execSync(fullCommand, {
|
||
encoding: 'utf-8',
|
||
timeout: 30000,
|
||
stdio: ['pipe', 'pipe', 'pipe']
|
||
});
|
||
|
||
const combinedOutput = (output || '').trim();
|
||
return combinedOutput;
|
||
} catch (error) {
|
||
if (error.status !== 0) {
|
||
const stderr = error.stderr ? error.stderr.toString() : '';
|
||
const stdout = error.stdout ? error.stdout.toString() : '';
|
||
|
||
if (stdout && stdout.trim() && !stderr.includes('error') && !stderr.includes('Error')) {
|
||
return stdout.trim();
|
||
}
|
||
|
||
if (stderr.includes('error') || stderr.includes('Error')) {
|
||
throw new Error(`gog command failed: ${stderr}`);
|
||
}
|
||
throw new Error(`gog command failed: ${stderr || stdout || 'Unknown error'}`);
|
||
}
|
||
throw error;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Get or create Google Sheet
|
||
*/
|
||
async function getOrCreateSheet() {
|
||
log('📊 Checking Google Sheets...');
|
||
|
||
if (SHEET_ID) {
|
||
log(`✅ Using existing sheet: ${SHEET_ID}`);
|
||
return SHEET_ID;
|
||
}
|
||
|
||
try {
|
||
log('📝 Creating new Google Sheet...');
|
||
const output = gogCommand(`sheets create "${SHEET_TITLE}" --json`);
|
||
|
||
try {
|
||
const result = JSON.parse(output);
|
||
const newSheetId = result.spreadsheetId || result.id;
|
||
log(`✅ Created new sheet: ${newSheetId}`);
|
||
return newSheetId;
|
||
} catch (error) {
|
||
const match = output.match(/([0-9A-Za-z_-]{20,})/);
|
||
if (match) {
|
||
log(`✅ Created new sheet: ${match[1]}`);
|
||
return match[1];
|
||
}
|
||
throw new Error('Could not parse sheet ID from gog output');
|
||
}
|
||
} catch (error) {
|
||
log(`⚠️ Could not create Google Sheet: ${error.message}`);
|
||
log('💾 Leads will be saved to JSON file instead');
|
||
return null;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Initialize sheet with headers
|
||
*/
|
||
async function initializeSheet(sheetId) {
|
||
log('📋 Initializing sheet headers...');
|
||
|
||
const headers = [
|
||
'Scrape Date',
|
||
'Owner Name',
|
||
'Property Address',
|
||
'City',
|
||
'State',
|
||
'ZIP',
|
||
'Property Type',
|
||
'Square Footage',
|
||
'Owner Location',
|
||
'Property Count',
|
||
'Property URL',
|
||
'Owner URL',
|
||
'Email',
|
||
'Phone'
|
||
];
|
||
|
||
const headerString = headers.map(h => `"${h}"`).join(' ');
|
||
|
||
try {
|
||
gogCommand(`sheets update ${sheetId} "Sheet1!A1" ${headerString}`);
|
||
log('✅ Sheet headers initialized');
|
||
} catch (error) {
|
||
log(`⚠️ Could not set headers: ${error.message}`);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Append row to Google Sheet or save to JSON file
|
||
*/
|
||
async function appendToSheet(sheetId, rowData) {
|
||
if (sheetId) {
|
||
const values = Object.values(rowData).map(v => {
|
||
if (v === null || v === undefined) return '';
|
||
const str = String(v).replace(/"/g, '""');
|
||
return `"${str}"`;
|
||
}).join(' ');
|
||
|
||
try {
|
||
gogCommand(`sheets append ${sheetId} "Sheet1!A:N" ${values}`);
|
||
log(`✅ Added: ${rowData.ownerName || 'N/A'} - ${rowData.propertyAddress}`);
|
||
} catch (error) {
|
||
log(`❌ Error appending to sheet: ${error.message}`);
|
||
}
|
||
} else {
|
||
jsonLeads.push(rowData);
|
||
log(`✅ Collected: ${rowData.ownerName || 'N/A'} - ${rowData.propertyAddress}`);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Save leads to JSON file
|
||
*/
|
||
function saveToJsonFile(leads) {
|
||
const filename = path.join(__dirname, 'reonomy-leads.json');
|
||
const data = {
|
||
scrapeDate: new Date().toISOString(),
|
||
leadCount: leads.length,
|
||
location: SEARCH_LOCATION,
|
||
leads: leads
|
||
};
|
||
|
||
try {
|
||
fs.writeFileSync(filename, JSON.stringify(data, null, 2));
|
||
log(`💾 Saved ${leads.length} leads to ${filename}`);
|
||
return filename;
|
||
} catch (error) {
|
||
log(`❌ Error saving to JSON: ${error.message}`);
|
||
return null;
|
||
}
|
||
}
|
||
|
||
let jsonLeads = [];
|
||
|
||
/**
|
||
* Extract property addresses and details from dashboard
|
||
*/
|
||
async function extractPropertiesFromDashboard(page) {
|
||
log('🔍 Extracting property data from dashboard...');
|
||
|
||
const properties = await page.evaluate(() => {
|
||
const results = [];
|
||
|
||
// Find all property links
|
||
const propertyLinks = Array.from(document.querySelectorAll('a[href*="/property/"]'));
|
||
|
||
propertyLinks.forEach(link => {
|
||
const text = (link.innerText || link.textContent || '').trim();
|
||
|
||
// Look for address patterns (starts with number, has comma)
|
||
const addressMatch = text.match(/^(\d+.+),\s*([A-Za-z\s]+),\s*([A-Z]{2})\s*(\d{5})/);
|
||
|
||
if (addressMatch) {
|
||
results.push({
|
||
fullText: text,
|
||
address: addressMatch[1].trim(),
|
||
city: addressMatch[2].trim(),
|
||
state: addressMatch[3].trim(),
|
||
zip: addressMatch[4].trim(),
|
||
url: link.href,
|
||
remainingText: text.substring(addressMatch[0].length).trim()
|
||
});
|
||
}
|
||
});
|
||
|
||
return results;
|
||
});
|
||
|
||
const scrapeDate = new Date().toISOString().split('T')[0];
|
||
const leads = [];
|
||
|
||
for (const prop of properties) {
|
||
// Extract property type and square footage from remaining text
|
||
const sqFtMatch = prop.remainingText.match(/(\d+\.?\d*)\s*k?\s*SF/i);
|
||
const sqFt = sqFtMatch ? sqFtMatch[0] : '';
|
||
const propertyType = prop.remainingText.replace(sqFt, '').trim() || '';
|
||
|
||
const lead = {
|
||
scrapeDate,
|
||
ownerName: '',
|
||
propertyAddress: prop.address,
|
||
city: prop.city,
|
||
state: prop.state,
|
||
zip: prop.zip,
|
||
propertyType,
|
||
squareFootage: sqFt,
|
||
ownerLocation: '',
|
||
propertyCount: '',
|
||
propertyUrl: prop.url,
|
||
ownerUrl: '',
|
||
email: '',
|
||
phone: ''
|
||
};
|
||
|
||
leads.push(lead);
|
||
}
|
||
|
||
log(`✅ Extracted ${leads.length} properties`);
|
||
return leads;
|
||
}
|
||
|
||
/**
|
||
* Extract owner data from dashboard
|
||
*/
|
||
async function extractOwnersFromDashboard(page) {
|
||
log('🔍 Extracting owner data from dashboard...');
|
||
|
||
const owners = await page.evaluate(() => {
|
||
const results = [];
|
||
|
||
const ownerLinks = Array.from(document.querySelectorAll('a[href*="/person/"]'));
|
||
|
||
ownerLinks.forEach(link => {
|
||
const text = (link.innerText || link.textContent || '').trim();
|
||
|
||
// Pattern: Owner name\nOwns X properties Location
|
||
const lines = text.split('\n').map(l => l.trim()).filter(l => l);
|
||
|
||
if (lines.length >= 2) {
|
||
const ownerName = lines[0];
|
||
const location = lines.find(l => l.includes(',')) || '';
|
||
const propertyCountMatch = text.match(/(\d+)\s*propert/i);
|
||
const propertyCount = propertyCountMatch ? propertyCountMatch[1] : '';
|
||
|
||
results.push({
|
||
ownerName,
|
||
location,
|
||
propertyCount,
|
||
url: link.href,
|
||
fullText: text
|
||
});
|
||
}
|
||
});
|
||
|
||
return results;
|
||
});
|
||
|
||
const scrapeDate = new Date().toISOString().split('T')[0];
|
||
const leads = [];
|
||
|
||
for (const owner of owners) {
|
||
// Parse location more carefully - extract city and state
|
||
// Format is: "Owns X properties City, State" or just "City, State"
|
||
let city = '';
|
||
let state = '';
|
||
let ownerLocation = owner.location;
|
||
|
||
if (ownerLocation.includes(',')) {
|
||
const parts = ownerLocation.split(',').map(p => p.trim());
|
||
|
||
// If the last part is a state (2 uppercase letters), use it
|
||
if (parts.length >= 2 && /^[A-Z]{2}$/.test(parts[parts.length - 1])) {
|
||
state = parts[parts.length - 1];
|
||
// The city is the second-to-last part, but we need to remove "Owns X properties" prefix
|
||
const cityWithPrefix = parts[parts.length - 2];
|
||
const cityMatch = cityWithPrefix.match(/(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)$/);
|
||
city = cityMatch ? cityMatch[1] : '';
|
||
} else if (parts.length === 2) {
|
||
city = parts[0];
|
||
state = parts[1];
|
||
}
|
||
}
|
||
|
||
const lead = {
|
||
scrapeDate,
|
||
ownerName: owner.ownerName,
|
||
propertyAddress: '',
|
||
city,
|
||
state,
|
||
zip: '',
|
||
propertyType: '',
|
||
squareFootage: '',
|
||
ownerLocation: owner.location,
|
||
propertyCount: owner.propertyCount,
|
||
propertyUrl: '',
|
||
ownerUrl: owner.url,
|
||
email: '',
|
||
phone: ''
|
||
};
|
||
|
||
leads.push(lead);
|
||
}
|
||
|
||
log(`✅ Extracted ${leads.length} owners`);
|
||
return leads;
|
||
}
|
||
|
||
/**
|
||
* Main scraper function
|
||
*/
|
||
async function scrapeLeads() {
|
||
log('🚀 Starting Reonomy Lead Scraper v2...\n');
|
||
|
||
const browser = await puppeteer.launch({
|
||
headless: HEADLESS ? 'new' : false,
|
||
args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
|
||
});
|
||
|
||
const page = await browser.newPage();
|
||
await page.setViewport({ width: 1920, height: 1080 });
|
||
|
||
let sheetId;
|
||
|
||
try {
|
||
// Setup Google Sheet
|
||
sheetId = await getOrCreateSheet();
|
||
|
||
if (sheetId) {
|
||
try {
|
||
const existingData = gogCommand(`sheets get ${sheetId} "Sheet1!A1:N1" --plain`);
|
||
if (!existingData.includes('Owner Name')) {
|
||
await initializeSheet(sheetId);
|
||
}
|
||
} catch (error) {
|
||
await initializeSheet(sheetId);
|
||
}
|
||
} else {
|
||
log('💾 Will save leads to: reonomy-leads.json');
|
||
}
|
||
|
||
// Login to Reonomy
|
||
log('\n📍 Step 1: Logging into Reonomy...');
|
||
await page.goto('https://app.reonomy.com/#!/account', {
|
||
waitUntil: 'domcontentloaded',
|
||
timeout: 60000
|
||
});
|
||
|
||
await sleep(2000);
|
||
|
||
await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
|
||
await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
|
||
|
||
await page.click('button[type="submit"]');
|
||
log('⏳ Logging in...');
|
||
|
||
await sleep(8000);
|
||
|
||
const url = page.url();
|
||
if (url.includes('login') || url.includes('auth')) {
|
||
throw new Error('Login failed. Please check credentials.');
|
||
}
|
||
|
||
log('✅ Successfully logged in!');
|
||
|
||
// Navigate to home/dashboard to extract recent data
|
||
log('\n📍 Step 2: Navigating to dashboard...');
|
||
await page.goto('https://app.reonomy.com/#!/home', {
|
||
waitUntil: 'networkidle2',
|
||
timeout: 60000
|
||
});
|
||
|
||
await sleep(3000);
|
||
log('✅ On dashboard');
|
||
|
||
// Extract leads
|
||
log('\n📍 Step 3: Extracting lead data...');
|
||
const allLeads = [];
|
||
|
||
// Extract properties
|
||
const properties = await extractPropertiesFromDashboard(page);
|
||
allLeads.push(...properties);
|
||
|
||
// Extract owners
|
||
const owners = await extractOwnersFromDashboard(page);
|
||
allLeads.push(...owners);
|
||
|
||
log(`\n✅ Total leads extracted: ${allLeads.length}`);
|
||
|
||
if (allLeads.length === 0) {
|
||
log('\n⚠️ No leads found. Taking screenshot for debugging...');
|
||
await page.screenshot({ path: '/tmp/reonomy-no-leads.png', fullPage: true });
|
||
log('📸 Screenshot saved: /tmp/reonomy-no-leads.png');
|
||
} else {
|
||
// Save leads
|
||
log('\n📍 Step 4: Saving leads...');
|
||
|
||
for (const lead of allLeads) {
|
||
await appendToSheet(sheetId, lead);
|
||
await sleep(500);
|
||
}
|
||
|
||
if (!sheetId && jsonLeads.length > 0) {
|
||
saveToJsonFile(jsonLeads);
|
||
}
|
||
}
|
||
|
||
log('\n✅ Scraping complete!');
|
||
if (sheetId) {
|
||
log(`📊 Google Sheet: https://docs.google.com/spreadsheets/d/${sheetId}`);
|
||
} else {
|
||
log('💾 Leads saved to: reonomy-leads.json');
|
||
}
|
||
log(`📝 Log file: ${LOG_FILE}`);
|
||
|
||
return { sheetId, leadCount: allLeads.length };
|
||
|
||
} catch (error) {
|
||
log(`\n❌ Error: ${error.message}`);
|
||
log(error.stack);
|
||
|
||
try {
|
||
await page.screenshot({ path: '/tmp/reonomy-error.png', fullPage: true });
|
||
log('📸 Error screenshot saved: /tmp/reonomy-error.png');
|
||
} catch (e) {
|
||
// Ignore screenshot errors
|
||
}
|
||
|
||
throw error;
|
||
|
||
} finally {
|
||
await browser.close();
|
||
log('\n🔚 Browser closed');
|
||
}
|
||
}
|
||
|
||
// Run scraper
|
||
scrapeLeads()
|
||
.then(result => {
|
||
log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
|
||
if (result.sheetId) {
|
||
console.log(`\n📊 View your leads at: https://docs.google.com/spreadsheets/d/${result.sheetId}`);
|
||
}
|
||
process.exit(0);
|
||
})
|
||
.catch(error => {
|
||
log(`\n💥 Scraper failed: ${error.message}`);
|
||
process.exit(1);
|
||
});
|