clawdbot-workspace/reonomy-scraper-v2.js

490 lines
13 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Lead Scraper v2
*
* Improved scraper with better data extraction from dashboard
* and search results.
*/
const puppeteer = require('puppeteer');
const { execSync } = require('child_process');
const fs = require('fs');
const path = require('path');
// Runtime configuration, read once from environment variables.
const {
  REONOMY_EMAIL,
  REONOMY_PASSWORD,
  REONOMY_SHEET_ID: SHEET_ID,
} = process.env;
// `||` (not a destructuring default) so an empty string also falls back.
const SHEET_TITLE = process.env.REONOMY_SHEET_TITLE || 'Reonomy Leads';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'New York, NY';
// Headless only when HEADLESS is exactly "true"; any other value shows the browser.
const HEADLESS = process.env.HEADLESS === 'true';

// Both credentials are mandatory: stop with exit code 1 if either is missing.
if (!(REONOMY_EMAIL && REONOMY_PASSWORD)) {
  [
    '❌ Error: REONOMY_EMAIL and REONOMY_PASSWORD environment variables are required.',
    ' Set them like: REONOMY_EMAIL="..." REONOMY_PASSWORD="..." node reonomy-scraper-v2.js',
  ].forEach((line) => console.error(line));
  process.exit(1);
}
// The log file is created next to this script.
const LOG_FILE = path.join(__dirname, 'reonomy-scraper.log');

/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * @param {string} message - Text to record.
 */
function log(message) {
  const stamp = new Date().toISOString();
  console.log(message);
  // Synchronous append keeps log lines in call order.
  fs.appendFileSync(LOG_FILE, `[${stamp}] ${message}\n`);
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Execute a gog CLI command through the shell and return its trimmed stdout.
 *
 * When the GOG_ACCOUNT environment variable is set, it is passed to gog as
 * `--account <value>`.
 *
 * Failure handling:
 * - If gog exits non-zero but printed something on stdout and stderr does not
 *   mention "error" (any case), stdout is still returned (best-effort, as
 *   some commands exit non-zero while producing usable output).
 * - If the process never exited on its own (timeout, spawn failure, killed by
 *   a signal), an Error is always thrown; partial stdout is not trusted.
 *
 * @param {string} command - Arguments to append after `gog`. The string is
 *   handed to the shell as-is, so callers must quote their own arguments.
 * @returns {string} Trimmed stdout of the command.
 * @throws {Error} "gog command failed: ..." with the original error as `cause`.
 */
function gogCommand(command) {
  const TIMEOUT_MS = 30000;
  const account = process.env.GOG_ACCOUNT;
  let accountFlag = '';
  if (account) {
    // POSIX shells treat everything inside single quotes literally, so the
    // account value cannot expand variables or break out of its argument.
    // cmd.exe does not understand single quotes, so Windows keeps the
    // original double-quote form.
    const quotedAccount = process.platform === 'win32'
      ? `"${account}"`
      : `'${account.replace(/'/g, "'\\''")}'`;
    accountFlag = `--account ${quotedAccount} `;
  }
  const fullCommand = `gog ${accountFlag}${command}`;
  try {
    const output = execSync(fullCommand, {
      encoding: 'utf-8',
      timeout: TIMEOUT_MS,
      stdio: ['pipe', 'pipe', 'pipe']
    });
    return (output || '').trim();
  } catch (error) {
    const stderr = error.stderr ? error.stderr.toString() : '';
    const stdout = error.stdout ? error.stdout.toString() : '';
    // A numeric status means the process ran to completion and exited;
    // otherwise it timed out, was killed, or could not be started.
    if (typeof error.status !== 'number') {
      const reason = error.code === 'ETIMEDOUT'
        ? `timed out after ${TIMEOUT_MS}ms`
        : (error.message || 'Unknown error');
      throw new Error(`gog command failed: ${reason}`, { cause: error });
    }
    const usableStdout = stdout.trim();
    if (usableStdout && !/error/i.test(stderr)) {
      return usableStdout;
    }
    throw new Error(`gog command failed: ${stderr || stdout || 'Unknown error'}`, { cause: error });
  }
}
/**
 * Get or create the Google Sheet that leads are written to.
 *
 * Uses SHEET_ID when it is set. Otherwise asks gog to create a sheet titled
 * SHEET_TITLE and extracts the new sheet's id from the output: first from the
 * JSON fields `spreadsheetId` / `id`, then, if that yields nothing, from the
 * first run of 20+ id-like characters anywhere in the output.
 *
 * @returns {Promise<string|null>} The sheet id, or null when no sheet could be
 *   created or no id could be found (callers then fall back to JSON output).
 */
async function getOrCreateSheet() {
  log('📊 Checking Google Sheets...');
  if (SHEET_ID) {
    log(`✅ Using existing sheet: ${SHEET_ID}`);
    return SHEET_ID;
  }
  try {
    log('📝 Creating new Google Sheet...');
    const output = gogCommand(`sheets create "${SHEET_TITLE}" --json`);
    let newSheetId = '';
    try {
      const result = JSON.parse(output);
      newSheetId = (result && (result.spreadsheetId || result.id)) || '';
    } catch (parseError) {
      // Output was not JSON; the pattern search below handles plain text.
    }
    if (!newSheetId) {
      // Also reached when the JSON parsed but carried no id field, so that
      // case no longer reports "Created new sheet: undefined".
      const match = output.match(/([0-9A-Za-z_-]{20,})/);
      newSheetId = match ? match[1] : '';
    }
    if (!newSheetId) {
      throw new Error('Could not parse sheet ID from gog output');
    }
    log(`✅ Created new sheet: ${newSheetId}`);
    return newSheetId;
  } catch (error) {
    log(`⚠️ Could not create Google Sheet: ${error.message}`);
    log('💾 Leads will be saved to JSON file instead');
    return null;
  }
}
/**
 * Write the header row into cell range Sheet1!A1 of the given sheet.
 *
 * A failure is logged as a warning and not rethrown.
 *
 * @param {string} sheetId - Id of the spreadsheet to update.
 * @returns {Promise<void>}
 */
async function initializeSheet(sheetId) {
  log('📋 Initializing sheet headers...');
  // One title per column, in the same order as the keys of each lead object.
  const columnTitles = [
    'Scrape Date', 'Owner Name', 'Property Address', 'City', 'State', 'ZIP',
    'Property Type', 'Square Footage', 'Owner Location', 'Property Count',
    'Property URL', 'Owner URL', 'Email', 'Phone',
  ];
  // Each title becomes its own double-quoted shell argument.
  const quotedTitles = `"${columnTitles.join('" "')}"`;
  try {
    gogCommand(`sheets update ${sheetId} "Sheet1!A1" ${quotedTitles}`);
  } catch (error) {
    log(`⚠️ Could not set headers: ${error.message}`);
    return;
  }
  log('✅ Sheet headers initialized');
}
/**
 * Append one lead as a row to the Google Sheet, or collect it for JSON output.
 *
 * With a sheet id, every value of `rowData` (in key order) becomes one shell
 * argument of `gog sheets append`. Without one, the lead is pushed onto
 * `jsonLeads` instead. Append failures are logged and not rethrown, so one bad
 * row does not abort the run.
 *
 * @param {string|null} sheetId - Target spreadsheet id, or a falsy value to
 *   collect the lead in memory.
 * @param {Object} rowData - Lead record; its values are written in key order.
 * @returns {Promise<void>}
 */
async function appendToSheet(sheetId, rowData) {
  const label = `${rowData.ownerName || 'N/A'} - ${rowData.propertyAddress}`;
  if (!sheetId) {
    jsonLeads.push(rowData);
    log(`✅ Collected: ${label}`);
    return;
  }
  // Values come from scraped page text, so they must reach gog verbatim and
  // must never be interpreted by the shell. null/undefined still produce an
  // (empty) argument so later columns are not shifted left.
  const quoteArg = (value) => {
    const text = value === null || value === undefined ? '' : String(value);
    if (process.platform === 'win32') {
      // cmd.exe has no single-quote quoting; keep the doubled-quote form.
      return `"${text.replace(/"/g, '""')}"`;
    }
    // POSIX: single quotes are fully literal; an embedded ' is written '\''.
    return `'${text.replace(/'/g, "'\\''")}'`;
  };
  const values = Object.values(rowData).map(quoteArg).join(' ');
  try {
    gogCommand(`sheets append ${sheetId} "Sheet1!A:N" ${values}`);
    log(`✅ Added: ${label}`);
  } catch (error) {
    log(`❌ Error appending to sheet: ${error.message}`);
  }
}
/**
 * Write the collected leads to `reonomy-leads.json` next to this script.
 *
 * The file holds the scrape timestamp, the lead count, the configured search
 * location and the leads themselves, pretty-printed with 2-space indentation.
 *
 * @param {Array<Object>} leads - Lead records to persist.
 * @returns {string|null} Path of the written file, or null if serializing or
 *   writing failed (the failure is logged).
 */
function saveToJsonFile(leads) {
  const targetPath = path.join(__dirname, 'reonomy-leads.json');
  const payload = {
    scrapeDate: new Date().toISOString(),
    leadCount: leads.length,
    location: SEARCH_LOCATION,
    leads,
  };
  let savedPath = null;
  try {
    const serialized = JSON.stringify(payload, null, 2);
    fs.writeFileSync(targetPath, serialized);
    log(`💾 Saved ${leads.length} leads to ${targetPath}`);
    savedPath = targetPath;
  } catch (error) {
    log(`❌ Error saving to JSON: ${error.message}`);
  }
  return savedPath;
}

// Leads collected in memory when no Google Sheet is available.
let jsonLeads = [];
/**
 * Extract property addresses and details from the dashboard.
 *
 * Scans every link whose href contains "/property/" and keeps those whose
 * text starts with "<number> <street>, <city>, <ST> <zip>". Any text after
 * the address is split into a square-footage figure and a property type.
 *
 * @param {Object} page - Puppeteer page positioned on the dashboard; only
 *   `page.evaluate` is used.
 * @returns {Promise<Array<Object>>} One lead per matching link. Owner-related
 *   fields are empty strings. The key order of each lead matches the sheet
 *   columns, because rows are later written via `Object.values`.
 */
async function extractPropertiesFromDashboard(page) {
  log('🔍 Extracting property data from dashboard...');
  // This callback runs inside the browser page, so it must be self-contained
  // and cannot reference anything from the surrounding Node scope.
  const properties = await page.evaluate(() => {
    // City may contain periods, apostrophes and hyphens ("St. Louis",
    // "Winston-Salem"); an optional ZIP+4 suffix is consumed so it does not
    // leak into the trailing text.
    const addressPattern = /^(\d+.+),\s*([A-Za-z\s.'-]+),\s*([A-Z]{2})\s*(\d{5})(?:-\d{4})?/;
    const results = [];
    const propertyLinks = Array.from(document.querySelectorAll('a[href*="/property/"]'));
    for (const link of propertyLinks) {
      const text = (link.innerText || link.textContent || '').trim();
      const addressMatch = text.match(addressPattern);
      if (!addressMatch) {
        continue;
      }
      results.push({
        fullText: text,
        address: addressMatch[1].trim(),
        city: addressMatch[2].trim(),
        state: addressMatch[3].trim(),
        zip: addressMatch[4].trim(),
        url: link.href,
        remainingText: text.substring(addressMatch[0].length).trim()
      });
    }
    return results;
  });
  const scrapeDate = new Date().toISOString().split('T')[0];
  const leads = properties.map((prop) => {
    // Square footage such as "25k SF", "2.5k SF" or "12,500 SF"; the comma
    // support keeps thousands separators from truncating the number.
    const sqFtMatch = prop.remainingText.match(/(\d[\d,]*\.?\d*)\s*k?\s*SF/i);
    const sqFt = sqFtMatch ? sqFtMatch[0] : '';
    // Whatever is left is treated as the property type; collapse newlines
    // and repeated spaces so it fits in a single cell.
    const propertyType = prop.remainingText.replace(sqFt, '').replace(/\s+/g, ' ').trim();
    return {
      scrapeDate,
      ownerName: '',
      propertyAddress: prop.address,
      city: prop.city,
      state: prop.state,
      zip: prop.zip,
      propertyType,
      squareFootage: sqFt,
      ownerLocation: '',
      propertyCount: '',
      propertyUrl: prop.url,
      ownerUrl: '',
      email: '',
      phone: ''
    };
  });
  log(`✅ Extracted ${leads.length} properties`);
  return leads;
}
/**
 * Extract owner data from the dashboard.
 *
 * Scans every link whose href contains "/person/". The link text is expected
 * to look like "Owner name\nOwns X properties City, ST": the first non-empty
 * line is the owner name, and the first later line containing a comma is
 * taken as the location.
 *
 * @param {Object} page - Puppeteer page positioned on the dashboard; only
 *   `page.evaluate` is used.
 * @returns {Promise<Array<Object>>} One lead per owner link with at least two
 *   non-empty lines of text. Property-related fields are empty strings. The
 *   key order of each lead matches the sheet columns.
 */
async function extractOwnersFromDashboard(page) {
  log('🔍 Extracting owner data from dashboard...');
  // This callback runs inside the browser page, so it must be self-contained.
  const owners = await page.evaluate(() => {
    const results = [];
    const ownerLinks = Array.from(document.querySelectorAll('a[href*="/person/"]'));
    for (const link of ownerLinks) {
      const text = (link.innerText || link.textContent || '').trim();
      const lines = text.split('\n').map(l => l.trim()).filter(l => l);
      if (lines.length < 2) {
        continue;
      }
      const ownerName = lines[0];
      // Skip the name line: names such as "ABC Holdings, LLC" contain a
      // comma and would otherwise be mistaken for the location.
      const location = lines.slice(1).find(l => l.includes(',')) || '';
      const propertyCountMatch = text.match(/(\d+)\s*propert/i);
      const propertyCount = propertyCountMatch ? propertyCountMatch[1] : '';
      results.push({
        ownerName,
        location,
        propertyCount,
        url: link.href,
        fullText: text
      });
    }
    return results;
  });
  const scrapeDate = new Date().toISOString().split('T')[0];
  const leads = [];
  for (const owner of owners) {
    // Location is either "Owns X properties City, ST" or just "City, ST".
    let city = '';
    let state = '';
    if (owner.location.includes(',')) {
      const parts = owner.location.split(',').map(p => p.trim());
      const lastPart = parts[parts.length - 1];
      if (parts.length >= 2 && /^[A-Z]{2}$/.test(lastPart)) {
        state = lastPart;
        // Keep only the trailing run of capitalized words, which drops an
        // "Owns X properties" prefix in front of the city name.
        const cityWithPrefix = parts[parts.length - 2];
        const cityMatch = cityWithPrefix.match(/(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)$/);
        city = cityMatch ? cityMatch[1] : '';
      } else if (parts.length === 2) {
        // No two-letter state code: take both halves as they are.
        city = parts[0];
        state = parts[1];
      }
    }
    leads.push({
      scrapeDate,
      ownerName: owner.ownerName,
      propertyAddress: '',
      city,
      state,
      zip: '',
      propertyType: '',
      squareFootage: '',
      ownerLocation: owner.location,
      propertyCount: owner.propertyCount,
      propertyUrl: '',
      ownerUrl: owner.url,
      email: '',
      phone: ''
    });
  }
  log(`✅ Extracted ${leads.length} owners`);
  return leads;
}
/**
 * Main scraper function.
 *
 * Steps: launch the browser, prepare the Google Sheet (or fall back to JSON
 * output), log into Reonomy, open the dashboard, extract property and owner
 * leads, and save them.
 *
 * The browser is closed on every path once it has been launched, including
 * when opening the page itself fails.
 *
 * @returns {Promise<{sheetId: (string|null|undefined), leadCount: number}>}
 *   The sheet id used (falsy when leads went to JSON) and the number of leads
 *   extracted.
 * @throws {Error} Any failure during the run, after it has been logged and an
 *   error screenshot has been attempted.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Lead Scraper v2...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  // Declared outside the try so the catch block can tell whether a page
  // exists to take a screenshot of.
  let page;
  let sheetId;
  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    // Setup Google Sheet
    sheetId = await getOrCreateSheet();
    if (sheetId) {
      try {
        const existingData = gogCommand(`sheets get ${sheetId} "Sheet1!A1:N1" --plain`);
        if (!existingData.includes('Owner Name')) {
          await initializeSheet(sheetId);
        }
      } catch (error) {
        // The header row could not be read (for example on an empty sheet):
        // write the headers anyway.
        await initializeSheet(sheetId);
      }
    } else {
      log('💾 Will save leads to: reonomy-leads.json');
    }
    // Login to Reonomy
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Logging in...');
    await sleep(8000);
    // Still being on a login/auth URL after the wait is treated as a failed login.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');
    // Navigate to home/dashboard to extract recent data
    log('\n📍 Step 2: Navigating to dashboard...');
    await page.goto('https://app.reonomy.com/#!/home', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);
    log('✅ On dashboard');
    // Extract leads
    log('\n📍 Step 3: Extracting lead data...');
    const allLeads = [];
    const properties = await extractPropertiesFromDashboard(page);
    allLeads.push(...properties);
    const owners = await extractOwnersFromDashboard(page);
    allLeads.push(...owners);
    log(`\n✅ Total leads extracted: ${allLeads.length}`);
    // Path of the JSON file actually written, if any.
    let savedJsonFile = null;
    if (allLeads.length === 0) {
      log('\n⚠ No leads found. Taking screenshot for debugging...');
      await page.screenshot({ path: '/tmp/reonomy-no-leads.png', fullPage: true });
      log('📸 Screenshot saved: /tmp/reonomy-no-leads.png');
    } else {
      // Save leads
      log('\n📍 Step 4: Saving leads...');
      for (const lead of allLeads) {
        await appendToSheet(sheetId, lead);
        await sleep(500);
      }
      if (!sheetId && jsonLeads.length > 0) {
        savedJsonFile = saveToJsonFile(jsonLeads);
      }
    }
    log('\n✅ Scraping complete!');
    if (sheetId) {
      log(`📊 Google Sheet: https://docs.google.com/spreadsheets/d/${sheetId}`);
    } else if (savedJsonFile) {
      log('💾 Leads saved to: reonomy-leads.json');
    } else {
      // Nothing was extracted, or the write failed (already logged).
      log('💾 No leads were written to reonomy-leads.json');
    }
    log(`📝 Log file: ${LOG_FILE}`);
    return { sheetId, leadCount: allLeads.length };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-error.png');
      } catch (screenshotError) {
        // Best effort only: never let a screenshot failure hide the real error.
        log(`📸 Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Entry point: run the scraper once and turn the outcome into an exit code.

// Success: report the lead count (and the sheet link, if any), exit 0.
const reportSuccess = (result) => {
  log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
  if (result.sheetId) {
    console.log(`\n📊 View your leads at: https://docs.google.com/spreadsheets/d/${result.sheetId}`);
  }
  process.exit(0);
};

// Failure (from the scraper or from the success handler): log it, exit 1.
const reportFailure = (error) => {
  log(`\n💥 Scraper failed: ${error.message}`);
  process.exit(1);
};

scrapeLeads().then(reportSuccess).catch(reportFailure);