#!/usr/bin/env node

/**
 * Reonomy Scraper v12 - AGENT-BROWSER EDITION (Vercel Labs)
 *
 * Key features:
 * - Uses agent-browser CLI tool (Rust backend, Playwright engine)
 * - State save/load for auth persistence (no repeated login)
 * - Ref-based navigation (AI-friendly, deterministic)
 * - Semantic locators (find by role, text, label, placeholder)
 * - Extracts from BOTH Builder and Lot AND Owner tabs
 * - Uses direct ownership URLs (no property card clicking)
 * - Dual-tab extraction: property details + owner names + emails + phones
 *
 * Usage:
 *   SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v12-agent-browser.js
 *   Or set as environment variable
 */
|
||
|
||
const { spawn } = require('child_process');
const fs = require('fs');
const path = require('path');

// Configuration.
// SECURITY NOTE(review): real-looking credentials are committed here as
// fallbacks — move them to env vars / a secrets store and rotate them.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';

// `let`, not `const`: scrapeLeads() reassigns this when it discovers the live
// search ID from the post-login URL or the saved auth state; with `const`
// those assignments threw a TypeError at runtime.
let SEARCH_ID = process.env.REONOMY_SEARCH_ID || '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';

// Explicit radix; an unset/invalid value yields NaN and falls back to 20.
const MAX_PROPERTIES = parseInt(process.env.MAX_PROPERTIES, 10) || 20;

// NOTE(review): HEADLESS is never referenced elsewhere in this file — confirm
// whether the agent-browser wrapper reads it from the environment itself.
const HEADLESS = process.env.HEADLESS !== 'false';

// Full path to the agent-browser wrapper binary.
const AGENT_BROWSER = '/opt/homebrew/bin/agent-browser';

// Output, log, and auth-state files live next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v12-agent-browser.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v12.log');
const AUTH_STATE_FILE = path.join(__dirname, 'reonomy-auth-state.txt');
|
||
|
||
/**
 * Log a message to stdout and append a timestamped copy to LOG_FILE.
 * @param {string} message - text to log (written verbatim to the console).
 */
function log(message) {
  console.log(message);
  fs.appendFileSync(LOG_FILE, `[${new Date().toISOString()}] ${message}\n`);
}
|
||
|
||
/**
 * Pause asynchronously for the given number of milliseconds.
 * @param {number} ms - delay in milliseconds.
 * @returns {Promise<void>} resolves after the delay elapses.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||
|
||
/**
 * Run one agent-browser CLI command and resolve with its trimmed stdout.
 *
 * @param {string[]} args - CLI arguments passed to the agent-browser binary.
 * @param {string} [description] - human-readable label for the log output.
 * @returns {Promise<string>} trimmed stdout when the process exits with code 0.
 * @throws {Error} rejects on a non-zero exit code or on a spawn failure
 *   (e.g. the binary is missing).
 */
async function execAgentBrowser(args, description = '') {
  log(`🔧 ${description}`);
  log(`   Command: ${[AGENT_BROWSER, ...args].join(' ')}`);

  return new Promise((resolve, reject) => {
    const child = spawn(AGENT_BROWSER, args);

    let stdout = '';
    let stderr = '';

    child.stdout.on('data', data => {
      stdout += data.toString();
    });

    child.stderr.on('data', data => {
      stderr += data.toString();
    });

    // Without this handler a spawn failure (missing binary, bad permissions)
    // emits 'error' instead of 'close' and the promise never settles.
    child.on('error', err => {
      log(`  ❌ Spawn failed: ${err.message}`);
      reject(new Error(`agent-browser could not be spawned: ${err.message}`, { cause: err }));
    });

    child.on('close', code => {
      if (code === 0) {
        log(`  ✅ Success`);
        resolve(stdout.trim());
      } else {
        log(`  ❌ Failed (code ${code})`);
        if (stderr) {
          log(`  Error: ${stderr.trim()}`);
        }
        reject(new Error(`agent-browser failed with code ${code}: ${stderr.trim()}`));
      }
    });
  });
}
|
||
|
||
/**
 * Run an agent-browser command with `--json` appended and parse its output.
 *
 * @param {string[]} args - CLI arguments (without the trailing `--json`).
 * @param {string} [description] - human-readable label for the log output.
 * @returns {Promise<object|null>} parsed JSON, or null when parsing fails.
 */
async function execAgentBrowserJson(args, description = '') {
  const raw = await execAgentBrowser([...args, '--json'], description);
  try {
    return JSON.parse(raw);
  } catch (parseError) {
    log(`  ⚠️ JSON parse error: ${parseError.message}`);
    return null;
  }
}
|
||
|
||
/**
 * Load previously saved auth state from AUTH_STATE_FILE, if present.
 *
 * @returns {Promise<string|null>} the trimmed state string, or null when no
 *   state file exists.
 */
async function loadAuthState() {
  if (!fs.existsSync(AUTH_STATE_FILE)) {
    return null;
  }
  const savedState = fs.readFileSync(AUTH_STATE_FILE, 'utf8');
  log('🔑 Loading saved auth state...');
  log(`   State file: ${AUTH_STATE_FILE}`);
  return savedState.trim();
}
|
||
|
||
/**
 * Persist auth state to AUTH_STATE_FILE so future runs can skip login.
 *
 * @param {string} state - opaque state string to write verbatim.
 */
async function saveAuthState(state) {
  fs.writeFileSync(AUTH_STATE_FILE, state);
  const preview = `${state.substring(0, 100)}...`;
  log('🔑 Saved auth state to file');
  log(`   State file: ${AUTH_STATE_FILE}`);
  log(`   State: ${preview}`);
}
|
||
|
||
/**
 * Capture a screenshot for debugging.
 *
 * Fixes the original, which interpolated the literal text `$(unknown)` into
 * the path and ignored the `filename` argument entirely.
 *
 * @param {string} filename - bare file name (saved under /tmp), or an
 *   absolute path (used as-is — callers pass both forms).
 * @returns {Promise<string>} the path the screenshot was written to.
 */
async function takeScreenshot(filename) {
  const screenshotPath = filename.startsWith('/') ? filename : `/tmp/${filename}`;
  const output = await execAgentBrowser(['screenshot', screenshotPath], 'Taking screenshot');
  // agent-browser reports success with a "Saved" message on stdout.
  if (output.includes('Saved')) {
    log(`  📸 Screenshot saved: ${screenshotPath}`);
  }
  return screenshotPath;
}
|
||
|
||
/**
 * Extract property details from the Builder and Lot tab of the current page.
 *
 * @returns {Promise<{propertyAddress: string, city: string, state: string,
 *   zip: string, squareFootage: string, propertyType: string}>} empty-string
 *   fields when the page snapshot is unavailable or a value is not found.
 */
async function extractBuilderLotData() {
  log('📊 Extracting Builder and Lot data...');

  const emptyResult = {
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    squareFootage: '',
    propertyType: ''
  };

  // execAgentBrowserJson already returns a parsed object; the original code
  // called JSON.parse() on that object a second time, which threw every run.
  const snapshot = await execAgentBrowserJson(['snapshot', '-i'], 'Get interactive elements');

  if (!snapshot || !snapshot.data || !snapshot.data.refs) {
    log('  ⚠️ Could not get snapshot');
    return emptyResult;
  }

  log(`  Found ${Object.keys(snapshot.data.refs).length} interactive elements`);

  let propertyAddress = '';
  let city = '';
  let state = '';
  let zip = '';

  // The first heading matching "123 Main St, City, ST 12345" is the address.
  // Capture groups: [1] street, [2] city, [3] state, [4] zip. (The original
  // shifted these by one, storing the street in `city`, the city in `state`,
  // and the state in `zip`.)
  for (const element of Object.values(snapshot.data.refs)) {
    if (element.role !== 'heading') continue;
    const addressMatch = element.name.match(/^(\d+[^,\n]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
    if (addressMatch) {
      propertyAddress = element.name.trim();
      city = addressMatch[2]?.trim() || '';
      state = addressMatch[3]?.trim() || '';
      zip = addressMatch[4]?.trim() || '';
      log(`  📍 Address: ${element.name}`);
      break;
    }
  }

  // Square footage (e.g. "12.5k SF") only appears in the page body text.
  const bodyTextResult = await execAgentBrowserJson(['eval', 'document.body.innerText'], 'Get body text');
  const bodyText = bodyTextResult?.data?.result || '';

  const sfMatch = bodyText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
  const squareFootage = sfMatch ? sfMatch[0] : '';
  if (squareFootage) {
    log(`  📐 Square Footage: ${squareFootage}`);
  }

  // First known property-type keyword found in the body text wins.
  const typePatterns = [
    'Warehouse', 'Office Building', 'Retail Stores', 'Industrial',
    'General Industrial', 'Medical Building', 'School', 'Religious',
    'Supermarket', 'Financial Building', 'Residential', 'Vacant Land',
    'Tax Exempt', 'Mixed Use'
  ];

  let propertyType = '';
  for (const type of typePatterns) {
    if (bodyText.includes(type)) {
      propertyType = type;
      log(`  🏢 Property Type: ${type}`);
      break;
    }
  }

  return { propertyAddress, city, state, zip, squareFootage, propertyType };
}
|
||
|
||
/**
 * Extract owner names, emails, and phone numbers from the Owner tab.
 *
 * @returns {Promise<{ownerNames: string[], emails: string[],
 *   phones: string[]}>} deduplicated contact data (empty arrays when the
 *   page yields nothing).
 */
async function extractOwnerTabData() {
  log('👤 Extracting Owner tab data...');

  // Accumulator built locally. The original round-tripped an empty object
  // literal through a browser `eval` just to get this shape back, which
  // added a needless network/IPC failure path for a constant value.
  const result = {
    ownerNames: [],
    emails: [],
    phones: []
  };

  const bodyTextResult = await execAgentBrowserJson(['eval', 'document.body.innerText'], 'Get body text');
  const bodyText = bodyTextResult?.data?.result || '';

  // Owner names: scan for "Owner: N properties <Name>" lines in the text.
  for (const line of bodyText.split('\n')) {
    const ownsMatch = line.match(/Owner:\s*(\d+)\s+properties?\s*([A-Z][a-z]+)/i);
    if (ownsMatch && ownsMatch[2]) {
      const owner = ownsMatch[2].trim();
      if (owner && owner.length > 3 && !result.ownerNames.includes(owner)) {
        result.ownerNames.push(owner);
        log(`  👤 Owner: ${owner}`);
      }
    }
  }
  log(`  👤 Owners found: ${result.ownerNames.length}`);

  // Emails, pass 1: mailto: links in the DOM.
  const mailtoResult = await execAgentBrowserJson(['eval', `({
  mailtoLinks: Array.from(document.querySelectorAll('a[href^="mailto:"]')).map(a => a.href.replace('mailto:', ''))
});`], 'Extract mailto links');

  if (mailtoResult?.data?.result?.mailtoLinks) {
    for (const email of mailtoResult.data.result.mailtoLinks) {
      const cleanedEmail = email.trim();
      if (cleanedEmail && cleanedEmail.length > 5 && !result.emails.includes(cleanedEmail)) {
        result.emails.push(cleanedEmail);
      }
    }
    log(`  📧 Emails from mailto links: ${result.emails.length}`);
  }

  // Emails, pass 2: plain-text pattern scan over the body text.
  const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
  const emailMatches = bodyText.match(emailRegex) || [];
  for (const email of emailMatches) {
    if (!result.emails.includes(email)) {
      result.emails.push(email);
    }
  }
  log(`  📧 Emails from text regex: ${emailMatches.length}`);
  log(`  📧 Total emails: ${result.emails.length}`);

  // Phones: selector is pinned to generated MUI/JSS class names (jss1797 …),
  // which is brittle and will break whenever the site rebuilds its styles.
  // TODO(review): replace with a structural or text-pattern based selector.
  const phoneResult = await execAgentBrowserJson(['eval', `({
  phoneTexts: Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2')).map(p => p.textContent.trim()).filter(text => text.length >= 10)
});`], 'Extract phones using CSS selector');

  if (phoneResult?.data?.result?.phoneTexts) {
    for (const phone of phoneResult.data.result.phoneTexts) {
      // Strip whitespace, dashes, and parentheses before deduping.
      const cleanPhone = phone.replace(/[\s\-()]/g, '');
      if (cleanPhone.length >= 10 && !result.phones.includes(cleanPhone)) {
        result.phones.push(cleanPhone);
      }
    }
    log(`  📞 Phones found: ${result.phones.length}`);
  }
  log(`  📞 Total phones: ${result.phones.length}`);

  return result;
}
|
||
|
||
/**
 * Collect property IDs from the links on the current search-results page.
 *
 * @returns {Promise<Array<{id: string, url: string}>>} unique properties
 *   (first link per ID wins); empty array when the snapshot is unavailable.
 */
async function extractPropertyIds() {
  log('📍 Extracting property IDs...');

  const snapshot = await execAgentBrowserJson(['snapshot', '-c'], 'Get property links from search');

  if (!snapshot || !snapshot.data || !snapshot.data.refs) {
    log('  ⚠️ Could not get snapshot');
    return [];
  }

  const propertyIds = [];
  // A result card typically renders several links to the same property
  // (thumbnail, address, …). Dedupe by ID so the MAX_PROPERTIES slice
  // counts distinct properties rather than duplicate links.
  const seenIds = new Set();

  for (const element of Object.values(snapshot.data.refs)) {
    if (element.role !== 'link') continue;
    const match = element.url?.match(/property\/([a-f0-9-]+)/);
    if (match && !seenIds.has(match[1])) {
      seenIds.add(match[1]);
      propertyIds.push({
        id: match[1],
        url: element.url
      });
    }
  }

  log(`  ✅ Found ${propertyIds.length} property IDs`);

  return propertyIds;
}
|
||
|
||
/**
 * Interactive login flow. Opens the login page, detects an existing session,
 * otherwise fills credentials via ref-based semantic locators and submits.
 *
 * @param {string} fallbackSearchId - search ID to keep when login lands on
 *   the home page instead of a search URL.
 * @returns {Promise<string>} the search ID to scrape (updated when the
 *   post-login URL already points at a search).
 * @throws {Error} when the login form cannot be found or login cannot be
 *   confirmed from the post-login URL.
 */
async function ensureLoggedIn(fallbackSearchId) {
  log('\n📍 Step 1: Checking login status...');
  await execAgentBrowser(['open', 'https://app.reonomy.com/#!/login'], 'Open login page');
  await sleep(2000);

  const snapshot = await execAgentBrowserJson(['snapshot', '-i'], 'Check if already logged in');

  // Guard against a null snapshot before touching `.data` (the original
  // dereferenced `snapshot.data` unconditionally and only null-checked later).
  const refs = snapshot?.data?.refs || {};

  // A visible "Search Reonomy" button means a session already exists.
  const isAlreadyLoggedIn = Object.values(refs).some(
    elem => elem.role === 'button' && elem.name === 'Search Reonomy'
  );

  if (isAlreadyLoggedIn) {
    log('✅ Already logged in!');
    return fallbackSearchId;
  }

  log('🔐 Not logged in, proceeding with login flow...');

  if (Object.keys(refs).length === 0) {
    log('  ⚠️ Could not get login form snapshot');
    throw new Error('Login form not found');
  }

  // Locate the email/password textboxes and the submit button by role + name.
  let emailRef = null;
  let passwordRef = null;
  let loginButtonRef = null;

  for (const [ref, element] of Object.entries(refs)) {
    if (element.role === 'textbox') {
      const fieldName = (element.name || element.placeholder || '').toLowerCase();
      if (fieldName.includes('email')) {
        emailRef = ref;
      } else if (fieldName.includes('password')) {
        passwordRef = ref;
      }
    } else if (element.role === 'button' && element.name) {
      const buttonName = element.name.toLowerCase();
      if (buttonName.includes('log in') || buttonName.includes('sign in')) {
        loginButtonRef = ref;
      }
    }
  }

  if (!emailRef || !passwordRef || !loginButtonRef) {
    log('  ⚠️ Could not find login form elements');
    throw new Error('Login form not found');
  }

  log('  📧 Filling email...');
  await execAgentBrowser(['fill', emailRef, REONOMY_EMAIL], 'Fill email');
  await sleep(500);

  log('  🔒 Filling password...');
  await execAgentBrowser(['fill', passwordRef, REONOMY_PASSWORD], 'Fill password');
  await sleep(500);

  log('  🔑 Clicking login button...');
  await execAgentBrowser(['click', loginButtonRef], 'Click login button');
  await sleep(500);

  log('  ⏎ Pressing Enter to submit...');
  await execAgentBrowser(['press', 'Enter'], 'Press Enter');

  log('  ⏳ Waiting for login...');
  await sleep(15000);

  const urlCheck = await execAgentBrowserJson(['eval', 'window.location.href'], 'Check current URL');
  const currentUrl = urlCheck?.data?.result || '';

  if (!currentUrl.includes('#!/search/') && !currentUrl.includes('/!/home')) {
    log('⚠️ Could not confirm login - URL does not match expected pattern');
    throw new Error('Login may have failed');
  }

  log('✅ Successfully logged in!');

  const searchIdMatch = currentUrl.match(/#!\/search\/([a-f0-9-]+)/);
  if (searchIdMatch) {
    // Persist state so subsequent runs can skip the login flow entirely.
    await saveAuthState(currentUrl);
    log('📝 Search ID updated: ' + searchIdMatch[1]);
    return searchIdMatch[1];
  }

  // Login landed on the home page; the caller navigates to the search below.
  log('🏠 Logged in to home page, will navigate to search');
  return fallbackSearchId;
}

/**
 * Main scraper flow:
 *   1. Ensure an authenticated session (saved auth state or interactive login).
 *   2. Open the saved search and collect property IDs.
 *   3. For each property (up to MAX_PROPERTIES), open its ownership page and
 *      extract Builder & Lot details plus Owner contact data.
 *   4. Write all leads to OUTPUT_FILE.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} when login fails, no search ID can be determined, or the
 *   search page yields no properties.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v12 (AGENT-BROWSER EDITION)...\n');

  // Track the search ID locally instead of reassigning the module-level
  // `const SEARCH_ID` — those assignments threw a TypeError in the original.
  let searchId = SEARCH_ID;

  // Step 1: authenticate (saved state short-circuits the login flow).
  const savedState = await loadAuthState();

  if (!savedState) {
    searchId = await ensureLoggedIn(searchId);
  } else {
    log('✅ Found saved auth state! Skipping login flow.');
    log(`   Saved state: ${savedState.substring(0, 100)}...`);

    const searchIdMatch = savedState.match(/#!\/search\/([a-f0-9-]+)/);
    if (!searchIdMatch) {
      log('⚠️ Could not extract search ID from saved state');
      throw new Error('Could not extract search ID from saved auth state');
    }
    searchId = searchIdMatch[1];
    log(`📝 Search ID from saved state: ${searchId}`);
  }

  // Step 2: Navigate to the saved search.
  log('\n📍 Step 2: Navigating to search...');
  const searchUrl = `https://app.reonomy.com/#!/search/${searchId}`;
  await execAgentBrowser(['open', searchUrl], 'Open search URL');
  await sleep(3000);

  // Step 3: Collect property IDs from the results page.
  log('\n📍 Step 3: Extracting property IDs...');
  const propertyIds = await extractPropertyIds();

  if (propertyIds.length === 0) {
    log('  ⚠️ No property IDs found.');
    throw new Error('No properties found on search page.');
  }

  log(`  ✅ Found ${propertyIds.length} property IDs`);

  // Step 4: Visit each ownership page and extract lead data.
  const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
  log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);

  const leads = [];

  for (let i = 0; i < propertiesToScrape.length; i++) {
    const prop = propertiesToScrape[i];
    log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

    // Direct ownership URL avoids clicking through property cards.
    const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;
    log(`  🔗 Navigating to ownership page...`);
    await execAgentBrowser(['open', ownershipUrl], 'Open ownership URL');
    await sleep(5000);

    log('  ⏳ Waiting for Owner tab to load...');
    await sleep(8000);

    log('  📊 Extracting Builder and Lot data...');
    const builderLotData = await extractBuilderLotData();

    await sleep(500);

    log('  👤 Extracting Owner tab data...');
    const ownerData = await extractOwnerTabData();

    const lead = {
      scrapeDate: new Date().toISOString().split('T')[0],
      propertyId: prop.id,
      propertyUrl: ownershipUrl,
      ...builderLotData,
      ...ownerData,
      searchId
    };

    log(`  📧 Emails: ${ownerData.emails.length}`);
    log(`  📞 Phones: ${ownerData.phones.length}`);
    log(`  👤 Owners: ${ownerData.ownerNames.length}`);
    log(`  📍 Address: ${builderLotData.propertyAddress || 'N/A'}`);

    leads.push(lead);

    // Screenshot the first three properties only (debugging aid).
    if (i < 3) {
      await takeScreenshot(`/tmp/reonomy-v12-property-${i + 1}.png`);
    }
  }

  // Step 5: Persist results.
  if (leads.length > 0) {
    log(`\n✅ Total leads scraped: ${leads.length}`);

    const outputData = {
      scrapeDate: new Date().toISOString(),
      searchId,
      leadCount: leads.length,
      leads
    };

    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
    log(`💾 Saved to: ${OUTPUT_FILE}`);
  } else {
    log('\n⚠️ No leads scraped.');
  }

  log('\n✅ Scraping complete!');
  return { leadCount: leads.length, outputFile: OUTPUT_FILE };
}
|
||
|
||
/**
 * Entry point: run the scraper and exit 0 on success, 1 on failure.
 * On failure, a best-effort error screenshot is attempted before exiting.
 */
(async () => {
  try {
    await scrapeLeads();
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Best-effort screenshot of the error state; never let this throw.
    await takeScreenshot('reonomy-v12-error.png')
      .then(() => log('📸 Error screenshot saved: /tmp/reonomy-v12-error.png'))
      .catch(() => log('Could not save error screenshot'));

    process.exit(1);
  }
  process.exit(0);
})();
|