clawdbot-workspace/reonomy-scraper-v12-agent-browser.js

590 lines
18 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v12 - AGENT-BROWSER EDITION (Vercel Labs)
*
* Key features:
* - Uses agent-browser CLI tool (Rust backend, Playwright engine)
* - State save/load for auth persistence (no repeated login)
* - Ref-based navigation (AI-friendly, deterministic)
* - Semantic locators (find by role, text, label, placeholder)
* - Extracts from BOTH Builder and Lot AND Owner tabs
* - Uses direct ownership URLs (no property card clicking)
* - Dual-tab extraction: property details + owner names + emails + phones
*
* Usage:
* REONOMY_SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v12-agent-browser.js
* Or export REONOMY_SEARCH_ID as an environment variable before running the script
*/
const { spawn } = require('child_process');
const fs = require('fs');
const path = require('path');
// Configuration. Every value below can be overridden through the environment.
//
// SECURITY(review): account credentials are committed here as fallback
// defaults. They are kept so existing invocations keep working, but they
// should be rotated and supplied only via REONOMY_EMAIL / REONOMY_PASSWORD.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
// The un-prefixed SEARCH_ID is accepted as an alias of REONOMY_SEARCH_ID (the
// prefixed name wins). Declared with `let` so that code adopting a search ID
// discovered at runtime can assign to this binding without a TypeError.
let SEARCH_ID =
  process.env.REONOMY_SEARCH_ID ||
  process.env.SEARCH_ID ||
  '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';
// Only a positive integer is a usable limit; anything else (unset, garbage,
// zero, negative) falls back to 20.
const parsedMaxProperties = Number.parseInt(process.env.MAX_PROPERTIES, 10);
const MAX_PROPERTIES = parsedMaxProperties > 0 ? parsedMaxProperties : 20;
// Headless unless HEADLESS is exactly the string "false".
const HEADLESS = process.env.HEADLESS !== 'false';
// Full path to agent-browser wrapper
const AGENT_BROWSER = '/opt/homebrew/bin/agent-browser';
// Output, log and saved-state files live next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v12-agent-browser.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v12.log');
const AUTH_STATE_FILE = path.join(__dirname, 'reonomy-auth-state.txt');
/**
 * Print a message to stdout and append a timestamped copy of it to LOG_FILE.
 *
 * @param {string} message - Text to report.
 */
function log(message) {
  // The timestamp is captured before printing, matching the on-disk ordering.
  const stampedLine = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, stampedLine);
}
/**
 * Return a promise that fulfils (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Run the agent-browser CLI with the given arguments and capture its output.
 *
 * @param {string[]} args - Arguments passed to the AGENT_BROWSER executable.
 * @param {string} [description] - Label written to the log before running.
 * @returns {Promise<string>} Trimmed stdout when the process exits with code 0.
 * @throws {Error} Rejects when the process cannot be started or exits non-zero
 *   (the message includes the trimmed stderr).
 */
async function execAgentBrowser(args, description = '') {
  // The password is passed as a CLI argument by the login flow; never echo it
  // to the console or the log file.
  const printableArgs = args.map(arg => (arg === REONOMY_PASSWORD ? '********' : arg));
  log(`🔧 ${description}`);
  log(` Command: ${[AGENT_BROWSER, ...printableArgs].join(' ')}`);
  return new Promise((resolve, reject) => {
    const child = spawn(AGENT_BROWSER, args);
    let stdout = '';
    let stderr = '';
    // A spawn failure emits both 'error' and 'close'; report only the first.
    let settled = false;
    child.stdout.on('data', data => {
      stdout += data.toString();
    });
    child.stderr.on('data', data => {
      stderr += data.toString();
    });
    // Without an 'error' listener a spawn failure (e.g. the binary is missing)
    // becomes an uncaught exception instead of a rejection.
    child.on('error', error => {
      if (settled) {
        return;
      }
      settled = true;
      log(` ❌ Failed to start: ${error.message}`);
      reject(new Error(`agent-browser could not be started: ${error.message}`));
    });
    child.on('close', code => {
      if (settled) {
        return;
      }
      settled = true;
      if (code === 0) {
        log(` ✅ Success`);
        resolve(stdout.trim());
        return;
      }
      log(` ❌ Failed (code ${code})`);
      if (stderr) {
        log(` Error: ${stderr.trim()}`);
      }
      reject(new Error(`agent-browser failed with code ${code}: ${stderr.trim()}`));
    });
  });
}
/**
 * Run an agent-browser command with `--json` appended and decode its output.
 *
 * @param {string[]} args - Command arguments (the `--json` flag is added here).
 * @param {string} [description] - Label forwarded to execAgentBrowser.
 * @returns {Promise<*>} The decoded JSON value, or null when stdout is not
 *   valid JSON (the parse error is logged, not thrown).
 */
async function execAgentBrowserJson(args, description = '') {
  const rawOutput = await execAgentBrowser(args.concat('--json'), description);
  let decoded = null;
  try {
    decoded = JSON.parse(rawOutput);
  } catch (parseError) {
    log(` ⚠️ JSON parse error: ${parseError.message}`);
  }
  return decoded;
}
/**
 * Read the saved auth state from AUTH_STATE_FILE, if that file exists.
 *
 * @returns {Promise<string|null>} The trimmed file contents, or null when the
 *   file is absent.
 */
async function loadAuthState() {
  if (!fs.existsSync(AUTH_STATE_FILE)) {
    return null;
  }
  const contents = fs.readFileSync(AUTH_STATE_FILE, 'utf8');
  log('🔑 Loading saved auth state...');
  log(` State file: ${AUTH_STATE_FILE}`);
  return contents.trim();
}
/**
 * Write the given state string to AUTH_STATE_FILE and log a short preview.
 *
 * @param {string} state - Text to persist; only its first 100 characters are
 *   echoed to the log.
 * @returns {Promise<void>}
 */
async function saveAuthState(state) {
  fs.writeFileSync(AUTH_STATE_FILE, state);
  log('🔑 Saved auth state to file');
  log(` State file: ${AUTH_STATE_FILE}`);
  const preview = state.slice(0, 100);
  log(` State: ${preview}...`);
}
/**
 * Take a screenshot of the current browser page for debugging.
 *
 * @param {string} filename - Either a bare file name (stored under /tmp) or an
 *   absolute path, which is used unchanged.
 * @returns {Promise<string>} The path the screenshot was requested at.
 * @throws {Error} Propagates any failure of the agent-browser command.
 */
async function takeScreenshot(filename) {
  // Prefixing an already-absolute path would produce "/tmp//tmp/..." and point
  // at a directory that does not exist.
  const screenshotPath = path.isAbsolute(filename) ? filename : `/tmp/${filename}`;
  const commandOutput = await execAgentBrowser(['screenshot', screenshotPath], 'Taking screenshot');
  if (commandOutput.includes('Saved')) {
    log(` 📸 Screenshot saved: ${screenshotPath}`);
  }
  return screenshotPath;
}
/**
 * Extract property details from the page currently open in the browser.
 *
 * The address comes from the first snapshot heading that looks like
 * "<number> <street>, <city>, <ST> <zip>"; square footage and property type
 * are searched for in the page's body text.
 *
 * @returns {Promise<{propertyAddress: string, city: string, state: string,
 *   zip: string, squareFootage: string, propertyType: string}>} Fields that
 *   could not be found are empty strings.
 * @throws {Error} Propagates failures of the underlying agent-browser commands.
 */
async function extractBuilderLotData() {
  log('📊 Extracting Builder and Lot data...');
  // execAgentBrowserJson already returns the decoded object (or null); passing
  // it through JSON.parse again would throw on every successful snapshot.
  const snapshot = await execAgentBrowserJson(['snapshot', '-i'], 'Get interactive elements');
  if (!snapshot || !snapshot.data || !snapshot.data.refs) {
    log(' ⚠️ Could not get snapshot');
    return {
      propertyAddress: '',
      city: '',
      state: '',
      zip: '',
      squareFootage: '',
      propertyType: ''
    };
  }
  const elements = Object.values(snapshot.data.refs);
  log(` Found ${elements.length} interactive elements`);

  // Capture groups: 1 = street, 2 = city, 3 = state, 4 = zip.
  const addressPattern = /^(\d+[^,\n]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/;
  let propertyAddress = '';
  let city = '';
  let state = '';
  let zip = '';
  for (const element of elements) {
    // Headings without an accessible name cannot carry an address.
    if (element?.role !== 'heading' || typeof element.name !== 'string') {
      continue;
    }
    const addressMatch = element.name.match(addressPattern);
    if (!addressMatch) {
      continue;
    }
    propertyAddress = element.name.trim();
    city = addressMatch[2].trim();
    state = addressMatch[3].trim();
    zip = addressMatch[4].trim();
    log(` 📍 Address: ${element.name}`);
    break;
  }

  // Extract square footage from body text
  const bodyTextResult = await execAgentBrowserJson(['eval', 'document.body.innerText'], 'Get body text');
  const rawBodyText = bodyTextResult?.data?.result;
  const bodyText = typeof rawBodyText === 'string' ? rawBodyText : '';
  const sfMatch = bodyText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
  const squareFootage = sfMatch ? sfMatch[0] : '';
  if (squareFootage) {
    log(` 📐 Square Footage: ${squareFootage}`);
  }

  // First match wins, so a more specific label must precede any label it
  // contains ('General Industrial' before 'Industrial').
  const typePatterns = [
    'Warehouse', 'Office Building', 'Retail Stores', 'General Industrial',
    'Industrial', 'Medical Building', 'School', 'Religious',
    'Supermarket', 'Financial Building', 'Residential', 'Vacant Land',
    'Tax Exempt', 'Mixed Use'
  ];
  const propertyType = typePatterns.find(type => bodyText.includes(type)) || '';
  if (propertyType) {
    log(` 🏢 Property Type: ${propertyType}`);
  }

  return {
    propertyAddress,
    city,
    state,
    zip,
    squareFootage,
    propertyType
  };
}
/**
 * Extract owner names, e-mail addresses and phone numbers from the page
 * currently open in the browser.
 *
 * - Owner names: lines of body text matching "Owner: <n> properties <Name>".
 * - E-mails: `mailto:` links plus e-mail-shaped strings in the body text.
 * - Phones: text of paragraphs matched by a fixed CSS selector.
 *
 * @returns {Promise<{ownerNames: string[], emails: string[], phones: string[]}>}
 *   De-duplicated values; arrays are empty when nothing was found.
 * @throws {Error} Propagates failures of the underlying agent-browser commands.
 */
async function extractOwnerTabData() {
  log('👤 Extracting Owner tab data...');
  // Built locally: asking the browser to evaluate an empty object literal only
  // added a round trip and a spurious way for extraction to abort.
  const result = {
    ownerNames: [],
    emails: [],
    phones: []
  };

  // Extract owner names from page text (proven approach)
  const bodyTextResult = await execAgentBrowserJson(['eval', 'document.body.innerText'], 'Get body text');
  const rawBodyText = bodyTextResult?.data?.result;
  const bodyText = typeof rawBodyText === 'string' ? rawBodyText : '';
  for (const line of bodyText.split('\n')) {
    // Look for "Owner: X properties" pattern
    const ownsMatch = line.match(/Owner:\s*(\d+)\s+properties?\s*([A-Z][a-z]+)/i);
    if (!ownsMatch || !ownsMatch[2]) {
      continue;
    }
    const owner = ownsMatch[2].trim();
    if (owner && owner.length > 3 && !result.ownerNames.includes(owner)) {
      result.ownerNames.push(owner);
      log(` 👤 Owner: ${owner}`);
    }
  }
  log(` 👤 Owners found: ${result.ownerNames.length}`);

  // Extract emails using dual approach
  // 1. Mailto links
  const mailtoResult = await execAgentBrowserJson(['eval', `({
mailtoLinks: Array.from(document.querySelectorAll('a[href^="mailto:"]')).map(a => a.href.replace('mailto:', ''))
});`], 'Extract mailto links');
  const mailtoLinks = mailtoResult?.data?.result?.mailtoLinks;
  if (Array.isArray(mailtoLinks)) {
    for (const link of mailtoLinks) {
      if (typeof link !== 'string') {
        continue;
      }
      // A mailto href may carry "?subject=..." after the address; drop it.
      const cleanedEmail = link.split('?')[0].trim();
      if (cleanedEmail.length > 5 && !result.emails.includes(cleanedEmail)) {
        result.emails.push(cleanedEmail);
      }
    }
    log(` 📧 Emails from mailto links: ${result.emails.length}`);
  }

  // 2. Email patterns in text
  const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
  const emailMatches = bodyText.match(emailRegex) || [];
  for (const email of emailMatches) {
    if (!result.emails.includes(email)) {
      result.emails.push(email);
    }
  }
  log(` 📧 Emails from text regex: ${emailMatches.length}`);
  log(` 📧 Total emails: ${result.emails.length}`);

  // Extract phones using user-provided CSS selector.
  // NOTE(review): the jssNNNN class names look auto-generated and may change
  // between site builds — confirm the selector still matches.
  const phoneResult = await execAgentBrowserJson(['eval', `({
phoneTexts: Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2')).map(p => p.textContent.trim()).filter(text => text.length >= 10)
});`], 'Extract phones using CSS selector');
  const phoneTexts = phoneResult?.data?.result?.phoneTexts;
  if (Array.isArray(phoneTexts)) {
    for (const phone of phoneTexts) {
      if (typeof phone !== 'string') {
        continue;
      }
      // Clean phone numbers
      const cleanPhone = phone.replace(/[\s\-\(\)]/g, '');
      // Require ten actual digits so unrelated paragraph text that happens to
      // be long enough is not recorded as a phone number.
      const digitCount = cleanPhone.replace(/\D/g, '').length;
      if (digitCount >= 10 && !result.phones.includes(cleanPhone)) {
        result.phones.push(cleanPhone);
      }
    }
    log(` 📞 Phones found: ${result.phones.length}`);
  }
  log(` 📞 Total phones: ${result.phones.length}`);
  return result;
}
/**
 * Collect the property IDs linked from the page currently open in the browser.
 *
 * Every snapshot element with role "link" whose URL contains
 * "property/<id>" contributes one entry; a property linked several times is
 * reported once, in order of first appearance.
 *
 * @returns {Promise<Array<{id: string, url: string}>>} Empty when the snapshot
 *   is unavailable or contains no property links.
 * @throws {Error} Propagates failures of the underlying agent-browser command.
 */
async function extractPropertyIds() {
  log('📍 Extracting property IDs...');
  const snapshot = await execAgentBrowserJson(['snapshot', '-c'], 'Get property links from search');
  if (!snapshot || !snapshot.data || !snapshot.data.refs) {
    log(' ⚠️ Could not get snapshot');
    return [];
  }
  const seenIds = new Set();
  const propertyIds = [];
  // Find all property links from search results
  for (const element of Object.values(snapshot.data.refs)) {
    if (element?.role !== 'link' || typeof element.url !== 'string') {
      continue;
    }
    const match = element.url.match(/property\/([a-f0-9-]+)/);
    // Skip non-property links and properties already collected, so the same
    // property is not scraped twice.
    if (!match || seenIds.has(match[1])) {
      continue;
    }
    seenIds.add(match[1]);
    propertyIds.push({
      id: match[1],
      url: element.url
    });
  }
  log(` ✅ Found ${propertyIds.length} property IDs`);
  return propertyIds;
}
/**
 * Find the refs of the e-mail box, password box and submit button among the
 * elements of a login-page snapshot (each is null when not found).
 */
function findLoginFormRefs(refs) {
  let emailRef = null;
  let passwordRef = null;
  let loginButtonRef = null;
  for (const [ref, element] of Object.entries(refs)) {
    if (element.role === 'textbox') {
      const name = (element.name || element.placeholder || '').toLowerCase();
      if (name.includes('email')) {
        emailRef = ref;
      } else if (name.includes('password')) {
        passwordRef = ref;
      }
    } else if (element.role === 'button' && element.name) {
      const name = element.name.toLowerCase();
      if (name.includes('log in') || name.includes('sign in')) {
        loginButtonRef = ref;
      }
    }
  }
  return { emailRef, passwordRef, loginButtonRef };
}

/**
 * Open the login page, sign in when necessary, and return the search ID to
 * use: the one found in the post-login URL, otherwise `defaultSearchId`.
 */
async function loginAndResolveSearchId(defaultSearchId) {
  log('\n📍 Step 1: Checking login status...');
  await execAgentBrowser(['open', 'https://app.reonomy.com/#!/login'], 'Open login page');
  await sleep(2000);
  // Check if we're already logged in
  const snapshot = await execAgentBrowserJson(['snapshot', '-i'], 'Check if already logged in');
  // The snapshot is null when its JSON could not be parsed, so guard every level.
  const refs = snapshot?.data?.refs;
  // Check if we see "Search Reonomy" button - indicates we're logged in
  const isAlreadyLoggedIn = Object.values(refs || {}).some(
    elem => elem.role === 'button' && elem.name === 'Search Reonomy'
  );
  if (isAlreadyLoggedIn) {
    log('✅ Already logged in!');
    return defaultSearchId;
  }

  log('🔐 Not logged in, proceeding with login flow...');
  if (!refs) {
    log(' ⚠️ Could not get login form snapshot');
    throw new Error('Login form not found');
  }
  const { emailRef, passwordRef, loginButtonRef } = findLoginFormRefs(refs);
  if (!emailRef || !passwordRef || !loginButtonRef) {
    log(' ⚠️ Could not find login form elements');
    throw new Error('Login form not found');
  }

  // Fill email using ref
  log(' 📧 Filling email...');
  await execAgentBrowser(['fill', emailRef, REONOMY_EMAIL], 'Fill email');
  await sleep(500);
  // Fill password using ref
  log(' 🔒 Filling password...');
  await execAgentBrowser(['fill', passwordRef, REONOMY_PASSWORD], 'Fill password');
  await sleep(500);
  // Click login button using ref
  log(' 🔑 Clicking login button...');
  await execAgentBrowser(['click', loginButtonRef], 'Click login button');
  await sleep(500);
  // Press Enter to submit the form
  log(' ⏎ Pressing Enter to submit...');
  await execAgentBrowser(['press', 'Enter'], 'Press Enter');
  // Wait for login
  log(' ⏳ Waiting for login...');
  await sleep(15000);

  // Check if logged in
  const urlCheck = await execAgentBrowserJson(['eval', 'window.location.href'], 'Check current URL');
  const currentUrl = urlCheck?.data?.result;
  // App routes are hash-bang URLs ("#!/..."), so the home page is "#!/home";
  // the historical "/!/home" spelling is still accepted for compatibility.
  const landedInApp =
    typeof currentUrl === 'string' &&
    (currentUrl.includes('#!/search/') ||
      currentUrl.includes('#!/home') ||
      currentUrl.includes('/!/home'));
  if (!landedInApp) {
    log('⚠️ Could not confirm login - URL does not match expected pattern');
    throw new Error('Login may have failed');
  }
  log('✅ Successfully logged in!');

  // Extract search ID from current URL if present
  const searchIdMatch = currentUrl.match(/#!\/search\/([a-f0-9-]+)/);
  if (!searchIdMatch) {
    // Login went to home page, the caller navigates to the search afterwards
    log('🏠 Logged in to home page, will navigate to search');
    return defaultSearchId;
  }
  // Save auth state for future use
  await saveAuthState(currentUrl);
  log('📝 Search ID updated: ' + searchIdMatch[1]);
  return searchIdMatch[1];
}

/**
 * Return the search ID embedded in a previously saved auth state string;
 * throws when the string contains none.
 */
function searchIdFromSavedState(savedState) {
  // NOTE(review): the saved "auth state" is the post-login URL written by
  // saveAuthState(); nothing in this file restores cookies or session data
  // into the browser, so skipping login presumably relies on the agent-browser
  // session still being authenticated — confirm.
  log('✅ Found saved auth state! Skipping login flow.');
  log(` Saved state: ${savedState.substring(0, 100)}...`);
  const searchIdMatch = savedState.match(/#!\/search\/([a-f0-9-]+)/);
  if (!searchIdMatch) {
    log('⚠️ Could not extract search ID from saved state');
    throw new Error('Could not extract search ID from saved auth state');
  }
  log(`📝 Search ID from saved state: ${searchIdMatch[1]}`);
  return searchIdMatch[1];
}

/**
 * Main scraper function.
 *
 * Logs in (or reuses saved state), opens the saved search, collects up to
 * MAX_PROPERTIES property IDs, visits each property's ownership page to
 * extract property and owner data, and writes all leads to OUTPUT_FILE.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When login cannot be confirmed, the saved state holds no
 *   search ID, no properties are found, or an agent-browser command fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v12 (AGENT-BROWSER EDITION)...\n');
  // Check for saved auth state
  const savedState = await loadAuthState();
  // Kept in a local: the configured SEARCH_ID is only the default, and the ID
  // found at runtime replaces it for this run without touching module state.
  const searchId = savedState
    ? searchIdFromSavedState(savedState)
    : await loginAndResolveSearchId(SEARCH_ID);

  // Step 2: Navigate to search
  log('\n📍 Step 2: Navigating to search...');
  const searchUrl = `https://app.reonomy.com/#!/search/${searchId}`;
  await execAgentBrowser(['open', searchUrl], 'Open search URL');
  await sleep(3000);

  // Step 3: Extract property IDs
  log('\n📍 Step 3: Extracting property IDs...');
  const propertyIds = await extractPropertyIds();
  if (propertyIds.length === 0) {
    log(' ⚠️ No property IDs found.');
    throw new Error('No properties found on search page.');
  }
  log(` ✅ Found ${propertyIds.length} property IDs`);

  // Step 4: Process each property
  const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
  log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);
  const leads = [];
  for (const [index, prop] of propertiesToScrape.entries()) {
    log(`\n[${index + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);
    // Navigate directly to the ownership page instead of clicking property cards
    const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;
    log(` 🔗 Navigating to ownership page...`);
    await execAgentBrowser(['open', ownershipUrl], 'Open ownership URL');
    await sleep(5000);
    // Wait for Owner tab to load
    log(' ⏳ Waiting for Owner tab to load...');
    await sleep(8000);
    // Extract property details from the open page
    log(' 📊 Extracting Builder and Lot data...');
    const builderLotData = await extractBuilderLotData();
    // Wait a moment before extracting Owner tab
    await sleep(500);
    // Extract data from Owner tab
    log(' 👤 Extracting Owner tab data...');
    const ownerData = await extractOwnerTabData();
    const lead = {
      scrapeDate: new Date().toISOString().split('T')[0],
      propertyId: prop.id,
      propertyUrl: ownershipUrl,
      ...builderLotData,
      ...ownerData,
      searchId
    };
    log(` 📧 Emails: ${ownerData.emails.length}`);
    log(` 📞 Phones: ${ownerData.phones.length}`);
    log(` 👤 Owners: ${ownerData.ownerNames.length}`);
    log(` 📍 Address: ${builderLotData.propertyAddress || 'N/A'}`);
    leads.push(lead);
    // Screenshot for debugging (first 3 properties only)
    if (index < 3) {
      // Pass the bare file name: takeScreenshot() places it under /tmp itself.
      await takeScreenshot(`reonomy-v12-property-${index + 1}.png`);
    }
  }

  // Step 5: Save results
  if (leads.length > 0) {
    log(`\n✅ Total leads scraped: ${leads.length}`);
    const outputData = {
      scrapeDate: new Date().toISOString(),
      searchId,
      leadCount: leads.length,
      leads
    };
    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
    log(`💾 Saved to: ${OUTPUT_FILE}`);
  } else {
    log('\n⚠ No leads scraped.');
  }
  log('\n✅ Scraping complete!');
  return { leadCount: leads.length, outputFile: OUTPUT_FILE };
}
/**
 * Script entry point: run the scraper and exit with 0 on success. On failure,
 * log the error, attempt an error-state screenshot, and exit with 1.
 */
(async function main() {
  let exitCode = 0;
  try {
    await scrapeLeads();
  } catch (error) {
    exitCode = 1;
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // Best effort only: a failed screenshot must not mask the original error.
    try {
      await takeScreenshot('reonomy-v12-error.png');
      log('📸 Error screenshot saved: /tmp/reonomy-v12-error.png');
    } catch (e) {
      log('Could not save error screenshot');
    }
  }
  process.exit(exitCode);
})();