clawdbot-workspace/reonomy-scraper-v10-agent-browser.js.backup

508 lines
16 KiB
JavaScript
Executable File
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v10 - AGENT-BROWSER EDITION
*
* Key improvements over v9:
* - Uses agent-browser instead of Puppeteer (faster, more reliable)
* - State save/load for auth persistence (skip repeated login)
* - Extracts from BOTH "Builder and Lot" AND "Owner" tabs
* - Ref-based navigation for AI-friendly interaction
* - Semantic locators instead of fragile CSS selectors
*
* Usage:
* SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v10-agent-browser.js
* Or configure via environment variables
*/
const { spawn } = require('child_process');
const fs = require('fs');
const path = require('path');
// Configuration
// NOTE(review): hard-coded credential fallbacks are a security risk; prefer
// requiring REONOMY_EMAIL / REONOMY_PASSWORD to be set in the environment.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
// BUG FIX: `let`, not `const` — the login flow reassigns SEARCH_ID when it
// captures a fresh search id from the post-login URL; `const` made that
// assignment throw a TypeError at runtime.
let SEARCH_ID = process.env.REONOMY_SEARCH_ID || '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';
// Coerce to a number: environment values are always strings, and a numeric
// bound keeps Array#slice and comparisons predictable.
const MAX_PROPERTIES = Number(process.env.MAX_PROPERTIES) || 20;
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v10-agent-browser.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v10.log');
const STATE_FILE = path.join(__dirname, 'reonomy-auth-state.txt');
// Log helper: echoes the message to stdout and appends a timestamped copy
// to LOG_FILE (synchronously, so lines survive a crash).
function log(message) {
  console.log(message);
  const stamped = `[${new Date().toISOString()}] ${message}\n`;
  fs.appendFileSync(LOG_FILE, stamped);
}
// Promise-based delay: resolves after `ms` milliseconds.
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Execute an agent-browser CLI command and capture its stdout.
 *
 * @param {string[]} args - CLI arguments (e.g. ['open', url]).
 * @param {string} [description] - Human-readable label for the log.
 * @returns {Promise<string>} Trimmed stdout when the process exits 0.
 * @throws {Error} When the process fails to spawn or exits non-zero.
 */
async function execAgentBrowser(args, description = '') {
  const command = 'agent-browser';
  log(`🔧 ${description}`);
  log(` Command: ${[command, ...args].join(' ')}`);
  return new Promise((resolve, reject) => {
    // BUG FIX: the original passed [command, ...args] as the argument list,
    // which executed `agent-browser agent-browser <args>`. spawn() takes the
    // executable and its arguments separately.
    const child = spawn(command, args);
    let stdout = '';
    let stderr = '';
    child.stdout.on('data', (data) => {
      stdout += data.toString();
    });
    child.stderr.on('data', (data) => {
      stderr += data.toString();
    });
    // Surface spawn failures (e.g. agent-browser not on PATH) instead of
    // leaving the promise pending forever.
    child.on('error', (err) => {
      log(` ❌ Spawn error: ${err.message}`);
      reject(err);
    });
    child.on('close', (code) => {
      if (code === 0) {
        log(` ✅ Success`);
        resolve(stdout.trim());
      } else {
        log(` ❌ Failed (code ${code})`);
        if (stderr) {
          log(` Error: ${stderr.trim()}`);
        }
        reject(new Error(`agent-browser failed with code ${code}: ${stderr.trim()}`));
      }
    });
  });
}
/**
 * Run an agent-browser command with `--json` appended and parse its output.
 *
 * @param {string[]} args - CLI arguments (without the trailing --json).
 * @param {string} [description] - Label passed through to the log.
 * @returns {Promise<any|null>} Parsed JSON value, or null when the output
 *   is not valid JSON (the parse error is logged).
 */
async function execAgentBrowserJson(args, description = '') {
  const raw = await execAgentBrowser([...args, '--json'], description);
  let parsed = null;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    log(` ⚠️ JSON parse error: ${err.message}`);
  }
  return parsed;
}
/**
 * Run an agent-browser command and report a best-effort success boolean.
 * Success means the output carries a check mark, or lacks the word "error".
 */
async function execAgentBrowserSuccess(args, description = '') {
  const out = await execAgentBrowser(args, description);
  if (out.includes('✓')) {
    return true;
  }
  return !out.includes('error');
}
/**
 * Load the persisted auth state from STATE_FILE, if one exists.
 *
 * @returns {Promise<string|null>} Trimmed state string, or null when no
 *   state file has been saved yet.
 */
async function loadAuthState() {
  if (!fs.existsSync(STATE_FILE)) {
    return null;
  }
  const saved = fs.readFileSync(STATE_FILE, 'utf8');
  log('🔑 Loading saved auth state...');
  log(` State file: ${STATE_FILE}`);
  return saved.trim();
}
/**
 * Persist the given auth state string to STATE_FILE so later runs can
 * skip the interactive login flow.
 *
 * @param {string} state - Opaque auth-state blob, written verbatim.
 */
async function saveAuthState(state) {
  fs.writeFileSync(STATE_FILE, state);
  log('🔑 Saved auth state to file');
  log(` State file: ${STATE_FILE}`);
}
/**
 * Capture a screenshot into /tmp for debugging.
 *
 * @param {string} filename - Base name for the screenshot file.
 * @returns {Promise<string>} The path the screenshot was written to.
 */
async function takeScreenshot(filename) {
  // BUG FIX: the original built the literal path `/tmp/$(unknown)` and never
  // used the filename argument, so every screenshot overwrote the same file.
  const screenshotPath = `/tmp/${filename}`;
  const outputPath = await execAgentBrowser(['screenshot', screenshotPath], 'Taking screenshot');
  if (outputPath.includes('Saved')) {
    log(` 📸 Screenshot saved: ${screenshotPath}`);
  }
  return screenshotPath;
}
/**
 * Extract property details from the "Builder and Lot" tab.
 *
 * Reads the address out of the page heading via the accessibility snapshot,
 * then matches property type and square footage against the page body text.
 *
 * @returns {Promise<object>} Partial lead record: propertyAddress, city,
 *   state, zip, squareFootage, propertyType (empty strings when not found).
 */
async function extractBuilderLotData() {
  log('📊 Extracting Builder and Lot data...');
  // BUG FIX: execAgentBrowserJson already returns a parsed object; the
  // original called JSON.parse on that object a second time, which throws.
  const snapshot = (await execAgentBrowserJson(['snapshot', '-i'], 'Get interactive elements')) || {};
  log(` Found ${Object.keys(snapshot.refs || {}).length} interactive elements`);
  const propertyData = {
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    squareFootage: '',
    propertyType: ''
  };
  // The heading is expected to look like "123 Main St, Springfield, NJ 07081".
  for (const element of Object.values(snapshot.refs || {})) {
    if (element.role !== 'heading') continue;
    const addressMatch = element.name.match(/(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
    if (addressMatch) {
      // BUG FIX: the original's group indices were off by one — the street
      // (group 1) was stored as city, city as state, and state as zip.
      propertyData.propertyAddress = element.name.trim();
      propertyData.city = addressMatch[2]?.trim() || '';
      propertyData.state = addressMatch[3]?.trim() || '';
      propertyData.zip = addressMatch[4]?.trim() || '';
      log(` 📍 Address: ${element.name}`);
      break;
    }
  }
  // Property type: first matching keyword in the body text wins.
  const bodyTextResult = await execAgentBrowser(['eval', 'document.body.innerText'], 'Get body text');
  // NOTE(review): assumes `eval` output is JSON shaped as { result: string } —
  // confirm against the agent-browser CLI contract.
  const bodyText = JSON.parse(bodyTextResult).result || '';
  const typePatterns = [
    'Warehouse', 'Office Building', 'Retail Stores', 'Industrial',
    'General Industrial', 'Medical Building', 'School', 'Religious',
    'Supermarket', 'Financial Building', 'Residential', 'Vacant Land',
    'Tax Exempt', 'Mixed Use'
  ];
  for (const type of typePatterns) {
    if (bodyText.includes(type)) {
      propertyData.propertyType = type;
      log(` 🏢 Property Type: ${type}`);
      break;
    }
  }
  // Square footage, e.g. "12.5k SF" or "4500 SF".
  const sfMatch = bodyText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
  if (sfMatch) {
    propertyData.squareFootage = sfMatch[0];
    log(` 📐 Square Footage: ${sfMatch[0]}`);
  }
  return propertyData;
}
/**
 * Extract owner contact data from the "Owner" tab.
 *
 * Gathers owner names from body-text patterns, phone numbers from Reonomy's
 * MUI typography nodes, and emails from mailto/@ anchor hrefs.
 *
 * @returns {Promise<{ownerNames: string[], emails: string[], phones: string[]}>}
 */
async function extractOwnerData() {
  log('👤 Extracting Owner tab data...');
  // Snapshot is taken for its logging side effect. BUG FIX: the original
  // re-parsed the already-parsed return value with JSON.parse, which throws.
  await execAgentBrowserJson(['snapshot', '-i'], 'Get Owner tab elements');
  const ownerData = {
    ownerNames: [],
    emails: [],
    phones: []
  };
  const bodyTextResult = await execAgentBrowser(['eval', 'document.body.innerText'], 'Get body text');
  const bodyText = JSON.parse(bodyTextResult).result || '';
  // Owner name patterns (from previous scraper).
  // BUG FIX: the original regex literal never closed its second capture
  // group, which is a SyntaxError when the file is loaded. One closing
  // parenthesis has been added before the /g flag.
  const ownerPatterns = [
    /Owns\s+(\d+)\s+properties?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+(?:\s+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))))/g
  ];
  for (const pattern of ownerPatterns) {
    const matches = bodyText.match(pattern);
    if (!matches) continue;
    for (const m of matches) {
      // String.match with a /g regex yields full-match strings.
      const owner = typeof m === 'string' ? m : m[1];
      if (owner && owner.length > 3 && !ownerData.ownerNames.includes(owner)) {
        ownerData.ownerNames.push(owner);
      }
    }
  }
  // Phones via the user-provided CSS selector.
  // NOTE(review): jss#### class names are build-specific and likely to break
  // on app redeploys — confirm this selector still matches.
  const phoneResult = await execAgentBrowser(['eval', `Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2')).map(p => p.textContent.trim()).filter(text => text && text.length >= 10)`], 'Extract phones');
  const phoneData = JSON.parse(phoneResult);
  if (Array.isArray(phoneData.result)) {
    for (const phone of phoneData.result) {
      // Strip spaces, dashes, and parentheses before dedupe.
      const cleanPhone = phone.replace(/[\s\-\(\)]/g, '');
      if (cleanPhone.length >= 10 && !ownerData.phones.includes(cleanPhone)) {
        ownerData.phones.push(cleanPhone);
      }
    }
    log(` 📞 Phones found: ${ownerData.phones.length}`);
  }
  // Emails via mailto links (more robust than text scraping).
  // BUG FIX: the original terminated this template literal with a double
  // quote instead of a backtick, leaving the string unterminated.
  const emailScript = `Array.from(document.querySelectorAll('a[href^="mailto:"], a[href*="@"]')).map(a => {
    const href = a.getAttribute('href');
    if (href && href.includes('mailto:')) {
      return href.replace('mailto:', '');
    } else if (href && href.includes('@')) {
      return href;
    }
    return '';
  }).filter(email => email && email.length > 3 && email.includes('@'))`;
  const emailResult = await execAgentBrowser(['eval', emailScript], 'Extract emails');
  const emailData = JSON.parse(emailResult);
  if (Array.isArray(emailData.result)) {
    const newEmails = emailData.result.filter(email => !ownerData.emails.includes(email));
    ownerData.emails.push(...newEmails);
    log(` 📧 Emails found: ${ownerData.emails.length} (new: ${newEmails.length})`);
  }
  return ownerData;
}
/**
 * Main scraper: opens the saved search, collects property IDs from result
 * links, then visits each property's ownership page and extracts both
 * builder-and-lot and owner data. Results are written to OUTPUT_FILE.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When no property IDs can be found on the search page.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v10 (AGENT-BROWSER EDITION)...\n');
  // Step 1: report whether a saved auth state exists (login ran earlier).
  const savedState = await loadAuthState();
  if (savedState) {
    log(` Found saved auth state! Skipping login flow.`);
    log(` Saved state: ${savedState.substring(0, 100)}...`);
  }
  // Step 2: open the saved search.
  log('\n📍 Step 1: Navigating to search...');
  const searchUrl = `https://app.reonomy.com/#!/search/${SEARCH_ID}`;
  await execAgentBrowser(['open', searchUrl], 'Open search URL');
  await sleep(3000);
  // Step 3: collect property IDs from result links.
  log('\n📍 Step 2: Extracting property IDs...');
  // BUG FIX: execAgentBrowserJson returns a parsed object; the original
  // re-parsed it with JSON.parse, which throws on "[object Object]".
  const snapshot = await execAgentBrowserJson(['snapshot', '-c'], 'Get property links from search');
  const propertyIds = [];
  // NOTE(review): this reads snapshot.data.refs while the extract helpers
  // read snapshot.refs — confirm which shape the CLI actually emits.
  if (snapshot && snapshot.data) {
    for (const element of Object.values(snapshot.data.refs || {})) {
      if (element.role === 'link') {
        const match = element.url?.match(/property\/([a-f0-9-]+)/);
        if (match) {
          propertyIds.push({
            id: match[1],
            url: `https://app.reonomy.com/#!/search/${SEARCH_ID}/property/${match[1]}`
          });
        }
      }
    }
  }
  log(`✅ Found ${propertyIds.length} property IDs`);
  if (propertyIds.length === 0) {
    log('⚠️ No property IDs found.');
    throw new Error('No properties found on search page.');
  }
  // Step 4: visit each property's ownership page.
  // Number() guards against MAX_PROPERTIES arriving as an env string.
  const propertiesToScrape = propertyIds.slice(0, Number(MAX_PROPERTIES));
  log(`\n📍 Step 3: Processing ${propertiesToScrape.length} properties...\n`);
  const leads = [];
  for (let i = 0; i < propertiesToScrape.length; i++) {
    const prop = propertiesToScrape[i];
    log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);
    log(` 🔗 Navigating to ownership page...`);
    const ownershipUrl = `https://app.reonomy.com/#!/search/${SEARCH_ID}/property/${prop.id}/ownership`;
    await execAgentBrowser(['open', ownershipUrl], 'Open ownership URL');
    await sleep(8000); // Give the SPA time to render
    // Extract data from BOTH tabs.
    log(` 📊 Extracting Builder and Lot data...`);
    const builderLotData = await extractBuilderLotData();
    log(` 👤 Extracting Owner tab data...`);
    const ownerData = await extractOwnerData();
    const lead = {
      scrapeDate: new Date().toISOString().split('T')[0],
      propertyId: prop.id,
      propertyUrl: ownershipUrl,
      ...builderLotData,
      ...ownerData,
      searchId: SEARCH_ID
    };
    log(` 📧 Emails: ${lead.emails.length}`);
    log(` 📞 Phones: ${lead.phones.length}`);
    log(` 👤 Owners: ${lead.ownerNames.length}`);
    log(` 📍 Address: ${lead.propertyAddress || 'N/A'}`);
    leads.push(lead);
    // Screenshot the first few properties for debugging.
    if (i < 3) {
      await takeScreenshot(`reonomy-v10-property-${i + 1}.png`);
    }
  }
  // Step 5: persist results.
  if (leads.length > 0) {
    log(`\n✅ Total leads scraped: ${leads.length}`);
    const outputData = {
      scrapeDate: new Date().toISOString(),
      searchId: SEARCH_ID,
      leadCount: leads.length,
      leads: leads
    };
    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
    log(`💾 Saved to: ${OUTPUT_FILE}`);
    // Also persist the search ID for reuse by later runs.
    fs.writeFileSync(path.join(__dirname, 'reonomy-search-id.txt'), SEARCH_ID);
    log(`💾 Search ID saved to: reonomy-search-id.txt`);
  } else {
    log('\n⚠ No leads scraped.');
  }
  log('\n✅ Scraping complete!');
  return { leadCount: leads.length, outputFile: OUTPUT_FILE };
}
/**
 * Main execution: performs login when no saved auth state exists, then runs
 * the scraper. On error, logs the failure, takes a best-effort screenshot,
 * and exits with code 1.
 */
(async () => {
  try {
    // If no saved auth state, perform login.
    const savedState = await loadAuthState();
    if (!savedState) {
      log('\n🔐 Step 0: Logging in to Reonomy...');
      await execAgentBrowser(['open', 'https://app.reonomy.com/#!/login'], 'Open login page');
      await sleep(2000);
      // BUG FIX: execAgentBrowserJson already parses; the original called
      // JSON.parse on the returned object again, which throws.
      const snapshot = await execAgentBrowserJson(['snapshot', '-i'], 'Get login form');
      // Locate the login form via accessibility roles/placeholders.
      let emailRef = null;
      let passwordRef = null;
      let loginButtonRef = null;
      if (snapshot && snapshot.data && snapshot.data.refs) {
        for (const [ref, element] of Object.entries(snapshot.data.refs)) {
          if (element.role === 'textbox' && element.placeholder && element.placeholder.toLowerCase().includes('email')) {
            emailRef = ref;
          } else if (element.role === 'textbox' && element.placeholder && element.placeholder.toLowerCase().includes('password')) {
            passwordRef = ref;
          } else if (element.role === 'button' && element.name && element.name.toLowerCase().includes('log in')) {
            loginButtonRef = ref;
          }
        }
      }
      if (!emailRef || !passwordRef || !loginButtonRef) {
        log('⚠️ Could not find login form elements');
        throw new Error('Login form not found');
      }
      // NOTE(review): setting input.value directly may not trigger the SPA's
      // change handlers (React/MUI) — confirm the form registers the values.
      log(' 📧 Filling email...');
      await execAgentBrowser(['eval', `document.querySelector('input[type="email"]').value = '${REONOMY_EMAIL}'`], 'Fill email');
      await sleep(500);
      log(' 🔒 Filling password...');
      await execAgentBrowser(['eval', `document.querySelector('input[type="password"]').value = '${REONOMY_PASSWORD}'`], 'Fill password');
      await sleep(500);
      log(' 🔑 Clicking login button...');
      await execAgentBrowser(['click', loginButtonRef], 'Click login button');
      log(' ⏳ Waiting for login to complete (15s)...');
      await sleep(15000);
      // Confirm login by inspecting the post-redirect URL.
      const urlCheckResult = await execAgentBrowser(['eval', 'window.location.href'], 'Check current URL');
      const urlCheck = JSON.parse(urlCheckResult);
      // BUG FIX: the original collapsed two conditions into one `if` but kept
      // two `else` branches; restored the intended nesting so "no URL" and
      // "wrong URL" are reported distinctly.
      if (urlCheck.result) {
        if (urlCheck.result.includes('#!/search/')) {
          log('✅ Login successful!');
          const searchIdMatch = urlCheck.result.match(/#!\/search\/([a-f0-9-]+)/);
          if (searchIdMatch) {
            const currentSearchId = searchIdMatch[1];
            log(`🔑 Saving auth state...`);
            await saveAuthState(urlCheck.result);
            // Prefer an explicitly configured search ID over the captured one.
            // NOTE(review): this reassignment requires SEARCH_ID to be
            // declared with `let` at the top of the file.
            const newSearchId = process.env.REONOMY_SEARCH_ID || currentSearchId;
            process.env.REONOMY_SEARCH_ID = newSearchId;
            SEARCH_ID = newSearchId;
            log(`📝 Search ID updated: ${SEARCH_ID}`);
            fs.writeFileSync(path.join(__dirname, 'reonomy-search-id.txt'), SEARCH_ID);
          }
        } else {
          log('⚠️ Could not confirm login - URL does not match expected pattern');
          throw new Error('Login may have failed');
        }
      } else {
        log('⚠️ Could not get current URL');
        throw new Error('Could not confirm login state');
      }
    }
    // Proceed with scraping.
    await scrapeLeads();
    process.exit(0);
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // BUG FIX: the original fired the error screenshot without awaiting it
    // and rethrew from .catch, producing an unhandled rejection. Await the
    // screenshot best-effort and exit with a failure code instead.
    try {
      await takeScreenshot('reonomy-v10-error.png');
    } catch (screenshotError) {
      log(`⚠️ Error screenshot failed: ${screenshotError.message}`);
    }
    process.exit(1);
  }
})();