#!/usr/bin/env node
/**
 * Reonomy Scraper v10 - AGENT-BROWSER EDITION
 *
 * Key improvements over v9:
 * - Uses agent-browser instead of Puppeteer (faster, more reliable)
 * - State save/load for auth persistence (skip repeated login)
 * - Extracts from BOTH "Builder and Lot" AND "Owner" tabs
 * - Ref-based navigation for AI-friendly interaction
 * - Semantic locators instead of fragile CSS selectors
 *
 * Usage:
 *   REONOMY_EMAIL="you@example.com" REONOMY_PASSWORD="..." \
 *   SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" \
 *   node reonomy-scraper-v10-agent-browser.js
 *
 * Environment variables:
 *   REONOMY_EMAIL      Login email (required only when a login is performed).
 *   REONOMY_PASSWORD   Login password (required only when a login is performed).
 *   REONOMY_SEARCH_ID  Saved-search ID to scrape (SEARCH_ID is accepted as an alias).
 *   MAX_PROPERTIES     Maximum number of properties to process (default 20).
 */

'use strict';

const { spawn } = require('child_process');
const fs = require('fs');
const path = require('path');

// Configuration
const DEFAULT_SEARCH_ID = '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';
const DEFAULT_MAX_PROPERTIES = 20;

// Credentials are read from the environment only; secrets must not live in source control.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || '';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '';

// `let` because login() replaces it with the search ID captured from the post-login URL.
// The header documents SEARCH_ID while the code historically read REONOMY_SEARCH_ID; accept both.
let SEARCH_ID = process.env.REONOMY_SEARCH_ID || process.env.SEARCH_ID || DEFAULT_SEARCH_ID;
const MAX_PROPERTIES = parseMaxProperties(process.env.MAX_PROPERTIES);

const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v10-agent-browser.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v10.log');
const STATE_FILE = path.join(__dirname, 'reonomy-auth-state.txt');
const SEARCH_ID_FILE = path.join(__dirname, 'reonomy-search-id.txt');

// Heading text shaped like "<street>, <city>, <ST> <zip>".
// Capture groups: 1 = street, 2 = city, 3 = state, 4 = zip.
const ADDRESS_PATTERN = /(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/;

// First match wins, so a more specific label must precede any label it contains
// ('General Industrial' before 'Industrial').
const PROPERTY_TYPES = [
  'Warehouse',
  'Office Building',
  'Retail Stores',
  'General Industrial',
  'Industrial',
  'Medical Building',
  'School',
  'Religious',
  'Supermarket',
  'Financial Building',
  'Residential',
  'Vacant Land',
  'Tax Exempt',
  'Mixed Use',
];

const SQUARE_FOOTAGE_PATTERN = /(\d+\.?\d*\s*k?\s*SF)/i;

// Owner name patterns (from previous scraper). Capture group 2 is the owner name.
const OWNER_PATTERNS = [
  /Owns\s+(\d+)\s+properties?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+(?:\s+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))))/g,
];

const PROPERTY_LINK_PATTERN = /property\/([a-f0-9-]+)/;

// In-page scripts handed to `agent-browser eval`.
// NOTE(review): the jssNNNN class names in the phone selector are generated and look fragile.
const PHONE_SCRIPT = `Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2')).map(p => p.textContent.trim()).filter(text => text && text.length >= 10)`;
const EMAIL_SCRIPT = `Array.from(document.querySelectorAll('a[href^="mailto:"], a[href*="@"]')).map(a => { const href = a.getAttribute('href'); if (href && href.includes('mailto:')) { return href.replace('mailto:', ''); } else if (href && href.includes('@')) { return href; } return ''; }).filter(email => email && email.length > 3 && email.includes('@'))`;

/**
 * Parse the MAX_PROPERTIES setting.
 *
 * @param {string|undefined} raw - Raw environment value.
 * @returns {number} A positive integer; DEFAULT_MAX_PROPERTIES when `raw` is missing or invalid.
 */
function parseMaxProperties(raw) {
  const parsed = Number.parseInt(raw, 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_MAX_PROPERTIES;
}

/**
 * Print a message to the console and append it, timestamped, to LOG_FILE.
 *
 * @param {string} message - Text to log.
 */
function log(message) {
  const timestamp = new Date().toISOString();
  const logMessage = `[${timestamp}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, logMessage);
}

/**
 * Resolve after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Execute an agent-browser command and capture its output.
 *
 * @param {string[]} args - Arguments passed to the `agent-browser` executable.
 * @param {string} [description] - Human-readable label written to the log.
 * @param {{redactArgs?: boolean}} [options] - `redactArgs: true` logs only the sub-command,
 *   keeping secrets embedded in later arguments out of the log file.
 * @returns {Promise<string>} Trimmed stdout when the process exits with code 0.
 * @throws {Error} When the process cannot be started or exits with a non-zero code.
 */
async function execAgentBrowser(args, description = '', { redactArgs = false } = {}) {
  const command = 'agent-browser';
  const shownArgs = redactArgs ? [...args.slice(0, 1), '[redacted]'] : args;

  log(`🔧 ${description}`);
  log(` Command: ${[command, ...shownArgs].join(' ')}`);

  return new Promise((resolve, reject) => {
    // The args array must NOT repeat the command name; spawn() supplies argv[0] itself.
    const child = spawn(command, args);
    let stdout = '';
    let stderr = '';

    child.stdout.on('data', (data) => {
      stdout += data.toString();
    });

    child.stderr.on('data', (data) => {
      stderr += data.toString();
    });

    // Emitted when the process cannot be spawned (e.g. the binary is not on PATH).
    // Without this listener the event would be thrown as an uncaught exception.
    child.on('error', (error) => {
      log(` ❌ Failed to start ${command}: ${error.message}`);
      reject(new Error(`agent-browser could not be started: ${error.message}`, { cause: error }));
    });

    child.on('close', (code) => {
      if (code === 0) {
        log(` ✅ Success`);
        resolve(stdout.trim());
        return;
      }
      log(` ❌ Failed (code ${code})`);
      if (stderr) {
        log(` Error: ${stderr.trim()}`);
      }
      reject(new Error(`agent-browser failed with code ${code}: ${stderr.trim()}`));
    });
  });
}

/**
 * Execute an agent-browser command with `--json` appended and parse its output.
 *
 * @param {string[]} args - Arguments passed to `agent-browser` (without `--json`).
 * @param {string} [description] - Human-readable label written to the log.
 * @param {{redactArgs?: boolean}} [options] - Forwarded to execAgentBrowser.
 * @returns {Promise<object|null>} The parsed value, or null when stdout is not valid JSON.
 */
async function execAgentBrowserJson(args, description = '', options = {}) {
  const output = await execAgentBrowser([...args, '--json'], description, options);
  try {
    return JSON.parse(output);
  } catch (error) {
    log(` ⚠️ JSON parse error: ${error.message}`);
    return null;
  }
}

/**
 * Execute an agent-browser command and return a success boolean.
 *
 * @param {string[]} args - Arguments passed to `agent-browser`.
 * @param {string} [description] - Human-readable label written to the log.
 * @returns {Promise<boolean>} True when stdout contains a check mark or does not contain 'error'.
 */
async function execAgentBrowserSuccess(args, description = '') {
  const output = await execAgentBrowser(args, description);
  return output.includes('✓') || !output.includes('error');
}

/**
 * Return the payload of a parsed `--json` response.
 *
 * Callers in this file historically read both `response.data.*` and `response.*`,
 * so either shape is accepted.
 *
 * @param {object|null} response - Value returned by execAgentBrowserJson.
 * @returns {object} `response.data` when it is an object, else `response`, else `{}`.
 */
function unwrapPayload(response) {
  if (response === null || typeof response !== 'object') {
    return {};
  }
  const { data } = response;
  return data !== null && typeof data === 'object' ? data : response;
}

/**
 * Return the ref -> element map of a parsed snapshot response.
 *
 * @param {object|null} snapshot - Value returned by execAgentBrowserJson for `snapshot`.
 * @returns {object} The `refs` map, or `{}` when absent.
 */
function getSnapshotRefs(snapshot) {
  const { refs } = unwrapPayload(snapshot);
  return refs !== null && typeof refs === 'object' ? refs : {};
}

/**
 * Evaluate a JavaScript expression in the page via `agent-browser eval --json`.
 *
 * @param {string} expression - Expression to evaluate in the page.
 * @param {string} description - Human-readable label written to the log.
 * @param {{redactArgs?: boolean}} [options] - Forwarded to execAgentBrowser.
 * @returns {Promise<*>} The `result` field of the response, or null when unavailable.
 */
async function evalInPage(expression, description, options = {}) {
  const response = await execAgentBrowserJson(['eval', expression], description, options);
  return unwrapPayload(response).result ?? null;
}

/**
 * Check if the auth state file exists and load it.
 *
 * NOTE(review): the value stored here is the post-login URL (see login()). Nothing in this
 * file feeds it back to the browser; it only decides whether the login flow is skipped.
 *
 * @returns {Promise<string|null>} Trimmed file contents, or null when STATE_FILE is missing.
 */
async function loadAuthState() {
  if (!fs.existsSync(STATE_FILE)) {
    return null;
  }
  const state = fs.readFileSync(STATE_FILE, 'utf8');
  log('🔑 Loading saved auth state...');
  log(` State file: ${STATE_FILE}`);
  return state.trim();
}

/**
 * Save auth state to STATE_FILE.
 *
 * @param {string} state - Text to persist.
 * @returns {Promise<void>}
 */
async function saveAuthState(state) {
  fs.writeFileSync(STATE_FILE, state);
  log('🔑 Saved auth state to file');
  log(` State file: ${STATE_FILE}`);
}

/**
 * Take a screenshot for debugging.
 *
 * @param {string} filename - File name created under /tmp.
 * @returns {Promise<string>} The screenshot path that was requested.
 */
async function takeScreenshot(filename) {
  const screenshotPath = `/tmp/${filename}`;
  const output = await execAgentBrowser(['screenshot', screenshotPath], 'Taking screenshot');
  if (output.includes('Saved')) {
    log(` 📸 Screenshot saved: ${screenshotPath}`);
  }
  return screenshotPath;
}

/**
 * Split a heading such as "123 Main St, Springfield, IL 62704" into address fields.
 *
 * @param {*} text - Heading text taken from a snapshot element.
 * @returns {{propertyAddress: string, city: string, state: string, zip: string}|null}
 *   Parsed fields, or null when `text` is not a string or does not match ADDRESS_PATTERN.
 */
function parseAddressHeading(text) {
  const match = typeof text === 'string' ? text.match(ADDRESS_PATTERN) : null;
  if (!match) {
    return null;
  }
  return {
    propertyAddress: text.trim(),
    city: match[2].trim(),
    state: match[3].trim(),
    zip: match[4].trim(),
  };
}

/**
 * Extract data from the Builder and Lot tab.
 *
 * @returns {Promise<{propertyAddress: string, city: string, state: string, zip: string,
 *   squareFootage: string, propertyType: string}>} Fields default to '' when not found.
 */
async function extractBuilderLotData() {
  log('📊 Extracting Builder and Lot data...');

  // execAgentBrowserJson already returns a parsed object; it must not be parsed again.
  const snapshot = await execAgentBrowserJson(['snapshot', '-i'], 'Get interactive elements');
  const refs = getSnapshotRefs(snapshot);
  log(` Found ${Object.keys(refs).length} interactive elements`);

  const propertyData = {
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    squareFootage: '',
    propertyType: '',
  };

  // Try headings first (property address)
  for (const element of Object.values(refs)) {
    if (element?.role !== 'heading') {
      continue;
    }
    const address = parseAddressHeading(element.name);
    if (address) {
      Object.assign(propertyData, address);
      log(` 📍 Address: ${element.name}`);
      break;
    }
  }

  const bodyResult = await evalInPage('document.body.innerText', 'Get body text');
  const bodyText = typeof bodyResult === 'string' ? bodyResult : '';

  // Extract property type from body text
  const propertyType = PROPERTY_TYPES.find((type) => bodyText.includes(type));
  if (propertyType) {
    propertyData.propertyType = propertyType;
    log(` 🏢 Property Type: ${propertyType}`);
  }

  // Extract square footage from body text
  const sfMatch = bodyText.match(SQUARE_FOOTAGE_PATTERN);
  if (sfMatch) {
    propertyData.squareFootage = sfMatch[0];
    log(` 📐 Square Footage: ${sfMatch[0]}`);
  }

  return propertyData;
}

/**
 * Collect unique owner names from page text.
 *
 * Uses matchAll so the name capture group is available; String.match with a /g regex
 * returns only whole matches, which would include the "Owns N properties" prefix.
 *
 * @param {string} bodyText - Visible text of the page.
 * @returns {string[]} Unique owner names longer than 3 characters, in order of appearance.
 */
function extractOwnerNames(bodyText) {
  const names = [];
  for (const pattern of OWNER_PATTERNS) {
    for (const match of bodyText.matchAll(pattern)) {
      const owner = match[2]?.trim();
      if (owner && owner.length > 3 && !names.includes(owner)) {
        names.push(owner);
      }
    }
  }
  return names;
}

/**
 * Strip whitespace, dashes and parentheses from phone strings and drop short/duplicate entries.
 *
 * @param {*} rawPhones - Result of PHONE_SCRIPT; anything but an array yields [].
 * @returns {string[]} Unique cleaned strings of at least 10 characters.
 */
function normalizePhones(rawPhones) {
  const phones = [];
  if (!Array.isArray(rawPhones)) {
    return phones;
  }
  for (const phone of rawPhones) {
    if (typeof phone !== 'string') {
      continue;
    }
    const cleanPhone = phone.replace(/[\s\-()]/g, '');
    if (cleanPhone.length >= 10 && !phones.includes(cleanPhone)) {
      phones.push(cleanPhone);
    }
  }
  return phones;
}

/**
 * Extract data from the Owner tab.
 *
 * @returns {Promise<{ownerNames: string[], emails: string[], phones: string[]}>}
 */
async function extractOwnerData() {
  log('👤 Extracting Owner tab data...');

  const ownerData = {
    ownerNames: [],
    emails: [],
    phones: [],
  };

  // Extract owner names from page text
  const bodyResult = await evalInPage('document.body.innerText', 'Get body text');
  const bodyText = typeof bodyResult === 'string' ? bodyResult : '';
  ownerData.ownerNames = extractOwnerNames(bodyText);

  // Extract phones using user-provided CSS selector
  const phoneResult = await evalInPage(PHONE_SCRIPT, 'Extract phones');
  if (Array.isArray(phoneResult)) {
    ownerData.phones = normalizePhones(phoneResult);
    log(` 📞 Phones found: ${ownerData.phones.length}`);
  }

  // Extract emails using mailto links
  const emailResult = await evalInPage(EMAIL_SCRIPT, 'Extract emails');
  if (Array.isArray(emailResult)) {
    const newEmails = [...new Set(emailResult)].filter(
      (email) => typeof email === 'string' && !ownerData.emails.includes(email),
    );
    ownerData.emails.push(...newEmails);
    log(` 📧 Emails found: ${ownerData.emails.length} (new: ${newEmails.length})`);
  }

  return ownerData;
}

/**
 * Collect unique property IDs from link elements of a search-results snapshot.
 *
 * @param {object} refs - ref -> element map from getSnapshotRefs.
 * @returns {{id: string, url: string}[]} One entry per distinct property ID.
 */
function collectPropertyIds(refs) {
  const seen = new Set();
  const properties = [];
  for (const element of Object.values(refs)) {
    if (element?.role !== 'link' || typeof element.url !== 'string') {
      continue;
    }
    const match = element.url.match(PROPERTY_LINK_PATTERN);
    // The same property can be linked several times on a page; scrape it once.
    if (match && !seen.has(match[1])) {
      seen.add(match[1]);
      properties.push({
        id: match[1],
        url: `https://app.reonomy.com/#!/search/${SEARCH_ID}/property/${match[1]}`,
      });
    }
  }
  return properties;
}

/**
 * Main scraper function: open the saved search, visit up to MAX_PROPERTIES properties,
 * and write the collected leads to OUTPUT_FILE.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When no property links are found on the search page, or when any
 *   agent-browser command fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v10 (AGENT-BROWSER EDITION)...\n');

  // Step 1: Check for saved auth state
  const savedState = await loadAuthState();
  if (savedState) {
    log(`✅ Found saved auth state! Skipping login flow.`);
    log(` Saved state: ${savedState.substring(0, 100)}...`);
  }

  // Step 2: Navigate to search using search ID
  log('\n📍 Step 1: Navigating to search...');
  const searchUrl = `https://app.reonomy.com/#!/search/${SEARCH_ID}`;
  await execAgentBrowser(['open', searchUrl], 'Open search URL');
  await sleep(3000);

  // Step 3: Extract property IDs from search results
  log('\n📍 Step 2: Extracting property IDs...');
  const snapshot = await execAgentBrowserJson(['snapshot', '-c'], 'Get property links from search');
  const propertyIds = collectPropertyIds(getSnapshotRefs(snapshot));

  log(`✅ Found ${propertyIds.length} property IDs`);
  if (propertyIds.length === 0) {
    log('⚠️ No property IDs found.');
    throw new Error('No properties found on search page.');
  }

  // Step 4: Process each property
  const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
  log(`\n📍 Step 3: Processing ${propertiesToScrape.length} properties...\n`);

  const leads = [];
  for (const [i, prop] of propertiesToScrape.entries()) {
    log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

    // Navigate to property ownership page directly
    log(` 🔗 Navigating to ownership page...`);
    const ownershipUrl = `https://app.reonomy.com/#!/search/${SEARCH_ID}/property/${prop.id}/ownership`;
    await execAgentBrowser(['open', ownershipUrl], 'Open ownership URL');
    await sleep(8000); // Wait for page to load

    // Extract data from BOTH tabs
    log(` 📊 Extracting Builder and Lot data...`);
    const builderLotData = await extractBuilderLotData();

    log(` 👤 Extracting Owner tab data...`);
    const ownerData = await extractOwnerData();

    const lead = {
      scrapeDate: new Date().toISOString().split('T')[0],
      propertyId: prop.id,
      propertyUrl: ownershipUrl,
      ...builderLotData,
      ...ownerData,
      searchId: SEARCH_ID,
    };

    log(` 📧 Emails: ${lead.emails.length}`);
    log(` 📞 Phones: ${lead.phones.length}`);
    log(` 👤 Owners: ${lead.ownerNames.length}`);
    log(` 📍 Address: ${lead.propertyAddress || 'N/A'}`);

    leads.push(lead);

    // Screenshot for debugging (first 3 properties only)
    if (i < 3) {
      await takeScreenshot(`reonomy-v10-property-${i + 1}.png`);
    }
  }

  // Step 5: Save results
  if (leads.length > 0) {
    log(`\n✅ Total leads scraped: ${leads.length}`);

    const outputData = {
      scrapeDate: new Date().toISOString(),
      searchId: SEARCH_ID,
      leadCount: leads.length,
      leads,
    };

    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
    log(`💾 Saved to: ${OUTPUT_FILE}`);

    // Also save search ID for reuse
    fs.writeFileSync(SEARCH_ID_FILE, SEARCH_ID);
    log(`💾 Search ID saved to: reonomy-search-id.txt`);
  } else {
    log('\n⚠️ No leads scraped.');
  }

  log('\n✅ Scraping complete!');

  return { leadCount: leads.length, outputFile: OUTPUT_FILE };
}

/**
 * Build an in-page statement that assigns `value` to the first element matching `selector`.
 *
 * JSON.stringify produces valid JavaScript string literals, so quotes or backslashes in the
 * value cannot terminate the literal or inject code.
 *
 * NOTE(review): assigning `.value` directly may not notify the page's framework of the
 * change — confirm the login form picks the values up.
 *
 * @param {string} selector - CSS selector of the input.
 * @param {string} value - Text to assign.
 * @returns {string} JavaScript source for `agent-browser eval`.
 */
function buildFillScript(selector, value) {
  return `document.querySelector(${JSON.stringify(selector)}).value = ${JSON.stringify(value)}`;
}

/**
 * Locate the email box, password box and login button in a login-page snapshot.
 *
 * @param {object} refs - ref -> element map from getSnapshotRefs.
 * @returns {{emailRef: string|null, passwordRef: string|null, loginButtonRef: string|null}}
 */
function findLoginRefs(refs) {
  const found = { emailRef: null, passwordRef: null, loginButtonRef: null };
  for (const [ref, element] of Object.entries(refs)) {
    if (element === null || typeof element !== 'object') {
      continue;
    }
    // Accept the label from either `placeholder` or `name`.
    const fieldHint = String(element.placeholder || element.name || '').toLowerCase();
    if (element.role === 'textbox' && fieldHint.includes('email')) {
      found.emailRef = ref;
    } else if (element.role === 'textbox' && fieldHint.includes('password')) {
      found.passwordRef = ref;
    } else if (element.role === 'button' && String(element.name || '').toLowerCase().includes('log in')) {
      found.loginButtonRef = ref;
    }
  }
  return found;
}

/**
 * Log in to Reonomy, then persist the post-login URL and adopt the search ID it contains.
 *
 * @returns {Promise<void>}
 * @throws {Error} When credentials are missing, the login form cannot be found, or the
 *   post-login URL cannot be read or is not a search URL.
 */
async function login() {
  if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
    throw new Error('REONOMY_EMAIL and REONOMY_PASSWORD must be set to log in');
  }

  log('\n🔐 Step 0: Logging in to Reonomy...');

  // Navigate to login page
  await execAgentBrowser(['open', 'https://app.reonomy.com/#!/login'], 'Open login page');
  await sleep(2000);

  // Get snapshot for login form
  const snapshot = await execAgentBrowserJson(['snapshot', '-i'], 'Get login form');
  const { emailRef, passwordRef, loginButtonRef } = findLoginRefs(getSnapshotRefs(snapshot));

  if (!emailRef || !passwordRef || !loginButtonRef) {
    log('⚠️ Could not find login form elements');
    throw new Error('Login form not found');
  }

  // Fill email using evaluate
  log(' 📧 Filling email...');
  await execAgentBrowser(['eval', buildFillScript('input[type="email"]', REONOMY_EMAIL)], 'Fill email');
  await sleep(500);

  // Fill password using evaluate; redacted so the password never reaches the log file.
  log(' 🔒 Filling password...');
  await execAgentBrowser(
    ['eval', buildFillScript('input[type="password"]', REONOMY_PASSWORD)],
    'Fill password',
    { redactArgs: true },
  );
  await sleep(500);

  // Click login button
  log(' 🔑 Clicking login button...');
  await execAgentBrowser(['click', loginButtonRef], 'Click login button');

  // Wait for login and redirect
  log(' ⏳ Waiting for login to complete (15s)...');
  await sleep(15000);

  // Check if we're on a search page now
  const currentUrl = await evalInPage('window.location.href', 'Check current URL');
  if (typeof currentUrl !== 'string' || currentUrl === '') {
    log('⚠️ Could not get current URL');
    throw new Error('Could not confirm login state');
  }
  if (!currentUrl.includes('#!/search/')) {
    log('⚠️ Could not confirm login - URL does not match expected pattern');
    throw new Error('Login may have failed');
  }

  log('✅ Login successful!');

  // Extract search ID from current URL
  const searchIdMatch = currentUrl.match(/#!\/search\/([a-f0-9-]+)/);
  if (!searchIdMatch) {
    return;
  }

  log(`🔑 Saving auth state...`);
  await saveAuthState(currentUrl);

  // An explicitly configured search ID wins; otherwise use the one captured from the URL.
  const newSearchId = process.env.REONOMY_SEARCH_ID || process.env.SEARCH_ID || searchIdMatch[1];
  process.env.REONOMY_SEARCH_ID = newSearchId;
  SEARCH_ID = newSearchId;
  log(`📍 Search ID updated: ${SEARCH_ID}`);

  // Update the search ID file for reuse
  fs.writeFileSync(SEARCH_ID_FILE, SEARCH_ID);
}

/**
 * Main execution: log in when no saved auth state exists, then scrape.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const savedState = await loadAuthState();
  if (!savedState) {
    await login();
  }

  // Proceed with scraping
  await scrapeLeads();
}

main()
  .then(() => process.exit(0))
  .catch(async (error) => {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Best-effort screenshot of the error state; its own failure must not mask the original error.
    try {
      await takeScreenshot('reonomy-v10-error.png');
    } catch (screenshotError) {
      log(` ⚠️ Could not capture error screenshot: ${screenshotError.message}`);
    }

    process.exit(1);
  });