clawdbot-workspace/reonomy-scraper-v9-fixed.js

350 lines
11 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v9 - FIXED EDITION
*
* Fixed v9 issues:
* - Added missing comma to regex array (line ~90)
* - Added phone and email extraction logic (after owner names, before return)
*
* Usage:
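* REONOMY_SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v9-fixed.js
* Or export REONOMY_SEARCH_ID (plus optionally REONOMY_EMAIL, REONOMY_PASSWORD, MAX_PROPERTIES) in the environment beforehand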
*/
const { spawn } = require('child_process');
const fs = require('fs');
const path = require('path');
// Configuration
// FIXME(security): the fallback e-mail/password below appear to be live credentials
// committed to source control. They are kept only so existing invocations keep
// working; rotate them and supply REONOMY_EMAIL / REONOMY_PASSWORD via the environment.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_ID = process.env.REONOMY_SEARCH_ID || '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';
// Environment variables are always strings, so convert explicitly. Values that are
// not a non-negative number (e.g. "abc" or "-3") fall back to the default of 20
// instead of reaching Array.prototype.slice as NaN or a negative index.
const requestedMaxProperties = Number(process.env.MAX_PROPERTIES || 20);
const MAX_PROPERTIES = Number.isFinite(requestedMaxProperties) && requestedMaxProperties >= 0
  ? Math.floor(requestedMaxProperties)
  : 20;
// Results and the run log are written next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v9-fixed.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v9-fixed.log');
/**
 * Echo a message to stdout and append a timestamped copy of it to LOG_FILE.
 *
 * @param {string} message - Text to report.
 */
function log(message) {
  const stampedLine = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, stampedLine);
}
/**
 * Pause for the given duration.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Run the `agent-browser` CLI with the given arguments and collect its output.
 *
 * NOTE: the full command line is written to the log verbatim, including any
 * secrets embedded in `args`.
 *
 * @param {string[]} args - Arguments for `agent-browser` (sub-command first).
 * @param {string} [description=''] - Human-readable label written to the log.
 * @returns {Promise<string>} Trimmed stdout when the process exits with code 0.
 * @throws {Error} Rejects when the process cannot be started or exits non-zero.
 */
async function execAgentBrowser(args, description = '') {
  const command = 'agent-browser';
  log(`🔧 ${description}`);
  log(` Command: ${[command, ...args].join(' ')}`);
  return new Promise((resolve, reject) => {
    // spawn() sets argv[0] itself; passing the command name again in the argument
    // list would make agent-browser receive its own name as the sub-command.
    const child = spawn(command, args);
    let stdout = '';
    let stderr = '';
    // 'error' and 'close' can both fire for one failed spawn; report only once.
    let settled = false;
    child.stdout.on('data', (data) => {
      stdout += data.toString();
    });
    child.stderr.on('data', (data) => {
      stderr += data.toString();
    });
    // Without this listener a missing/unexecutable binary raises an unhandled
    // 'error' event and the returned promise never settles.
    child.on('error', (err) => {
      if (settled) {
        return;
      }
      settled = true;
      log(` ❌ Failed to start ${command}: ${err.message}`);
      reject(new Error(`agent-browser could not be started: ${err.message}`, { cause: err }));
    });
    child.on('close', (code) => {
      if (settled) {
        return;
      }
      settled = true;
      if (code === 0) {
        log(` ✅ Success`);
        resolve(stdout.trim());
        return;
      }
      log(` ❌ Failed (code ${code})`);
      if (stderr) {
        log(` Error: ${stderr.trim()}`);
      }
      reject(new Error(`agent-browser failed with code ${code}: ${stderr.trim()}`));
    });
  });
}
/**
 * Log in to Reonomy through agent-browser and decide which search to scrape.
 *
 * NOTE(review): the credentials are embedded in the `eval` script, and
 * execAgentBrowser logs the full command line, so they end up in the log file.
 *
 * @returns {Promise<string>} Search ID to use for the rest of the run: the
 *   REONOMY_SEARCH_ID environment variable when set, otherwise the ID found in the
 *   post-login URL, otherwise the configured SEARCH_ID.
 * @throws {Error} When the login form elements are not found in the snapshot or
 *   the post-login URL is not a search page.
 */
async function loginToReonomy() {
  log('\n🔐 Step 1: Logging in to Reonomy...');
  await execAgentBrowser(['open', 'https://app.reonomy.com/#!/login'], 'Open login page');
  await sleep(2000);
  // Get snapshot for login form
  const loginSnapshot = JSON.parse(await execAgentBrowser(['snapshot', '-i'], 'Get login form'));
  let emailRef = null;
  let passwordRef = null;
  let loginButtonRef = null;
  for (const [ref, element] of Object.entries(loginSnapshot?.data?.refs ?? {})) {
    const placeholder = String(element?.placeholder ?? '').toLowerCase();
    const name = String(element?.name ?? '').toLowerCase();
    if (element?.role === 'textbox' && placeholder.includes('email')) {
      emailRef = ref;
    } else if (element?.role === 'textbox' && placeholder.includes('password')) {
      passwordRef = ref;
    } else if (element?.role === 'button' && name.includes('log in')) {
      loginButtonRef = ref;
    }
  }
  // The input refs only prove the form is present; the fields themselves are
  // filled through CSS selectors below.
  if (!emailRef || !passwordRef || !loginButtonRef) {
    log('⚠️ Could not find login form elements');
    throw new Error('Login form not found');
  }
  // JSON.stringify yields a correctly escaped JS string literal, so quotes or
  // backslashes in the credentials cannot break (or inject into) the script.
  log(' 📧 Filling email...');
  await execAgentBrowser(
    ['eval', `document.querySelector('input[type="email"]').value = ${JSON.stringify(REONOMY_EMAIL)}`],
    'Fill email',
  );
  await sleep(500);
  log(' 🔒 Filling password...');
  await execAgentBrowser(
    ['eval', `document.querySelector('input[type="password"]').value = ${JSON.stringify(REONOMY_PASSWORD)}`],
    'Fill password',
  );
  await sleep(500);
  log(' 🔑 Clicking login button...');
  await execAgentBrowser(['click', loginButtonRef], 'Click login button');
  // Wait for login and redirect
  log(' ⏳ Waiting for login to complete (15s)...');
  await sleep(15000);
  // A successful login is expected to land on a search page.
  const urlCheck = JSON.parse(await execAgentBrowser(['eval', 'window.location.href'], 'Check current URL'));
  const currentUrl = urlCheck?.result;
  if (typeof currentUrl !== 'string' || !currentUrl.includes('#!/search/')) {
    log('⚠️ Could not confirm login - URL does not match expected pattern');
    throw new Error('Login may have failed');
  }
  log('✅ Login successful!');
  // SEARCH_ID is a module-level const, so the effective ID is tracked locally
  // instead of being reassigned.
  let searchId = SEARCH_ID;
  const searchIdMatch = currentUrl.match(/#!\/search\/([a-f0-9-]+)/);
  if (searchIdMatch) {
    searchId = process.env.REONOMY_SEARCH_ID || searchIdMatch[1];
    process.env.REONOMY_SEARCH_ID = searchId;
    log(`📝 Search ID updated: ${searchId}`);
  }
  return searchId;
}

/**
 * Collect the distinct property IDs linked from the currently open search page.
 *
 * @param {string} searchId - Search the property URLs are built under.
 * @returns {Promise<Array<{id: string, url: string}>>} One entry per distinct
 *   property, in the order the links appear in the snapshot.
 */
async function collectPropertyIds(searchId) {
  const searchSnapshot = JSON.parse(
    await execAgentBrowser(['snapshot', '-c'], 'Get property links from search'),
  );
  const seenIds = new Set();
  const propertyIds = [];
  for (const element of Object.values(searchSnapshot?.data?.refs ?? {})) {
    if (element?.role !== 'link') {
      continue;
    }
    const match = element.url?.match(/property\/([a-f0-9-]+)/);
    // The same property may be linked more than once; scrape each only once.
    if (match && !seenIds.has(match[1])) {
      seenIds.add(match[1]);
      propertyIds.push({
        id: match[1],
        url: `https://app.reonomy.com/#!/search/${searchId}/property/${match[1]}`,
      });
    }
  }
  return propertyIds;
}

/**
 * Main scraper function: logs in, opens the search, visits the ownership page of
 * up to MAX_PROPERTIES properties and writes the collected leads to OUTPUT_FILE.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   scraped and the path of the output file.
 * @throws {Error} When login cannot be confirmed, no property links are found,
 *   or any agent-browser command fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v9 (FIXED EDITION)...\n');
  // Step 1: Login to Reonomy
  const searchId = await loginToReonomy();
  // Step 2: Navigate to search using search ID
  log('\n📍 Step 2: Navigating to search...');
  const searchUrl = `https://app.reonomy.com/#!/search/${searchId}`;
  await execAgentBrowser(['open', searchUrl], 'Open search URL');
  await sleep(3000);
  // Step 3: Extract property IDs from search results
  log('\n📍 Step 3: Extracting property IDs...');
  const propertyIds = await collectPropertyIds(searchId);
  log(`✅ Found ${propertyIds.length} property IDs`);
  if (propertyIds.length === 0) {
    log('⚠️ No property IDs found.');
    throw new Error('No properties found on search page.');
  }
  // Step 4: Process each property
  const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
  log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);
  const leads = [];
  for (const [index, prop] of propertiesToScrape.entries()) {
    log(`\n[${index + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);
    // Navigate to property ownership page directly
    log(` 🔗 Navigating to ownership page...`);
    const ownershipUrl = `${searchUrl}/property/${prop.id}/ownership`;
    await execAgentBrowser(['open', ownershipUrl], 'Open ownership URL');
    await sleep(8000); // Wait for page to load
    // Extract data from Owner tab
    log(` 📊 Extracting data from Owner tab...`);
    const propertyData = await extractOwnerTabData();
    leads.push({
      scrapeDate: new Date().toISOString().split('T')[0],
      propertyId: prop.id,
      propertyUrl: ownershipUrl,
      ...propertyData,
      searchId,
    });
    log(` 📧 Emails: ${propertyData.emails.length}`);
    log(` 📞 Phones: ${propertyData.phones.length}`);
    log(` 👤 Owners: ${propertyData.ownerNames.length}`);
    log(` 📍 Address: ${propertyData.propertyAddress || 'N/A'}`);
    // Go back to search results for next property. This goes through
    // agent-browser like every other navigation; there is no `page` object here.
    log(` 🔙 Going back to search results...`);
    await execAgentBrowser(['open', searchUrl], 'Return to search results');
    await sleep(3000);
  }
  // Step 5: Save results
  if (leads.length > 0) {
    log(`\n✅ Total leads scraped: ${leads.length}`);
    const outputData = {
      scrapeDate: new Date().toISOString(),
      searchId,
      leadCount: leads.length,
      leads,
    };
    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
    log(`💾 Saved to: ${OUTPUT_FILE}`);
  } else {
    log('\n⚠ No leads scraped.');
  }
  log('\n✅ Scraping complete!');
  return { leadCount: leads.length, outputFile: OUTPUT_FILE };
}
/**
 * Pull owner names out of the page's visible text.
 *
 * @param {string} bodyText - `document.body.innerText` of the ownership page.
 * @returns {string[]} Distinct captured names (longer than 3 characters), in
 *   order of first appearance.
 */
function extractOwnerNames(bodyText) {
  // The first pattern is a reconstruction of an expression whose groups were
  // unbalanced: capitalised words on one line, optionally followed by a company
  // suffix. Whitespace inside the name is limited to spaces/tabs so the capture
  // cannot run on into the next line.
  // NOTE(review): the second pattern captures the word after "in", which looks
  // like a location rather than an owner - confirm against real page text.
  const ownerPatterns = [
    /Owns\s+(\d+)\s+properties?\s*([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))?)/g,
    /Owns\s+(\d+)\s+properties?\s*in\s+([A-Z][a-z]+)/gi,
  ];
  const names = [];
  for (const pattern of ownerPatterns) {
    // matchAll exposes capture groups for every match; String.match with /g
    // would only return the whole "Owns N properties ..." text.
    for (const match of bodyText.matchAll(pattern)) {
      const owner = match[2]?.trim();
      if (owner && owner.length > 3 && !names.includes(owner)) {
        names.push(owner);
      }
    }
  }
  return names;
}

/**
 * Normalise raw paragraph texts into distinct phone numbers.
 *
 * @param {unknown[]} rawTexts - Candidate strings scraped from the page.
 * @returns {string[]} Cleaned numbers (whitespace, dashes and parentheses removed)
 *   that contain at least 10 digits.
 */
function normalizePhones(rawTexts) {
  const phones = [];
  for (const text of rawTexts) {
    if (typeof text !== 'string') {
      continue;
    }
    const cleanPhone = text.replace(/[\s\-()]/g, '');
    // The selector is a generic typography paragraph, so require real digits
    // rather than accepting any text that happens to be 10+ characters long.
    const digitCount = (cleanPhone.match(/\d/g) ?? []).length;
    if (digitCount >= 10 && !phones.includes(cleanPhone)) {
      phones.push(cleanPhone);
    }
  }
  return phones;
}

/**
 * Normalise raw link hrefs into distinct e-mail addresses.
 *
 * @param {unknown[]} rawHrefs - `href` attribute values scraped from the page.
 * @returns {string[]} Addresses with any `mailto:` prefix and `?query` removed;
 *   hrefs that are not shaped like an address (e.g. URLs containing "@") are dropped.
 */
function normalizeEmails(rawHrefs) {
  const emails = [];
  for (const href of rawHrefs) {
    if (typeof href !== 'string') {
      continue;
    }
    const email = href.replace(/^mailto:/i, '').split('?')[0].trim();
    if (/^[^\s@/]+@[^\s@/]+\.[^\s@/]+$/.test(email) && !emails.includes(email)) {
      emails.push(email);
    }
  }
  return emails;
}

/**
 * Extract data from Owner tab (includes ALL data: owner names, emails, phones)
 *
 * Reads the currently open page through agent-browser `eval` calls.
 *
 * @returns {Promise<{ownerNames: string[], emails: string[], phones: string[]}>}
 *   Distinct values found on the page; empty arrays when nothing matches.
 * @throws {Error} When an agent-browser command fails or returns invalid JSON.
 */
async function extractOwnerTabData() {
  log('📊 Extracting Owner tab data...');
  // The snapshot result itself is not used.
  // NOTE(review): kept in case the call refreshes agent-browser's view of the
  // page - confirm, and drop it if it has no such effect.
  await execAgentBrowser(['snapshot', '-i'], 'Get Owner tab elements');
  const ownerData = {
    ownerNames: [],
    emails: [],
    phones: [],
  };
  // Extract owner names from page text
  const bodyTextResult = await execAgentBrowser(['eval', 'document.body.innerText'], 'Get body text');
  const bodyText = JSON.parse(bodyTextResult)?.result;
  ownerData.ownerNames = extractOwnerNames(typeof bodyText === 'string' ? bodyText : '');
  // Extract phones via CSS selector.
  // NOTE(review): the jss1797/jss1798 class names look generated and may change
  // between site builds - verify the selector still matches.
  const phoneResult = await execAgentBrowser(['eval', `Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2')).map(p => p.textContent.trim()).filter(text => text && text.length >= 10)`], 'Extract phones');
  const phoneData = JSON.parse(phoneResult);
  if (Array.isArray(phoneData?.result)) {
    ownerData.phones = normalizePhones(phoneData.result);
    log(` 📞 Phones: ${ownerData.phones.length}`);
  }
  // Extract emails from mailto links and any other link whose href contains "@";
  // the raw hrefs are cleaned and validated on the Node side.
  const emailResult = await execAgentBrowser(['eval', `Array.from(document.querySelectorAll('a[href^="mailto:"], a[href*="@"]')).map(a => a.getAttribute('href') || '')`], 'Extract emails');
  const emailData = JSON.parse(emailResult);
  if (Array.isArray(emailData?.result)) {
    ownerData.emails = normalizeEmails(emailData.result);
    log(` 📧 Emails: ${ownerData.emails.length}`);
  }
  return ownerData;
}
/**
 * Main execution
 *
 * Runs the scraper and exits with code 0 on success. On failure the error is
 * logged, a best-effort screenshot of the browser state is taken, and the
 * process exits with code 1.
 */
(async () => {
  try {
    await scrapeLeads();
    process.exit(0);
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // Take screenshot of error state
    const screenshotPath = `/tmp/reonomy-v9-error.png`;
    try {
      await execAgentBrowser(['screenshot', screenshotPath], 'Taking screenshot');
    } catch (screenshotError) {
      // The screenshot is diagnostic only; its failure must not replace the
      // original error (already logged above).
      log(` ⚠️ Could not capture error screenshot: ${screenshotError.message}`);
    }
    // Exit non-zero explicitly instead of relying on an unhandled rejection.
    process.exit(1);
  }
})();