350 lines
11 KiB
JavaScript
350 lines
11 KiB
JavaScript
#!/usr/bin/env node
|
||
/**
 * Reonomy Scraper v9 - FIXED EDITION
 *
 * Fixed v9 issues:
 * - Added missing comma to regex array (line ~90)
 * - Added phone and email extraction logic (after owner names, before return)
 *
 * Usage:
 *   REONOMY_SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v9-fixed.js
 *   Or set REONOMY_SEARCH_ID as an environment variable
 */
|
||
|
||
const { spawn } = require('child_process');
|
||
const fs = require('fs');
|
||
const path = require('path');
|
||
|
||
// Configuration
//
// Every value can be overridden through an environment variable of the same
// name (the search id is read from REONOMY_SEARCH_ID).
//
// SECURITY NOTE(review): the e-mail/password fallbacks below are real-looking
// credentials committed to source. They are kept only so existing invocations
// keep working; prefer supplying REONOMY_EMAIL / REONOMY_PASSWORD via the
// environment and rotating/removing these defaults.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_ID = process.env.REONOMY_SEARCH_ID || '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';

// Environment variables are always strings; parse the cap explicitly and fall
// back to the default of 20 when it is missing, non-numeric or negative.
const parsedMaxProperties = Number.parseInt(process.env.MAX_PROPERTIES ?? '', 10);
const MAX_PROPERTIES =
  Number.isInteger(parsedMaxProperties) && parsedMaxProperties >= 0 ? parsedMaxProperties : 20;

// Output artefacts are written next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v9-fixed.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v9-fixed.log');
|
||
|
||
/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * The console receives the raw message; the log file receives
 * `[<ISO-8601 timestamp>] <message>` followed by a newline.
 *
 * @param {string} message - Text to log.
 * @returns {void}
 */
function log(message) {
  const timestamp = new Date().toISOString();
  const logMessage = `[${timestamp}] ${message}\n`;
  console.log(message);
  // Synchronous append keeps log lines in call order.
  fs.appendFileSync(LOG_FILE, logMessage);
}
|
||
|
||
/**
 * Pause asynchronously for the given duration.
 *
 * @param {number} ms - Delay in milliseconds, passed to setTimeout.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
||
|
||
/**
 * Run one `agent-browser` CLI command and collect its output.
 *
 * Logs the description and the command line, then spawns the process and
 * buffers stdout/stderr until it closes.
 *
 * @param {string[]} args - Arguments for the CLI (sub-command first). The
 *   executable name itself must NOT be included.
 * @param {string} [description=''] - Human-readable label written to the log.
 * @returns {Promise<string>} Resolves with trimmed stdout when the process
 *   exits with code 0.
 * @throws {Error} Rejects when the process cannot be started (e.g. the binary
 *   is not on PATH) or when it exits with a non-zero code; the message
 *   includes the trimmed stderr output.
 */
async function execAgentBrowser(args, description = '') {
  const command = 'agent-browser';
  const cliArgs = [...args];

  log(`🔧 ${description}`);
  log(` Command: ${[command, ...cliArgs].join(' ')}`);

  return new Promise((resolve, reject) => {
    // spawn() takes the arguments WITHOUT the executable name; repeating the
    // command here would run `agent-browser agent-browser ...`.
    const child = spawn(command, cliArgs);

    let stdout = '';
    let stderr = '';

    child.stdout.on('data', (data) => {
      stdout += data.toString();
    });

    child.stderr.on('data', (data) => {
      stderr += data.toString();
    });

    // Without this listener a failed spawn (ENOENT, EACCES, ...) surfaces as an
    // unhandled 'error' event and crashes the process instead of rejecting.
    child.on('error', (err) => {
      log(` ❌ Failed to start ${command}: ${err.message}`);
      reject(new Error(`agent-browser could not be started: ${err.message}`, { cause: err }));
    });

    child.on('close', (code) => {
      if (code === 0) {
        log(` ✅ Success`);
        resolve(stdout.trim());
      } else {
        log(` ❌ Failed (code ${code})`);
        if (stderr) {
          log(` Error: ${stderr.trim()}`);
        }
        // If 'error' already rejected, this second reject is a no-op.
        reject(new Error(`agent-browser failed with code ${code}: ${stderr.trim()}`));
      }
    });
  });
}
|
||
|
||
/**
 * Pick the login form refs out of an agent-browser snapshot.
 *
 * @param {object} snapshot - Parsed snapshot; refs are read from `snapshot.data.refs`.
 * @returns {{emailRef: ?string, passwordRef: ?string, loginButtonRef: ?string}}
 *   Ref keys of the e-mail textbox, password textbox and "log in" button
 *   (null for anything not found).
 */
function findLoginFormRefs(snapshot) {
  const found = { emailRef: null, passwordRef: null, loginButtonRef: null };

  for (const [ref, element] of Object.entries(snapshot?.data?.refs ?? {})) {
    const placeholder = (element?.placeholder || '').toLowerCase();
    const name = (element?.name || '').toLowerCase();

    if (element?.role === 'textbox' && placeholder.includes('email')) {
      found.emailRef = ref;
    } else if (element?.role === 'textbox' && placeholder.includes('password')) {
      found.passwordRef = ref;
    } else if (element?.role === 'button' && name.includes('log in')) {
      found.loginButtonRef = ref;
    }
  }

  return found;
}

/**
 * Collect the distinct property ids linked from a search-results snapshot.
 *
 * @param {object} snapshot - Parsed snapshot; refs are read from `snapshot.data.refs`.
 * @param {string} searchId - Search id used to build each property URL.
 * @returns {Array<{id: string, url: string}>} One entry per unique property id,
 *   in first-seen order.
 */
function collectPropertyLinks(snapshot, searchId) {
  const seenIds = new Set();
  const properties = [];

  for (const element of Object.values(snapshot?.data?.refs ?? {})) {
    if (element?.role !== 'link') {
      continue;
    }
    const match = element.url?.match(/property\/([a-f0-9-]+)/);
    // The same property can be linked more than once; scrape each id only once.
    if (!match || seenIds.has(match[1])) {
      continue;
    }
    seenIds.add(match[1]);
    properties.push({
      id: match[1],
      url: `https://app.reonomy.com/#!/search/${searchId}/property/${match[1]}`
    });
  }

  return properties;
}

/**
 * Main scraper function.
 *
 * Logs in to Reonomy through the agent-browser CLI, opens the configured
 * search, visits the ownership page of up to MAX_PROPERTIES properties,
 * extracts owner data for each and writes the collected leads to OUTPUT_FILE.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   scraped and the path of the output file.
 * @throws {Error} When the login form cannot be found, the post-login URL is
 *   not a search page, no property links are found, or any agent-browser
 *   command fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v9 (FIXED EDITION)...\n');

  // SEARCH_ID is a module-level const, so the id actually used is tracked in a
  // local that can be replaced by the one captured after login.
  let searchId = SEARCH_ID;

  // Step 1: Login to Reonomy
  log('\n🔐 Step 1: Logging in to Reonomy...');

  await execAgentBrowser(['open', 'https://app.reonomy.com/#!/login'], 'Open login page');
  await sleep(2000);

  // Get snapshot for login form
  const loginSnapshotResult = await execAgentBrowser(['snapshot', '-i'], 'Get login form');
  const loginSnapshot = JSON.parse(loginSnapshotResult);

  const { emailRef, passwordRef, loginButtonRef } = findLoginFormRefs(loginSnapshot);

  if (!emailRef || !passwordRef || !loginButtonRef) {
    log('⚠️ Could not find login form elements');
    throw new Error('Login form not found');
  }

  // Fill email. JSON.stringify yields a correctly escaped JS string literal, so
  // quotes or backslashes in the credentials cannot break the evaluated script.
  log(' 📧 Filling email...');
  await execAgentBrowser(
    ['eval', `document.querySelector('input[type="email"]').value = ${JSON.stringify(REONOMY_EMAIL)}`],
    'Fill email'
  );
  await sleep(500);

  // Fill password
  log(' 🔒 Filling password...');
  await execAgentBrowser(
    ['eval', `document.querySelector('input[type="password"]').value = ${JSON.stringify(REONOMY_PASSWORD)}`],
    'Fill password'
  );
  await sleep(500);

  // Click login button
  log(' 🔑 Clicking login button...');
  await execAgentBrowser(['click', loginButtonRef], 'Click login button');

  // Wait for login and redirect
  log(' ⏳ Waiting for login to complete (15s)...');
  await sleep(15000);

  // Check if we're on search page now
  const urlCheckResult = await execAgentBrowser(['eval', 'window.location.href'], 'Check current URL');
  const currentUrl = JSON.parse(urlCheckResult)?.result;

  if (typeof currentUrl !== 'string' || !currentUrl.includes('#!/search/')) {
    log('⚠️ Could not confirm login - URL does not match expected pattern');
    throw new Error('Login may have failed');
  }

  log('✅ Login successful!');

  // Extract search ID from current URL
  const searchIdMatch = currentUrl.match(/#!\/search\/([a-f0-9-]+)/);
  if (searchIdMatch) {
    // An explicitly configured id wins; otherwise use the one the app redirected to.
    searchId = process.env.REONOMY_SEARCH_ID || searchIdMatch[1];
    process.env.REONOMY_SEARCH_ID = searchId;
    log(`📝 Search ID updated: ${searchId}`);
  }

  // Step 2: Navigate to search using search ID
  log('\n📍 Step 2: Navigating to search...');
  const searchUrl = `https://app.reonomy.com/#!/search/${searchId}`;

  await execAgentBrowser(['open', searchUrl], 'Open search URL');
  await sleep(3000);

  // Step 3: Extract property IDs from search results
  log('\n📍 Step 3: Extracting property IDs...');
  // NOTE(review): this previously called `execAgentBrowserJson`, which is not
  // defined in this file; `execAgentBrowser` is used as for the login snapshot.
  // Confirm the CLI emits JSON for this invocation.
  const searchSnapshotResult = await execAgentBrowser(['snapshot', '-c'], 'Get property links from search');
  const searchSnapshot = JSON.parse(searchSnapshotResult);

  // Find all property links from search results
  const propertyIds = collectPropertyLinks(searchSnapshot, searchId);

  log(`✅ Found ${propertyIds.length} property IDs`);

  if (propertyIds.length === 0) {
    log('⚠️ No property IDs found.');
    throw new Error('No properties found on search page.');
  }

  // Step 4: Process each property
  const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
  log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);

  const leads = [];

  for (let i = 0; i < propertiesToScrape.length; i++) {
    const prop = propertiesToScrape[i];

    log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

    // Navigate to property ownership page directly
    log(` 🔗 Navigating to ownership page...`);
    const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;

    await execAgentBrowser(['open', ownershipUrl], 'Open ownership URL');
    await sleep(8000); // Wait for page to load

    // Extract data from Owner tab
    log(` 📊 Extracting data from Owner tab...`);
    const propertyData = await extractOwnerTabData();

    const lead = {
      scrapeDate: new Date().toISOString().split('T')[0],
      propertyId: prop.id,
      propertyUrl: ownershipUrl,
      ...propertyData,
      searchId
    };

    log(` 📧 Emails: ${propertyData.emails.length}`);
    log(` 📞 Phones: ${propertyData.phones.length}`);
    log(` 👤 Owners: ${propertyData.ownerNames.length}`);
    log(` 📍 Address: ${propertyData.propertyAddress || 'N/A'}`);

    leads.push(lead);

    // Go back to search results for next property. The browser is driven only
    // through the agent-browser CLI; there is no `page` object in this script.
    log(` 🔙 Going back to search results...`);
    await execAgentBrowser(['open', searchUrl], 'Return to search results');
    await sleep(3000);
  }

  // Step 5: Save results
  if (leads.length > 0) {
    log(`\n✅ Total leads scraped: ${leads.length}`);

    const outputData = {
      scrapeDate: new Date().toISOString(),
      searchId,
      leadCount: leads.length,
      leads
    };

    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
    log(`💾 Saved to: ${OUTPUT_FILE}`);
  } else {
    log('\n⚠️ No leads scraped.');
  }

  log('\n✅ Scraping complete!');
  return { leadCount: leads.length, outputFile: OUTPUT_FILE };
}
|
||
|
||
/**
 * Extract data from Owner tab (includes ALL data: owner names, emails, phones).
 *
 * Operates on whatever page is currently open in agent-browser:
 *  - owner names are matched in `document.body.innerText` with the
 *    "Owns N properties ..." patterns below,
 *  - phone candidates are read from a fixed CSS selector and stripped of
 *    spaces, dashes and parentheses,
 *  - e-mails are read from `mailto:` / `@` link hrefs.
 * Each list is de-duplicated.
 *
 * @returns {Promise<{ownerNames: string[], emails: string[], phones: string[]}>}
 * @throws {Error} When an agent-browser command fails, or (SyntaxError) when an
 *   eval result is not valid JSON.
 */
async function extractOwnerTabData() {
  log('📊 Extracting Owner tab data...');

  // Get snapshot of Owner tab. The result is not used by the extraction below;
  // the call is kept so the sequence of browser commands is unchanged.
  // NOTE(review): this previously called `execAgentBrowserJson`, which is not
  // defined in this file.
  await execAgentBrowser(['snapshot', '-i'], 'Get Owner tab elements');

  const ownerData = {
    ownerNames: [],
    emails: [],
    phones: []
  };

  // Extract owner names from page text
  const bodyTextResult = await execAgentBrowser(['eval', 'document.body.innerText'], 'Get body text');
  const rawBodyText = JSON.parse(bodyTextResult)?.result;
  const bodyText = typeof rawBodyText === 'string' ? rawBodyText : '';

  // Owner name patterns. The first pattern's parentheses were unbalanced in the
  // previous revision (a syntax error); it is reconstructed here as
  // "<Capitalised words> [company suffix]".
  // NOTE(review): capture group 2 is taken as the owner name for both
  // patterns; for the second one that is the word following "in" — confirm
  // this is the intended value.
  const ownerPatterns = [
    /Owns\s+(\d+)\s+properties?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))?)/g,
    /Owns\s+(\d+)\s+properties?\s*in\s+([A-Z][a-z]+)/gi
  ];

  for (const pattern of ownerPatterns) {
    for (const match of bodyText.matchAll(pattern)) {
      // \s+ in the pattern can span line breaks; normalise to single spaces.
      const owner = match[2].replace(/\s+/g, ' ').trim();
      if (owner.length > 3 && !ownerData.ownerNames.includes(owner)) {
        ownerData.ownerNames.push(owner);
      }
    }
  }

  // Extract phones using the CSS selector
  const phoneResult = await execAgentBrowser(['eval', `Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2')).map(p => p.textContent.trim()).filter(text => text && text.length >= 10)`], 'Extract phones');
  const phoneData = JSON.parse(phoneResult);

  if (Array.isArray(phoneData?.result)) {
    for (const phone of phoneData.result) {
      // Clean phone numbers (remove extra spaces, formatting)
      const cleanPhone = String(phone).replace(/[\s\-()]/g, '');
      if (cleanPhone.length >= 10 && !ownerData.phones.includes(cleanPhone)) {
        ownerData.phones.push(cleanPhone);
      }
    }
    log(` 📞 Phones: ${ownerData.phones.length}`);
  }

  // Extract emails using mailto links (plus any link whose href contains "@")
  const emailResult = await execAgentBrowser(['eval', `Array.from(document.querySelectorAll('a[href^="mailto:"], a[href*="@"]')).map(a => {
    const href = a.getAttribute('href');
    if (href && href.includes('mailto:')) {
      return href.replace('mailto:', '');
    } else if (href && href.includes('@')) {
      return href;
    }
    return '';
  }).filter(email => email && email.length > 3)`], 'Extract emails');
  const emailData = JSON.parse(emailResult);

  if (Array.isArray(emailData?.result)) {
    for (const email of emailData.result) {
      if (email && email.length > 3 && !ownerData.emails.includes(email)) {
        ownerData.emails.push(email);
      }
    }
    log(` 📧 Emails: ${ownerData.emails.length}`);
  }

  return ownerData;
}
|
||
|
||
/**
 * Main execution
 *
 * Runs the scraper and exits with code 0 on success. On failure the error is
 * logged, a best-effort screenshot of the browser state is taken, and the
 * process exits with code 1.
 */
(async () => {
  try {
    await scrapeLeads();
    process.exit(0);
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Take screenshot of error state. This is diagnostics only: a failure here
    // must not replace the original error or leave a rejected promise behind.
    const screenshotPath = `/tmp/reonomy-v9-error.png`;
    try {
      await execAgentBrowser(['screenshot', screenshotPath], 'Taking screenshot');
    } catch (screenshotError) {
      log(`⚠️ Could not capture error screenshot: ${screenshotError.message}`);
    }

    // Exit explicitly instead of re-throwing inside the async IIFE, which
    // would only surface as an unhandled promise rejection.
    process.exit(1);
  }
})();
|