clawdbot-workspace/reonomy-scraper-v7-fixed.js

451 lines
14 KiB
JavaScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v7 - FIXED CLICK-THROUGH
*
* Key changes:
* 1. Removed invalid await inside page.evaluate()
* 2. Fixed page.evaluateHandle() usage
* 3. Better error handling
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// Each setting is read from the environment first and falls back to the literal on the right.
// NOTE(review): the fallback e-mail/password below are literal credentials committed to source.
// They look like real account details — confirm, then move them to the environment and rotate them.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
// Free-text location typed into the search box.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// Only the exact string 'true' enables headless mode; any other value opens a visible browser.
const HEADLESS = process.env.HEADLESS === 'true';
// Upper bound on how many search results are visited in one run.
const MAX_PROPERTIES = 20;
// Fixed wait (milliseconds) after opening a property before its contact info is read.
const PAGE_LOAD_DELAY_MS = 5000;
// Output files
// Both files are written next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v7-fixed.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v7-fixed.log');
/**
 * Report a message on stdout and append a timestamped copy to LOG_FILE.
 *
 * @param {string} message - Text to report. The console receives it verbatim;
 *   the log file receives it prefixed with an ISO-8601 timestamp in brackets.
 */
function log(message) {
  const stamp = new Date().toISOString();
  console.log(message);
  // Synchronous append keeps log lines in call order.
  fs.appendFileSync(LOG_FILE, `[${stamp}] ${message}\n`);
}
/**
 * Pause an async flow for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves with no value once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Apply advanced filters
 *
 * Best-effort: opens the filter panel when a filters button can be found, then
 * switches on the "Has Phone" and "Has Email" checkboxes. A missing or
 * unclickable control is logged and skipped rather than thrown, so the search
 * can still proceed unfiltered.
 *
 * @param {object} page - Puppeteer page that is already showing the search view.
 * @returns {Promise<void>}
 */
async function applyAdvancedFilters(page) {
  log('🔍 Applying advanced filters: Has Phone + Has Email...');

  // Puppeteer resolves selectors with the browser's CSS engine, which rejects
  // Playwright's `:has-text()`. Match the button by text inside the page instead
  // and let waitForFunction poll until it appears.
  const buttonHandle = await page
    .waitForFunction(() => {
      const buttons = Array.from(document.querySelectorAll('button'));
      const textOf = (el) => (el.textContent || '').toLowerCase();
      return (
        buttons.find((b) => textOf(b).includes('more filters')) ||
        buttons.find((b) => (b.getAttribute('aria-label') || '').includes('Filters')) ||
        buttons.find((b) => textOf(b).includes('filters')) ||
        null
      );
    }, { timeout: 15000 })
    .catch(() => null);

  const filtersButton = buttonHandle ? buttonHandle.asElement() : null;
  if (filtersButton) {
    try {
      await filtersButton.click();
      await sleep(2000);
    } catch (err) {
      log(`  ⚠️ Could not open the filters panel: ${err.message}`);
    }
  } else {
    log('  ⚠️ Filters button not found; looking for the checkboxes on the current view...');
  }

  await enableCheckboxFilter(page, 'phone', 'Has Phone');
  await enableCheckboxFilter(page, 'email', 'Has Email');
  await sleep(2000);
}

/**
 * Switch on one checkbox/radio filter identified by the text next to it.
 *
 * Private helper for applyAdvancedFilters. Looks for "has <keyword>" first and
 * falls back to the bare keyword. The element lookup and the click happen in a
 * single in-page call, so inputs without an `id` work too.
 *
 * @param {object} page - Puppeteer page with the filter controls rendered.
 * @param {string} keyword - Lower-case word that identifies the filter, e.g. 'phone'.
 * @param {string} displayName - Human-readable filter name used in log lines.
 * @returns {Promise<boolean>} true when the filter is on afterwards, false otherwise.
 */
async function enableCheckboxFilter(page, keyword, displayName) {
  let outcome;
  try {
    outcome = await page.evaluate((needles) => {
      const INPUT_SELECTOR = 'input[type="checkbox"], input[type="radio"]';
      const labelled = Array.from(document.querySelectorAll('label, span, div')).map((el) => ({
        el,
        text: (el.textContent || '').toLowerCase(),
      }));
      let sawLabel = false;
      for (const needle of needles) {
        // Every ancestor of the real label also contains the text, so try the
        // shortest (tightest) match first; otherwise an outer wrapper wins and
        // hands back the first checkbox on the whole page.
        const matches = labelled
          .filter(({ text }) => text.includes(needle))
          .sort((a, b) => a.text.length - b.text.length);
        if (matches.length > 0) sawLabel = true;
        for (const { el } of matches) {
          const linked = el.htmlFor ? document.getElementById(el.htmlFor) : null;
          const input =
            el.querySelector(INPUT_SELECTOR) ||
            (linked && linked.matches(INPUT_SELECTOR) ? linked : null);
          if (!input) continue;
          if (input.checked) return { status: 'already-on', id: input.id };
          input.click();
          return { status: 'switched-on', id: input.id };
        }
      }
      return { status: sawLabel ? 'no-input' : 'no-label', id: '' };
    }, [`has ${keyword}`, keyword]);
  } catch (err) {
    log(`  ⚠️ Could not apply ${displayName} filter: ${err.message}`);
    return false;
  }

  const idSuffix = outcome.id ? `: ${outcome.id}` : '';
  switch (outcome.status) {
    case 'switched-on':
      log(`  ✅ Enabled ${displayName} checkbox${idSuffix}`);
      return true;
    case 'already-on':
      log(`  ✅ ${displayName} checkbox was already enabled${idSuffix}`);
      return true;
    case 'no-input':
      log(`  ⚠️ Found ${displayName} text but no checkbox next to it`);
      return false;
    default:
      log(`  ⚠️ ${displayName} filter not found on the page`);
      return false;
  }
}
/**
 * Extract contact info from property page
 *
 * Reads the currently displayed document: e-mail addresses from `mailto:`
 * links, phone numbers from `tel:` links, the first street-address-looking line
 * of the visible text, and the owner name following an "Owns N properties"
 * phrase.
 *
 * @param {object} page - Puppeteer page showing a property.
 * @returns {Promise<{emails: string[], phones: string[], address: string, owners: string[], pageTitle: string}>}
 *   De-duplicated contact details; `address` is '' and `owners` is [] when nothing matches.
 */
async function extractContactInfoFromProperty(page) {
  return page.evaluate(() => {
    // Collect unique link targets for one URL scheme, without the scheme prefix,
    // without any `?subject=...` style query, and percent-decoded when possible.
    const collectLinkTargets = (scheme, minLength) => {
      const schemePrefix = new RegExp(`^${scheme}:`, 'i');
      const unique = new Set();
      document.querySelectorAll(`a[href^="${scheme}:"]`).forEach((anchor) => {
        const raw = anchor.href.replace(schemePrefix, '').split('?')[0];
        let value;
        try {
          value = decodeURIComponent(raw);
        } catch (err) {
          value = raw; // malformed escape sequence: keep the raw text
        }
        value = value.trim();
        if (value.length > minLength) unique.add(value);
      });
      return [...unique];
    };

    const pageText = document.body.innerText || '';

    // The `m` flag lets `^` match at the start of any line; without it the
    // address is found only when it is the very first text on the page.
    const addressMatch = pageText.match(
      /^(\d+[^,\n]+),\s*([A-Za-z \t]+),\s*([A-Z]{2})\s*(\d{5})/m
    );
    const address = addressMatch ? addressMatch[0].replace(/\s+/g, ' ').trim() : '';

    const owners = [];
    const ownerMatch = pageText.match(/Owns\s+(\d+)\s+properties?\s+([A-Za-z\s,]+)/i);
    const ownerName = ownerMatch ? ownerMatch[2].trim() : '';
    if (ownerName) owners.push(ownerName);

    return {
      emails: collectLinkTargets('mailto', 5),
      phones: collectLinkTargets('tel', 7),
      address,
      owners,
      pageTitle: document.title,
    };
  });
}
/**
 * Main scraper
 *
 * Logs in, applies the contact filters, searches SEARCH_LOCATION, visits up to
 * MAX_PROPERTIES results and writes the collected leads to OUTPUT_FILE.
 * A failure on a single property is logged and that property is skipped; the
 * remaining properties are still visited.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   collected and the path of the JSON file they were written to.
 * @throws {Error} When login fails, the search ID cannot be read from the URL,
 *   no property links are found, or a browser operation outside the
 *   per-property loop fails. The browser is closed in every case.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v7 (FIXED)...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  const leads = [];
  let page = null;
  try {
    // Created inside the try so the finally block closes the browser even if
    // tab creation fails.
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    // The form is rendered client-side; wait for it rather than trusting a fixed delay.
    await page.waitForSelector('input[type="email"]', { timeout: 30000 });
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await sleep(10000);

    // Check if logged in
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);

    // Apply advanced filters
    log('\n📍 Step 3: Applying advanced filters...');
    await applyAdvancedFilters(page);

    // Perform search
    log(`📍 Step 4: Searching for: ${SEARCH_LOCATION}...`);
    const searchInput = await page
      .waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
        timeout: 10000
      })
      .catch(() => page.waitForSelector('input[type="text"]', { timeout: 5000 }));
    if (searchInput) {
      // Triple-click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      await sleep(5000);
    }

    // Extract search ID
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs
    log('\n📍 Step 5: Extracting property IDs...');
    const propertyIds = await page.evaluate(() => {
      // A result card can hold several links to the same property; keep the
      // first link per ID so no property is visited twice.
      const seen = new Set();
      const found = [];
      document.querySelectorAll('a[href*="/property/"]').forEach((link) => {
        const match = link.href.match(/property\/([a-f0-9-]+)/);
        if (match && !seen.has(match[1])) {
          seen.add(match[1]);
          found.push({ id: match[1], url: link.href });
        }
      });
      return found;
    });
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found. The page structure may have changed.');
      throw new Error('No properties found on search page.');
    }

    // Limit properties
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 6: Clicking through ${propertiesToScrape.length} properties...`);
    for (const [index, prop] of propertiesToScrape.entries()) {
      log(`\n[${index + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);
      try {
        await openPropertyPage(page, prop);

        // Wait for property page to load
        log(' ⏳ Waiting for property page to load...');
        await sleep(PAGE_LOAD_DELAY_MS);

        // Extract contact info from property page
        log(' 📊 Extracting contact info...');
        const contactInfo = await extractContactInfoFromProperty(page);
        log(` 📧 Emails: ${contactInfo.emails.length} found: ${contactInfo.emails.join(', ') || 'none'}`);
        log(` 📞 Phones: ${contactInfo.phones.length} found: ${contactInfo.phones.join(', ') || 'none'}`);
        leads.push({
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: page.url(),
          address: contactInfo.address || '',
          emails: contactInfo.emails,
          phones: contactInfo.phones,
          owners: contactInfo.owners,
          pageTitle: contactInfo.pageTitle,
          searchLocation: SEARCH_LOCATION,
          searchId: searchId
        });
      } catch (propertyError) {
        // One bad property must not throw away the leads already collected.
        log(` ⚠️ Skipping property ${prop.id}: ${propertyError.message}`);
      }

      // Go back to search results
      log(' 🔙 Going back to search results...');
      try {
        await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });
      } catch (navError) {
        // The next property can still be opened through its direct URL.
        log(` ⚠️ Could not return to search results: ${navError.message}`);
      }
      await sleep(2000);

      // Rate limiting
      const rateDelay = 2000;
      log(` ⏸ Rate limit delay: ${rateDelay}ms...`);
      await sleep(rateDelay);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      const outputData = {
        scrapeDate: new Date().toISOString(),
        location: SEARCH_LOCATION,
        searchId: searchId,
        leadCount: leads.length,
        leads: leads
      };
      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v7-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v7-error.png');
      } catch (screenshotError) {
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    // A failing close must not replace the error (or result) already in flight.
    await browser.close().catch((closeError) => {
      log(`⚠️ Browser did not close cleanly: ${closeError.message}`);
    });
    log('\n🔚 Browser closed');
  }
}

/**
 * Bring the page onto the detail view of one property.
 *
 * Private helper for scrapeLeads. Tries the in-page button click first; if the
 * URL still does not mention the property afterwards (no matching button, or
 * the click did nothing), it loads the property's own URL so the caller never
 * reads contact details off the search-results page by mistake.
 *
 * @param {object} page - Puppeteer page currently showing the search results.
 * @param {{id: string, url: string}} prop - Property ID and absolute link taken from the results.
 * @returns {Promise<void>}
 * @throws {Error} When the direct navigation fallback fails.
 */
async function openPropertyPage(page, prop) {
  log(' 🔗 Clicking property button...');
  try {
    const clicked = await page.evaluate((propertyId) => {
      const buttons = Array.from(document.querySelectorAll('button'));
      const target =
        buttons.find((b) => {
          const link = b.querySelector('a[href*="/property/"]');
          return link && link.href.includes(propertyId);
        }) || buttons.find((b) => b.textContent.includes(propertyId));
      if (!target) return false;
      target.scrollIntoView({ behavior: 'smooth', block: 'center' });
      target.click();
      return true;
    }, prop.id);
    if (!clicked) {
      log(' ⚠️ No button found for this property.');
    }
  } catch (e) {
    log(` ⚠️ Could not click property: ${e.message}`);
  }
  await sleep(3000);
  if (!page.url().includes(prop.id)) {
    log(' ↪️ Property view did not open; loading its URL directly...');
    await page.goto(prop.url, { waitUntil: 'networkidle2', timeout: 30000 });
  }
}
// Run
/**
 * Script entry point: run the scraper once and turn its outcome into a
 * process exit code (0 on success, 1 on any failure).
 */
async function runCli() {
  try {
    const result = await scrapeLeads();
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
}
runCli();