#!/usr/bin/env node

/**
 * Reonomy Scraper v7 - FIXED CLICK-THROUGH
 *
 * Key changes:
 * 1. Removed invalid await inside page.evaluate()
 * 2. Fixed page.evaluateHandle() usage
 * 3. Better error handling
 */

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

// Configuration
//
// SECURITY FIXME: the fallback e-mail and password below are credentials
// committed to source control. They are kept so existing invocations keep
// working, but they should be rotated and removed; supply REONOMY_EMAIL and
// REONOMY_PASSWORD through the environment instead.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';

// Free-text location typed into the search box.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';

// Headless only when HEADLESS is exactly the string 'true'; any other value
// (or no value) launches a visible browser window.
const HEADLESS = process.env.HEADLESS === 'true';

// Upper bound on how many properties from the result list are visited.
const MAX_PROPERTIES = 20;

// Fixed wait (in milliseconds) after opening a property before scraping it.
const PAGE_LOAD_DELAY_MS = 5000;

// Output files, written next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v7-fixed.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v7-fixed.log');

/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * The console receives the message verbatim; the log file receives it
 * prefixed with an ISO-8601 timestamp and terminated by a newline.
 *
 * @param {string} message - Text to record.
 */
function log(message) {
  const timestamp = new Date().toISOString();
  const logMessage = `[${timestamp}] ${message}\n`;

  console.log(message);
  // Synchronous append: the entry is on disk before the caller continues,
  // including when the caller is about to call process.exit().
  fs.appendFileSync(LOG_FILE, logMessage);
}

/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the delay has elapsed.
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * In-page helper: locate the button that opens the filter panel.
 *
 * Executed inside the browser (it is serialised by Puppeteer), so it must
 * not reference anything from this file. Buttons are matched by their text
 * because CSS has no text selector; `:has-text()` is Playwright-only syntax
 * and is rejected by Puppeteer.
 *
 * @returns {Element|null} The matching button, or null when none exists yet.
 */
function findFiltersButton() {
  const buttons = Array.from(document.querySelectorAll('button'));
  const withText = (needle) =>
    buttons.find((button) => (button.textContent || '').includes(needle));

  return (
    withText('More Filters') ||
    document.querySelector('button[aria-label*="Filters"]') ||
    withText('Filters') ||
    null
  );
}

/**
 * In-page helper: find the checkbox/radio whose own label mentions the
 * keyword and tick it when it is not already ticked.
 *
 * Matching starts from the inputs and reads each input's label text, rather
 * than scanning every label/span/div for the keyword: the first container in
 * document order contains the text of the whole page, which would select an
 * unrelated checkbox. The element is clicked directly, so inputs without an
 * `id` are handled too.
 *
 * @param {string} keyword - Lower-case word to look for, e.g. 'phone'.
 * @returns {{id: string, wasChecked: boolean}|null} Details of the matched
 *   input, or null when no input matches.
 */
function tickFilterCheckbox(keyword) {
  const labelTextOf = (input) => {
    const parts = Array.from(input.labels || [], (label) => label.textContent);
    parts.push(input.getAttribute('aria-label'));
    // Only fall back to the parent's text when the input has no label at all.
    if (!parts.some(Boolean) && input.parentElement) {
      parts.push(input.parentElement.textContent);
    }
    return parts.filter(Boolean).join(' ').toLowerCase();
  };

  const inputs = Array.from(
    document.querySelectorAll('input[type="checkbox"], input[type="radio"]')
  );
  // Prefer the exact "has <keyword>" wording, then any mention of the keyword.
  const target =
    inputs.find((input) => labelTextOf(input).includes(`has ${keyword}`)) ||
    inputs.find((input) => labelTextOf(input).includes(keyword));

  if (!target) return null;

  const wasChecked = Boolean(target.checked);
  if (!wasChecked) target.click();
  return { id: target.id || '', wasChecked };
}

/**
 * Wait for the filter-panel button and click it when it appears.
 * A missing button is not an error: the filters may already be visible.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function openFilterPanel(page) {
  let moreFiltersBtn = null;
  try {
    const handle = await page.waitForFunction(findFiltersButton, { timeout: 15000 });
    moreFiltersBtn = handle ? handle.asElement() : null;
  } catch (error) {
    // Timed out waiting for the button; continue without opening the panel.
    moreFiltersBtn = null;
  }

  if (moreFiltersBtn) {
    await moreFiltersBtn.click();
    await sleep(2000);
  }
}

/**
 * Tick one contact filter and log the outcome. Best effort: failures are
 * logged and never thrown.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} keyword - Lower-case keyword passed to the in-page matcher.
 * @param {string} displayName - Human-readable filter name used in log lines.
 * @returns {Promise<boolean>} True when a matching checkbox was found.
 */
async function enableContactFilter(page, keyword, displayName) {
  let outcome;
  try {
    outcome = await page.evaluate(tickFilterCheckbox, keyword);
  } catch (error) {
    log(` ⚠️ Could not apply ${displayName} filter: ${error.message}`);
    return false;
  }

  if (!outcome) {
    log(` ⚠️ ${displayName} checkbox not found`);
    return false;
  }

  log(` ✅ Found ${displayName} checkbox: ${outcome.id}`);
  return true;
}

/**
 * Apply advanced filters
 *
 * Opens the filter panel (when its button can be found) and ticks the
 * "Has Phone" and "Has Email" checkboxes. Filters that cannot be located
 * are skipped with a warning.
 *
 * @param {object} page - Puppeteer page showing the search view.
 * @returns {Promise<void>}
 */
async function applyAdvancedFilters(page) {
  log('🔍 Applying advanced filters: Has Phone + Has Email...');

  await openFilterPanel(page);

  await enableContactFilter(page, 'phone', 'Has Phone');
  await enableContactFilter(page, 'email', 'Has Email');

  // Give the result list time to refresh after the filters change.
  await sleep(2000);
}

/**
 * In-page helper: read contact details out of the current document.
 *
 * Executed inside the browser (it is serialised by Puppeteer), so it must
 * not reference anything from this file.
 *
 * @returns {{emails: string[], phones: string[], address: string,
 *   owners: string[], pageTitle: string}} Scraped details; fields that could
 *   not be found are left empty.
 */
function collectContactInfo() {
  // Collect the distinct targets of links matching `selector`, with the
  // scheme, any "?query" suffix and percent-encoding removed.
  const linkTargets = (selector, schemePattern, minLength) => {
    const unique = new Set();
    for (const anchor of document.querySelectorAll(selector)) {
      const raw = String(anchor.href).replace(schemePattern, '').split('?')[0];
      let value = raw;
      try {
        value = decodeURIComponent(raw);
      } catch {
        // Malformed percent-encoding: keep the raw text.
      }
      value = value.trim();
      if (value.length > minLength) unique.add(value);
    }
    return [...unique];
  };

  const info = {
    // Extract emails from mailto: links
    emails: linkTargets('a[href^="mailto:"]', /^mailto:/i, 5),
    // Extract phones from tel: links
    phones: linkTargets('a[href^="tel:"]', /^tel:/i, 7),
    address: '',
    owners: [],
    pageTitle: document.title,
  };

  const pageText = document.body.innerText;

  // Extract property address: "<number> <street>, <city>, <ST> <zip>" at the
  // start of any line. The `m` flag lets `^` match at each line start (not
  // only at the very beginning of the page text), and the character classes
  // exclude newlines so a match cannot run across lines.
  const addressMatch = pageText.match(
    /^(\d+[^,\n]+),[ \t]*([A-Za-z .'-]+),[ \t]*([A-Z]{2})[ \t]*(\d{5})/m
  );
  if (addressMatch) {
    info.address = addressMatch[0];
  }

  // Look for owner names
  const ownerMatch = pageText.match(/Owns\s+(\d+)\s+properties?\s+([A-Za-z\s,]+)/i);
  if (ownerMatch) {
    const owner = ownerMatch[2].trim();
    if (owner) info.owners.push(owner);
  }

  return info;
}

/**
 * Extract contact info from property page
 *
 * @param {object} page - Puppeteer page currently showing a property.
 * @returns {Promise<{emails: string[], phones: string[], address: string,
 *   owners: string[], pageTitle: string}>} Details scraped from the page.
 */
async function extractContactInfoFromProperty(page) {
  return page.evaluate(collectContactInfo);
}

/**
 * In-page helper: collect every distinct property linked from the result
 * list. A property linked more than once is returned once, so it is not
 * scraped repeatedly.
 *
 * Executed inside the browser, so it must not reference anything from this
 * file.
 *
 * @returns {Array<{id: string, url: string}>} Property ids with the absolute
 *   URL of the first link that referenced each of them.
 */
function collectPropertyLinks() {
  const byId = new Map();
  document.querySelectorAll('a[href*="/property/"]').forEach((link) => {
    const match = link.href.match(/property\/([a-f0-9-]+)/);
    if (match && !byId.has(match[1])) {
      byId.set(match[1], { id: match[1], url: link.href });
    }
  });
  return [...byId.values()];
}

/**
 * In-page helper: click the button that belongs to a property.
 *
 * Looks first for a button wrapping a link to the property, then for a
 * button whose text contains the property id.
 *
 * @param {{id: string}} propData - Property to open.
 * @returns {boolean} True when a button was found and clicked.
 */
function clickPropertyButton(propData) {
  const buttons = Array.from(document.querySelectorAll('button'));
  const target =
    buttons.find((button) => {
      const link = button.querySelector('a[href*="/property/"]');
      return link && link.href.includes(propData.id);
    }) || buttons.find((button) => button.textContent.includes(propData.id));

  if (!target) return false;

  target.scrollIntoView({ behavior: 'smooth', block: 'center' });
  target.click();
  return true;
}

/**
 * Open a property from the result list and wait for it to load.
 *
 * Clicking is attempted first. When the click did not leave the page on a
 * URL containing the property id, the property URL is loaded directly, so
 * the caller never scrapes the search page by mistake.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @param {{id: string, url: string}} prop - Property to open.
 * @returns {Promise<void>}
 */
async function openProperty(page, prop) {
  log(' 🔗 Clicking property button...');
  try {
    const clicked = await page.evaluate(clickPropertyButton, { id: prop.id });
    if (!clicked) {
      log(' ⚠️ No matching property button found');
    }
  } catch (e) {
    log(` ⚠️ Could not click property: ${e.message}`);
  }

  await sleep(3000);

  if (!page.url().includes(prop.id)) {
    log(' ↪️ Click did not open the property, navigating directly...');
    await page.goto(prop.url, { waitUntil: 'networkidle2', timeout: 30000 });
  }

  // Wait for property page to load
  log(' ⏳ Waiting for property page to load...');
  await sleep(PAGE_LOAD_DELAY_MS);
}

/**
 * Main scraper
 *
 * Logs in, applies the contact filters, searches SEARCH_LOCATION, visits up
 * to MAX_PROPERTIES results and writes the scraped leads to OUTPUT_FILE.
 * The browser is closed on every exit path.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   scraped and the path of the output file.
 * @throws {Error} When login appears to have failed, the search id cannot be
 *   read from the URL, no property links are found, or any page operation
 *   fails. A screenshot is attempted before the error is rethrown.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v7 (FIXED)...\n');

  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080'],
  });

  const leads = [];
  // Declared outside the try so the error handler can take a screenshot;
  // created inside it so a failure here still closes the browser.
  let page;

  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });

    await sleep(2000);

    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');

    log('⏳ Waiting for login...');
    await sleep(10000);

    // Check if logged in
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await sleep(3000);

    // Apply advanced filters
    log('\n📍 Step 3: Applying advanced filters...');
    await applyAdvancedFilters(page);

    // Perform search
    log(`📍 Step 4: Searching for: ${SEARCH_LOCATION}...`);

    let searchInput;
    try {
      searchInput = await page.waitForSelector(
        'input[placeholder*="address"], input[placeholder*="Search"]',
        { timeout: 10000 }
      );
    } catch (e) {
      // Fall back to any text input; a second failure aborts the run.
      searchInput = await page.waitForSelector('input[type="text"]', { timeout: 5000 });
    }

    if (searchInput) {
      // Triple click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      await sleep(5000);
    }

    // Extract search ID
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs
    log('\n📍 Step 5: Extracting property IDs...');
    const propertyIds = await page.evaluate(collectPropertyLinks);

    log(`✅ Found ${propertyIds.length} property IDs`);

    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found. The page structure may have changed.');
      throw new Error('No properties found on search page.');
    }

    // Limit properties
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);

    log(`\n📍 Step 6: Clicking through ${propertiesToScrape.length} properties...`);

    for (const [index, prop] of propertiesToScrape.entries()) {
      log(`\n[${index + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      await openProperty(page, prop);

      // Extract contact info from property page
      log(' 📊 Extracting contact info...');
      const contactInfo = await extractContactInfoFromProperty(page);
      log(` 📧 Emails: ${contactInfo.emails.length} found: ${contactInfo.emails.join(', ') || 'none'}`);
      log(` 📞 Phones: ${contactInfo.phones.length} found: ${contactInfo.phones.join(', ') || 'none'}`);

      leads.push({
        scrapeDate: new Date().toISOString().split('T')[0],
        propertyId: prop.id,
        propertyUrl: page.url(),
        address: contactInfo.address || '',
        emails: contactInfo.emails,
        phones: contactInfo.phones,
        owners: contactInfo.owners,
        pageTitle: contactInfo.pageTitle,
        searchLocation: SEARCH_LOCATION,
        searchId,
      });

      // Go back to search results
      log(' 🔙 Going back to search results...');
      await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
        waitUntil: 'networkidle2',
        timeout: 30000,
      });

      await sleep(2000);

      // Rate limiting
      const rateDelay = 2000;
      log(` ⏸ Rate limit delay: ${rateDelay}ms...`);
      await sleep(rateDelay);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);

      const outputData = {
        scrapeDate: new Date().toISOString(),
        location: SEARCH_LOCATION,
        searchId,
        leadCount: leads.length,
        leads,
      };

      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠️ No leads scraped.');
    }

    log('\n✅ Scraping complete!');

    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Best-effort diagnostics: a failed screenshot must not mask the
    // original error, but it is reported rather than silently dropped.
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v7-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v7-error.png');
      } catch (screenshotError) {
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }

    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}

// Run
//
// Entry point: start the scrape and turn its outcome into a process exit
// code (0 on success, 1 on failure).
scrapeLeads()
  .then((result) => {
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  })
  .catch((error) => {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  });