clawdbot-workspace/reonomy-scraper-v6-clickthrough.js

403 lines
12 KiB
JavaScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v6 - CLICK-THROUGH APPROACH
*
* Key changes:
* 1. Use advanced filters: "Has Phone" + "Has Email"
* 2. Click into properties (not just navigate to ownership)
* 3. Extract contact info from property page
* 4. Go back to results
* 5. Repeat for next properties
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// Credentials come from the environment only. Account secrets must never be
// committed to source control, so there are deliberately no fallback values.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
// Location typed into the search box; override with REONOMY_LOCATION.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// Headless only when HEADLESS is exactly "true"; any other value shows the browser window.
const HEADLESS = process.env.HEADLESS === 'true';
// Upper bound on how many search results are visited in one run.
const MAX_PROPERTIES = 20;
const PAGE_LOAD_DELAY_MS = 8000; // Longer wait for property pages
// Output files
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v6-clickthrough.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v6.log');
// Fail fast with an actionable message instead of typing "undefined" into the login form.
if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
  throw new Error(
    'Missing credentials: set the REONOMY_EMAIL and REONOMY_PASSWORD environment variables.'
  );
}
/**
 * Report a message on stdout and append a timestamped copy to LOG_FILE.
 *
 * The file write is synchronous, so the line is on disk before this returns.
 *
 * @param {string} message - Text to report.
 */
function log(message) {
  const stamp = new Date().toISOString();
  console.log(message);
  fs.appendFileSync(LOG_FILE, `[${stamp}] ${message}\n`);
}
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Enable a single contact filter (e.g. "Has Phone") in the open filter panel.
 *
 * The lookup and the click happen in one in-page call, so no DOM node has to
 * cross the Node/browser boundary; only a plain descriptor is returned.
 *
 * @param {object} page - Puppeteer page showing the filter panel.
 * @param {string} filterName - Human-readable filter name used in log lines.
 * @param {string[]} phrases - Lower-case phrases to look for, most specific first.
 * @returns {Promise<boolean>} true when a control was found (and clicked if it
 *   was not already checked), false when nothing matched or the lookup failed.
 */
async function enableContactFilter(page, filterName, phrases) {
  let outcome = null;
  try {
    outcome = await page.evaluate((needles) => {
      const textOf = (el) => (el.textContent || '').toLowerCase();
      const candidates = Array.from(document.querySelectorAll('label, span, div'));

      let matches = [];
      for (const needle of needles) {
        matches = candidates.filter((el) => textOf(el).includes(needle));
        if (matches.length > 0) break;
      }
      if (matches.length === 0) return null;

      // Every ancestor wrapper also contains the phrase; the element with the
      // least text is the label itself rather than the whole panel.
      const label = matches.reduce(
        (best, el) => (textOf(el).length < textOf(best).length ? el : best),
        matches[0]
      );

      const container = label.closest('div, form, label');
      const input = container
        ? container.querySelector('input[type="checkbox"], input[type="radio"]')
        : null;

      if (!input) {
        // No native input nearby (custom widget or <label for=...>): click the label.
        label.click();
        return { tag: label.tagName, id: label.id || '', action: 'label-clicked' };
      }

      const wasChecked = Boolean(input.checked);
      if (!wasChecked) input.click();
      return {
        tag: input.tagName,
        id: input.id,
        action: wasChecked ? 'already-checked' : 'checked'
      };
    }, phrases);
  } catch (err) {
    log(` ⚠️ Could not interact with ${filterName} filter: ${err.message}`);
    return false;
  }

  if (!outcome) {
    log(` ⚠️ ${filterName} filter not found`);
    return false;
  }
  if (outcome.action === 'label-clicked') {
    log(` ⚠️ No checkbox found for ${filterName} filter, clicked its label instead`);
  } else {
    log(` ✅ Found ${filterName} filter: ${outcome.tag}#${outcome.id}`);
  }
  await sleep(1000);
  return true;
}

/**
 * Apply advanced filters
 *
 * Opens the "More Filters"/"Filters" panel when such a button shows up within
 * 15 seconds, then tries to enable the "Has Phone" and "Has Email" filters.
 * Everything here is best-effort: a missing button or filter is logged and
 * skipped, never thrown.
 *
 * @param {object} page - Puppeteer page already on the search screen.
 * @returns {Promise<void>}
 */
async function applyAdvancedFilters(page) {
  log('🔍 Applying advanced filters: Has Phone + Has Email...');

  // Puppeteer's CSS selectors cannot match on text (":has-text" is Playwright
  // syntax), so poll the DOM for a button whose text or aria-label fits.
  let filtersButton = null;
  try {
    const handle = await page.waitForFunction(() => {
      const buttons = Array.from(document.querySelectorAll('button'));
      const labelOf = (btn) => `${btn.textContent || ''} ${btn.getAttribute('aria-label') || ''}`;
      return (
        buttons.find((btn) => labelOf(btn).includes('More Filters')) ||
        buttons.find((btn) => labelOf(btn).includes('Filters')) ||
        null
      );
    }, { timeout: 15000 });
    filtersButton = handle.asElement();
  } catch (err) {
    log(` ⚠️ Filters button not found: ${err.message}`);
  }

  if (filtersButton) {
    await filtersButton.click();
    await sleep(2000);
  }

  await enableContactFilter(page, 'Has Phone', ['has phone', 'phone']);
  await enableContactFilter(page, 'Has Email', ['has email', 'email']);

  await sleep(2000);
}
/**
 * Extract contact info from property page (after clicking into it)
 *
 * Runs entirely inside the page and returns a plain, serializable record.
 *
 * @param {object} page - Puppeteer page currently showing a property.
 * @returns {Promise<{emails: string[], phones: string[], address: string, owners: string[], pageTitle: string}>}
 *   Unique e-mail addresses and phone numbers taken from mailto:/tel: links,
 *   the first "street, city, ST 12345" line found in the page text (or ''),
 *   the owner name following an "Owns N properties" phrase (if any), and the
 *   document title.
 */
async function extractContactInfoFromProperty(page) {
  return page.evaluate(() => {
    // Helper lives inside the callback because the callback is shipped to the browser.
    // Strips the URL scheme and any "?subject=..." style query, then percent-decodes.
    const valueFromLink = (href, schemePattern) => {
      const raw = href.replace(schemePattern, '').split('?')[0];
      try {
        return decodeURIComponent(raw).trim();
      } catch (err) {
        // Malformed percent-encoding: keep the raw text rather than lose the contact.
        return raw.trim();
      }
    };

    // Sets drop duplicates: the same address is often linked several times on a page.
    const emails = new Set();
    document.querySelectorAll('a[href^="mailto:"]').forEach((a) => {
      const email = valueFromLink(a.href, /^mailto:/i);
      if (email.length > 5) {
        emails.add(email);
      }
    });

    const phones = new Set();
    document.querySelectorAll('a[href^="tel:"]').forEach((a) => {
      const phone = valueFromLink(a.href, /^tel:/i);
      if (phone.length > 7) {
        phones.add(phone);
      }
    });

    const bodyText = document.body.innerText || '';

    // "m" flag lets ^ match at any line start; without it the address was only
    // found when it happened to be the very first text on the page.
    const addressMatch = bodyText.match(/^(\d+[^,\n]+),\s*([A-Za-z\s]+),\s*([A-Z]{2})\s*(\d{5})/m);

    // Look for owner names
    const ownerMatch = bodyText.match(/Owns\s+(\d+)\s+properties?\s+([A-Za-z\s,]+)/i);
    const owner = ownerMatch ? ownerMatch[2].trim() : '';

    return {
      emails: [...emails],
      phones: [...phones],
      address: addressMatch ? addressMatch[0] : '',
      // The capture can be whitespace only; do not record an empty owner.
      owners: owner ? [owner] : [],
      pageTitle: document.title
    };
  });
}
/**
 * Write the collected leads to OUTPUT_FILE as pretty-printed JSON.
 *
 * @param {Array<object>} leads - Lead records gathered during the run.
 * @param {string} searchId - Search identifier parsed from the results URL.
 */
function saveLeads(leads, searchId) {
  const outputData = {
    scrapeDate: new Date().toISOString(),
    location: SEARCH_LOCATION,
    searchId,
    leadCount: leads.length,
    leads
  };
  fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
  log(`💾 Saved to: ${OUTPUT_FILE}`);
}

/**
 * Open a property's detail view starting from the search results page.
 *
 * First tries to click the result entry whose link contains the property ID.
 * When nothing could be clicked, the property URL is loaded directly, so the
 * caller never ends up scraping the results page under a property's ID.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @param {{id: string, url: string}} prop - Property to open.
 * @returns {Promise<void>}
 */
async function openPropertyPage(page, prop) {
  let clicked = false;
  try {
    clicked = await page.evaluate((propertyId) => {
      const anchor = Array.from(document.querySelectorAll('a[href*="/property/"]'))
        .find((a) => a.href.includes(propertyId));
      if (!anchor) return false;
      // Prefer the enclosing button when the link is wrapped in one.
      const target = anchor.closest('button') || anchor;
      target.scrollIntoView({ block: 'center' });
      target.click();
      return true;
    }, prop.id);
  } catch (err) {
    // A click that navigates can also reject this call; the direct load below
    // targets the same page, so falling through is safe either way.
    log(` ⚠️ Could not click property: ${err.message}`);
  }

  if (!clicked) {
    log(' ↪️ Result entry not clicked, opening property URL directly...');
    await page.goto(prop.url, { waitUntil: 'networkidle2', timeout: 60000 });
  }
}

/**
 * Main scraper
 *
 * Logs in, applies the contact filters, searches SEARCH_LOCATION, then visits
 * up to MAX_PROPERTIES unique results and records the contact info found on
 * each. A failure on one property is logged and skipped so that the leads
 * already collected are still saved. The browser is always closed.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   collected and the path they are written to (written only when at least
 *   one lead was collected).
 * @throws {Error} When login fails, the search ID cannot be read from the URL,
 *   or the results page lists no properties.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v6 (CLICK-THROUGH APPROACH)...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  const leads = [];
  // Declared outside the try so the error handler can take a screenshot.
  let page = null;
  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    // The form is rendered client-side; wait for it instead of trusting the fixed delay.
    await page.waitForSelector('input[type="email"]', { timeout: 30000 });
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await sleep(10000);

    // Check if logged in
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);

    // Apply advanced filters for contact info
    log('\n📍 Step 3: Applying advanced filters...');
    await applyAdvancedFilters(page);

    // Perform search
    log(`📍 Step 4: Searching for: ${SEARCH_LOCATION}...`);
    let searchInput;
    try {
      searchInput = await page.waitForSelector(
        'input[placeholder*="address"], input[placeholder*="Search"]',
        { timeout: 10000 }
      );
    } catch (err) {
      // Fall back to any text input; if that is missing too, the error propagates.
      searchInput = await page.waitForSelector('input[type="text"]', { timeout: 5000 });
    }
    if (searchInput) {
      // Triple click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      await sleep(5000);
    }

    // Extract search ID
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs from search results
    log('\n📍 Step 5: Extracting property IDs...');
    const propertyLinks = await page.evaluate(() => {
      const found = [];
      document.querySelectorAll('a[href*="/property/"]').forEach((link) => {
        const match = link.href.match(/property\/([a-f0-9-]+)/);
        if (match) {
          found.push({ id: match[1], url: link.href });
        }
      });
      return found;
    });

    // A result can be linked more than once; visit each property only once.
    const seenIds = new Set();
    const propertyIds = propertyLinks.filter(({ id }) => {
      if (seenIds.has(id)) return false;
      seenIds.add(id);
      return true;
    });
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found. Check search results.');
      throw new Error('No properties found on search page.');
    }

    // Limit properties
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 6: Clicking through ${propertiesToScrape.length} properties...`);
    for (const [index, prop] of propertiesToScrape.entries()) {
      log(`\n[${index + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      // One bad property must not discard the leads collected so far.
      try {
        log(` 🔗 Clicking property...`);
        await openPropertyPage(page, prop);
        await sleep(3000);

        // Wait for property page to load
        log(` ⏳ Waiting for property page to load...`);
        await sleep(PAGE_LOAD_DELAY_MS);

        // Extract contact info from property page
        const contactInfo = await extractContactInfoFromProperty(page);
        log(` 📧 Emails: ${contactInfo.emails.length} found`);
        log(` 📞 Phones: ${contactInfo.phones.length} found`);
        leads.push({
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: prop.url,
          address: contactInfo.address || '',
          emails: contactInfo.emails,
          phones: contactInfo.phones,
          owners: contactInfo.owners,
          pageTitle: contactInfo.pageTitle
        });
      } catch (err) {
        log(` ⚠️ Skipping property ${prop.id}: ${err.message}`);
      }

      // Go back to search results
      try {
        log(` 🔙 Going back to search results...`);
        await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });
        await sleep(3000);
      } catch (err) {
        // Not fatal: the next property falls back to loading its URL directly.
        log(` ⚠️ Could not return to search results: ${err.message}`);
      }

      // Rate limiting
      const rateDelay = 2000;
      log(` ⏸ Rate limit delay: ${rateDelay}ms...`);
      await sleep(rateDelay);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      saveLeads(leads, searchId);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v6-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v6-error.png');
      } catch (screenshotError) {
        // Best effort only; the original error below is what matters.
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Run
/**
 * Script entry point: run the scraper once and exit with status 0 on success
 * or 1 on failure, reporting the outcome through log().
 */
async function main() {
  try {
    const outcome = await scrapeLeads();
    log(`\n🎉 Success! ${outcome.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${outcome.outputFile}`);
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
}
main();