#!/usr/bin/env node

/**
 * Reonomy Scraper v10 - OWNER TAB EXTRACTION WITH FILTERS
 *
 * Key improvements:
 * - Applies phone and email filters in advanced search > owner section
 * - Extended wait (up to 30s) for contact details to load
 * - Waits until emails or phones are found (or the wait times out) before proceeding
 */

const fs = require('fs');
const os = require('os');
const path = require('path');

const puppeteer = require('puppeteer');

// Configuration

/**
 * Read a required setting from the environment.
 *
 * @param {string} name - Environment variable name.
 * @returns {string} The non-empty value of the variable.
 * @throws {Error} When the variable is unset or empty.
 */
function requireEnv(name) {
  const value = process.env[name];
  if (value === undefined || value === '') {
    throw new Error(`Missing required environment variable: ${name}`);
  }
  return value;
}

// SECURITY: credentials are read from the environment only. Earlier revisions
// shipped a real account email/password as fallback literals; secrets must
// never live in source control, and any value that was committed should be
// rotated. Export REONOMY_EMAIL and REONOMY_PASSWORD before running.
const REONOMY_EMAIL = requireEnv('REONOMY_EMAIL');
const REONOMY_PASSWORD = requireEnv('REONOMY_PASSWORD');

// Location typed into the search box; override with REONOMY_LOCATION.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';

// Headless only when HEADLESS is exactly the string "true".
const HEADLESS = process.env.HEADLESS === 'true';

// Upper bound on how many search results are visited per run.
const MAX_PROPERTIES = 20;

// Output files
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v10-filters.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v10.log');

/**
 * Print a message to stdout and append it to the scraper log file.
 *
 * The console gets the raw message; the file line is prefixed with an
 * ISO-8601 timestamp in square brackets and terminated by a newline.
 *
 * @param {string} message - Text to record.
 */
function log(message) {
  const stamp = new Date().toISOString();
  console.log(message);
  fs.appendFileSync(LOG_FILE, `[${stamp}] ${message}\n`);
}

/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the delay has elapsed.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Extract ALL data from Owner tab.
 *
 * Runs inside the page and scrapes the current document: the property id from
 * the URL, the address from the first h1/h2/h3 that looks like
 * "<street>, <city>, <ST> <zip>", square footage and property type from the
 * visible text, and e-mails, phone numbers and owner names from links and
 * visible text.
 *
 * @param {object} page - Puppeteer page currently showing a property.
 * @returns {Promise<{propertyId: string, propertyAddress: string, city: string,
 *   state: string, zip: string, squareFootage: string, propertyType: string,
 *   emails: string[], phones: string[], ownerNames: string[],
 *   pageTitle: string, bodyTextSample: string}>} Scraped fields; anything that
 *   was not found stays as an empty string or empty array.
 */
async function extractOwnerTabData(page) {
  // Everything inside the callback executes in the browser, so it must be
  // self-contained and cannot reference Node-side helpers.
  return page.evaluate(() => {
    const info = {
      propertyId: '',
      propertyAddress: '',
      city: '',
      state: '',
      zip: '',
      squareFootage: '',
      propertyType: '',
      emails: [],
      phones: [],
      ownerNames: [],
      pageTitle: document.title,
      bodyTextSample: '',
    };

    // Extract property ID from URL
    const propIdMatch = window.location.href.match(/property\/([a-f0-9-]+)/);
    if (propIdMatch) {
      info.propertyId = propIdMatch[1];
    }

    // Extract property address from h1, h2, h3.
    // Groups: 1 = street, 2 = city, 3 = state, 4 = zip.
    const addressPattern = /^(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/;
    for (const selector of ['h1', 'h2', 'h3']) {
      const heading = document.querySelector(selector);
      const addressMatch = heading ? heading.textContent.trim().match(addressPattern) : null;
      if (addressMatch) {
        info.propertyAddress = addressMatch[0];
        info.city = addressMatch[2].trim();
        info.state = addressMatch[3];
        info.zip = addressMatch[4];
        break;
      }
    }

    // Extract property details (SF, type)
    const bodyText = document.body.innerText;

    // Square footage
    const sfMatch = bodyText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
    if (sfMatch) {
      info.squareFootage = sfMatch[0];
    }

    // Property type: first entry found in the page text wins, so the more
    // specific "General Industrial" has to be listed before "Industrial".
    const typePatterns = [
      'Warehouse',
      'Office Building',
      'Retail Stores',
      'General Industrial',
      'Industrial',
      'Medical Building',
      'School',
      'Religious',
      'Supermarket',
      'Financial Building',
    ];
    info.propertyType = typePatterns.find((type) => bodyText.includes(type)) ?? '';

    const addEmail = (email) => {
      if (email && email.length > 5 && !info.emails.includes(email)) {
        info.emails.push(email);
      }
    };

    // Extract emails from mailto: links (dropping any "?subject=..." query)
    for (const anchor of document.querySelectorAll('a[href^="mailto:"]')) {
      addEmail(anchor.href.replace(/^mailto:/i, '').split('?')[0].trim());
    }

    // Also try email patterns in text
    const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
    for (const email of bodyText.match(emailRegex) ?? []) {
      addEmail(email);
    }

    // Phones are de-duplicated on their digits (ignoring a leading US country
    // code) so "+17325551234" and "(732) 555-1234" count as one number. The
    // first spelling encountered is the one that is kept.
    const seenPhones = new Set();
    const addPhone = (phone) => {
      const digits = phone.replace(/\D/g, '');
      const key = digits.length === 11 && digits.startsWith('1') ? digits.slice(1) : digits;
      if (key.length < 10 || seenPhones.has(key)) {
        return;
      }
      seenPhones.add(key);
      info.phones.push(phone);
    };

    // Extract phones from tel: links
    for (const anchor of document.querySelectorAll('a[href^="tel:"]')) {
      addPhone(anchor.href.replace(/^tel:/i, '').trim());
    }

    // Also try phone patterns in text
    const phoneRegex = /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g;
    for (const phone of bodyText.match(phoneRegex) ?? []) {
      addPhone(phone);
    }

    // Extract owner names. Group 1 is the name: one or more capitalised words
    // on a single line, optionally followed by a company suffix. matchAll is
    // used so only the captured name (not the "Owner:" label) is stored.
    const ownerPatterns = [
      /Owner:\s*([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))?)/g,
      /[Oo]wns\s+\d+\s+[Pp]ropert(?:y|ies)\s*([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))?)/g,
    ];
    for (const pattern of ownerPatterns) {
      for (const match of bodyText.matchAll(pattern)) {
        const owner = match[1];
        if (owner.length > 3 && !info.ownerNames.includes(owner)) {
          info.ownerNames.push(owner);
        }
      }
    }

    // Save sample for debugging
    info.bodyTextSample = bodyText.substring(0, 500);

    return info;
  });
}

/**
 * Extract property IDs from search results.
 *
 * Collects every anchor whose href contains "/property/" and returns one
 * entry per distinct property id, in first-seen order. Several anchors can
 * point at the same property, so duplicates are dropped to avoid scraping a
 * property more than once.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @returns {Promise<Array<{id: string, url: string}>>} Unique property ids
 *   with the href of the first link that referenced each one.
 */
async function extractPropertyIds(page) {
  return page.evaluate(() => {
    const seenIds = new Set();
    const results = [];

    for (const link of document.querySelectorAll('a[href*="/property/"]')) {
      const match = link.href.match(/property\/([a-f0-9-]+)/);
      if (!match || seenIds.has(match[1])) {
        continue;
      }
      seenIds.add(match[1]);
      results.push({ id: match[1], url: link.href });
    }

    return results;
  });
}

/**
 * Check if contact details are present (emails or phones).
 *
 * @param {object} page - Puppeteer page to inspect.
 * @returns {Promise<boolean>} True when the page yields at least one e-mail
 *   address or phone number.
 */
async function hasContactDetails(page) {
  const { emails, phones } = await extractOwnerTabData(page);
  return !(emails.length === 0 && phones.length === 0);
}

/**
 * Find the first <button> whose text contains any of the given strings.
 *
 * page.evaluateHandle always resolves to a (truthy) JSHandle, even when the
 * in-page function returned nothing, so the handle is converted with
 * asElement(), which yields null when it does not wrap a DOM element.
 *
 * @param {object} page - Puppeteer page.
 * @param {string[]} needles - Case-sensitive substrings to look for.
 * @returns {Promise<object|null>} Clickable element handle, or null.
 */
async function findButtonByText(page, needles) {
  const handle = await page.evaluateHandle((texts) => {
    const buttons = Array.from(document.querySelectorAll('button'));
    return buttons.find((b) => texts.some((t) => b.textContent.includes(t))) ?? null;
  }, needles);

  const element = handle.asElement();
  if (!element) {
    await handle.dispose();
  }
  return element;
}

/**
 * Turn on the filter toggle whose label mentions the given keyword.
 *
 * @param {object} page - Puppeteer page with the filter panel open.
 * @param {string} keyword - Lower-case word identifying the filter
 *   ("phone" or "email").
 * @returns {Promise<boolean>} True when a checkbox or switch was clicked.
 */
async function enableContactFilter(page, keyword) {
  return page.evaluate((needle) => {
    // Look for a label-like element mentioning the keyword. Candidates are
    // tried from the shortest text to the longest so the tightest match is
    // used before any large container that merely wraps the whole panel
    // (whose first checkbox could belong to an unrelated filter).
    const candidates = Array.from(document.querySelectorAll('label, span, div'))
      .filter((el) => {
        const text = el.textContent.toLowerCase();
        return text.includes(needle) &&
          (text.includes('available') || text.includes('has') || text.includes('filter'));
      })
      .sort((a, b) => a.textContent.length - b.textContent.length);

    for (const candidate of candidates) {
      const checkbox =
        candidate.querySelector('input[type="checkbox"]') ||
        candidate.previousElementSibling?.querySelector('input[type="checkbox"]') ||
        candidate.parentElement?.querySelector('input[type="checkbox"]');

      if (checkbox) {
        if (!checkbox.checked) {
          checkbox.click();
          return true;
        }
        // Already ticked: leave it alone and keep looking.
        continue;
      }

      // No checkbox nearby: fall back to a switch/toggle widget.
      const switchEl =
        candidate.querySelector('[role="switch"]') ||
        candidate.querySelector('.switch') ||
        candidate.querySelector('.toggle');
      if (switchEl) {
        switchEl.click();
        return true;
      }
    }
    return false;
  }, keyword);
}

/**
 * Apply phone and email filters in advanced search > owner.
 *
 * Opens the advanced search panel, selects its Owner tab when one is found,
 * enables the phone and e-mail filters and presses the Apply/Search/Done
 * button. Every step is best-effort: a missing control is logged and skipped.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @returns {Promise<boolean>} True when at least one of the two filters was
 *   enabled; false otherwise (including when the panel could not be opened).
 */
async function applyContactFilters(page) {
  log('📍 Step 3b: Applying phone and email filters...');

  // Click on advanced search button
  log(' 🔘 Clicking advanced search...');

  // Candidate selectors for the advanced search button. Only valid CSS is
  // listed; text matching is handled by the fallback below.
  const advancedSearchSelectors = [
    'button[title*="Advanced"]',
    'div[class*="advanced"] button',
    'button[class*="filter"]',
    'button[aria-label*="filter"]',
    'button[aria-label*="Filter"]',
  ];

  let advancedButton = null;
  for (const selector of advancedSearchSelectors) {
    try {
      advancedButton = await page.waitForSelector(selector, { timeout: 3000, visible: true });
      if (advancedButton) break;
    } catch (e) {
      // Expected when this selector does not appear in time; try the next one.
    }
  }

  // If no button found, try locating it by text content
  if (!advancedButton) {
    log(' 🔍 Looking for "Advanced" button by text...');
    advancedButton = await findButtonByText(page, ['Advanced', 'advanced']);
  }

  if (!advancedButton) {
    log(' ⚠️ Could not find advanced search button, continuing without filters');
    return false;
  }

  await advancedButton.click();
  await sleep(2000);
  log(' ✅ Advanced search opened');

  // Navigate to Owner tab in filters
  log(' 📋 Navigating to Owner section...');

  const ownerTabClicked = await page.evaluate(() => {
    const tabs = Array.from(document.querySelectorAll('button, div[role="tab"], a[role="tab"]'));
    // The length cap keeps large containers that merely contain the word
    // "Owner" from being treated as the tab.
    const ownerTab = tabs.find((t) => t.textContent.includes('Owner') && t.textContent.length < 20);
    if (ownerTab) {
      ownerTab.click();
      return true;
    }
    return false;
  });

  if (ownerTabClicked) {
    await sleep(1000);
    log(' ✅ Owner tab selected');
  }

  // Find and enable phone filter
  log(' 📞 Enabling phone filter...');
  const phoneFilterEnabled = await enableContactFilter(page, 'phone');
  if (phoneFilterEnabled) {
    log(' ✅ Phone filter enabled');
  } else {
    log(' ⚠️ Could not enable phone filter');
  }

  // Find and enable email filter
  log(' 📧 Enabling email filter...');
  const emailFilterEnabled = await enableContactFilter(page, 'email');
  if (emailFilterEnabled) {
    log(' ✅ Email filter enabled');
  } else {
    log(' ⚠️ Could not enable email filter');
  }

  // Apply filters
  log(' ✅ Applying filters...');

  const applyButton = await findButtonByText(page, ['Apply', 'Search', 'Done']);
  if (applyButton) {
    await applyButton.click();
    await sleep(3000);
    log(' ✅ Filters applied');
  } else {
    log(' ⚠️ Could not find apply button');
  }

  return phoneFilterEnabled || emailFilterEnabled;
}

/**
 * Wait for contact details (up to 30 seconds by default).
 *
 * Polls the page roughly once per second until at least one e-mail address or
 * phone number can be extracted, or until the timeout elapses. The page is
 * scraped once per poll and that same snapshot is used for both the check and
 * the logged counts.
 *
 * @param {object} page - Puppeteer page showing a property.
 * @param {number} [timeoutMs=30000] - Maximum time to keep polling, in ms.
 * @returns {Promise<boolean>} True when contact details appeared in time.
 */
async function waitForContactDetails(page, timeoutMs = 30000) {
  const pollIntervalMs = 1000;
  const startTime = Date.now();
  log(` ⏳ Waiting for contact details (up to ${timeoutMs/1000}s)...`);

  while (Date.now() - startTime < timeoutMs) {
    const { emails, phones } = await extractOwnerTabData(page);

    if (emails.length > 0 || phones.length > 0) {
      log(` ✅ Contact details found! (${emails.length} emails, ${phones.length} phones)`);
      return true;
    }

    // Never sleep past the deadline.
    const remainingMs = timeoutMs - (Date.now() - startTime);
    if (remainingMs <= 0) {
      break;
    }
    await sleep(Math.min(pollIntervalMs, remainingMs));
  }

  log(' ⚠️ No contact details found after timeout');
  return false;
}

/**
 * Main scraper.
 *
 * Logs into Reonomy, searches for SEARCH_LOCATION, applies the contact
 * filters, then visits up to MAX_PROPERTIES results, waits for contact
 * details on each and collects one lead per property. Leads are written to
 * OUTPUT_FILE as JSON when at least one was collected.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   collected and the path of the output file.
 * @throws {Error} When login fails, the search id or property links cannot be
 *   found, or any browser operation fails. An error screenshot is attempted
 *   before the error is rethrown; the browser is always closed.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v10 (FILTERS + EXTENDED WAIT)...\n');

  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080'],
  });

  // Created inside the try block so a failure here still reaches the
  // finally clause that closes the browser.
  let page = null;
  const leads = [];

  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });

    await sleep(2000);

    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');

    log('⏳ Waiting for login...');
    await sleep(10000);

    // Check if logged in
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await sleep(3000);

    // Perform initial search
    log(`📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);

    let searchInput;
    try {
      searchInput = await page.waitForSelector(
        'input[placeholder*="address"], input[placeholder*="Search"]',
        { timeout: 10000 },
      );
    } catch (primaryError) {
      // Fall back to any text input when the placeholder-based lookup times out.
      searchInput = await page.waitForSelector('input[type="text"]', { timeout: 5000 });
    }

    if (searchInput) {
      // Triple click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      await sleep(5000);
    }

    // Apply phone and email filters. applyContactFilters only reports whether
    // at least one of the two filters was enabled, so that single flag is
    // recorded for both instead of unconditionally claiming "true".
    const contactFiltersApplied = Boolean(await applyContactFilters(page));
    const filters = { phone: contactFiltersApplied, email: contactFiltersApplied };

    // Extract search ID from URL
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs
    log('\n📍 Step 4: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);

    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found.');
      throw new Error('No properties found on search page.');
    }

    // Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);

    log(`\n📍 Step 5: Processing ${propertiesToScrape.length} properties...`);

    for (const [i, prop] of propertiesToScrape.entries()) {
      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      // Click on property button (navigate to it)
      log(` 🔗 Clicking property...`);

      // page.evaluate (not evaluateHandle) so the boolean outcome is returned
      // by value. A click that navigates can tear down the execution context
      // and reject; that case is treated as "not clicked".
      let clicked = false;
      try {
        clicked = await page.evaluate((propertyId) => {
          const target = Array.from(document.querySelectorAll('button')).find((button) => {
            const link = button.querySelector('a[href*="/property/"]');
            return link !== null && link.href.includes(propertyId);
          });
          if (!target) {
            return false;
          }
          target.scrollIntoView({ behavior: 'smooth', block: 'center' });
          target.click();
          return true;
        }, prop.id);
      } catch (clickError) {
        clicked = false;
      }

      // Navigate directly when the click did not happen or did not land on
      // this property's page.
      if (!clicked || !page.url().includes(prop.id)) {
        log(` ⚠️ Could not click property, trying to navigate directly...`);
        await page.goto(prop.url, {
          waitUntil: 'networkidle2',
          timeout: 30000,
        });
      }

      // Initial wait for property page to load
      log(` ⏳ Waiting for Owner tab to load...`);
      await sleep(3000);

      // Extended wait for contact details (up to 30 seconds)
      await waitForContactDetails(page, 30000);

      // Extract data from Owner tab
      log(` 📊 Extracting data from Owner tab...`);
      const propertyData = await extractOwnerTabData(page);

      log(` 📧 Emails: ${propertyData.emails.length} found`);
      log(` 📞 Phones: ${propertyData.phones.length} found`);
      log(` 👤 Owners: ${propertyData.ownerNames.length} found`);
      log(` 🏢 Address: ${propertyData.propertyAddress || 'N/A'}`);

      leads.push({
        scrapeDate: new Date().toISOString().split('T')[0],
        propertyId: propertyData.propertyId,
        propertyUrl: propertyData.pageTitle?.includes('property')
          ? `https://app.reonomy.com/#!/property/${propertyData.propertyId}`
          : page.url(),
        address: propertyData.propertyAddress || '',
        city: propertyData.city || '',
        state: propertyData.state || '',
        zip: propertyData.zip || '',
        squareFootage: propertyData.squareFootage || '',
        propertyType: propertyData.propertyType || '',
        ownerNames: propertyData.ownerNames.join('; ') || '',
        emails: propertyData.emails,
        phones: propertyData.phones,
        searchLocation: SEARCH_LOCATION,
        searchId,
        filtersApplied: { ...filters },
      });

      // Go back to search results for next property
      log(` 🔙 Going back to search results...`);
      await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
        waitUntil: 'networkidle2',
        timeout: 30000,
      });

      await sleep(3000);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);

      const outputData = {
        scrapeDate: new Date().toISOString(),
        location: SEARCH_LOCATION,
        searchId,
        leadCount: leads.length,
        filters,
        leads,
      };

      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠️ No leads scraped.');
    }

    log('\n✅ Scraping complete!');

    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Best-effort screenshot for debugging; a failure here must not mask the
    // original error, but it is logged rather than silently dropped.
    if (page) {
      const screenshotPath = path.join(os.tmpdir(), 'reonomy-v10-error.png');
      try {
        await page.screenshot({ path: screenshotPath, fullPage: true });
        log(`📸 Error screenshot saved: ${screenshotPath}`);
      } catch (screenshotError) {
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }

    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}

// Run: start the scraper, report the outcome, and exit with status 0 on
// success or 1 on failure.
(async () => {
  try {
    const result = await scrapeLeads();
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
})();