clawdbot-workspace/reonomy-scraper-v10-filters.js

598 lines
18 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v10 - OWNER TAB EXTRACTION WITH FILTERS
*
* Key improvements:
* - Filters for phone and email in advanced search > owner section
* - Extended wait (up to 30s) for contact details to load
* - Waits until emails or phones are found before proceeding
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// Each setting reads an environment variable and falls back to the literal on
// the right when the variable is unset or empty (`||`).
// NOTE(review): the e-mail/password fallbacks look like real account
// credentials committed to source — consider removing them and failing fast
// when the environment variables are missing.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// Headless mode is enabled only when HEADLESS is exactly the string 'true'.
const HEADLESS = process.env.HEADLESS === 'true';
// Upper bound on how many search results are visited per run.
const MAX_PROPERTIES = 20;
// Output files
// Both files are written next to this script (__dirname).
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v10-filters.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v10.log');
/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * @param {string} message - Text to log; the console copy is unprefixed,
 *   the file copy is prefixed with an ISO-8601 timestamp.
 */
function log(message) {
  const stamp = new Date().toISOString();
  const fileLine = `[${stamp}] ${message}\n`;
  console.log(message);
  // Synchronous append so log lines land on disk in call order.
  fs.appendFileSync(LOG_FILE, fileLine);
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Extract ALL data from Owner tab
 *
 * Runs a single function inside the page (via `page.evaluate`) that scrapes
 * the current document for property and owner contact information.
 *
 * @param {object} page - Puppeteer page (anything exposing `evaluate(fn)`).
 * @returns {Promise<object>} Plain object with `propertyId`, `propertyAddress`,
 *   `city`, `state`, `zip`, `squareFootage`, `propertyType`, `emails`,
 *   `phones`, `ownerNames`, `pageTitle` and `bodyTextSample` (first 500
 *   characters of the page text, kept for debugging).
 */
async function extractOwnerTabData(page) {
  // Everything below executes in the browser context, so it must be
  // self-contained: no references to Node-side variables or helpers.
  return page.evaluate(() => {
    const info = {
      propertyId: '',
      propertyAddress: '',
      city: '',
      state: '',
      zip: '',
      squareFootage: '',
      propertyType: '',
      emails: [],
      phones: [],
      ownerNames: [],
      pageTitle: document.title,
      bodyTextSample: '',
    };

    // Extract property ID from URL
    const propIdMatch = window.location.href.match(/property\/([a-f0-9-]+)/);
    if (propIdMatch) {
      info.propertyId = propIdMatch[1];
    }

    // Extract property address from the first h1/h2/h3 that looks like
    // "<number street>, <city>, <ST> <zip>".
    for (const sel of ['h1', 'h2', 'h3']) {
      const heading = document.querySelector(sel);
      if (!heading) continue;
      const text = heading.textContent.trim();
      const addressMatch = text.match(/^(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
      if (addressMatch) {
        // Group 1 is the street; city/state/zip are groups 2-4.
        info.propertyAddress = addressMatch[0];
        info.city = addressMatch[2].trim();
        info.state = addressMatch[3].trim();
        info.zip = addressMatch[4].trim();
        break;
      }
    }

    // Extract property details (SF, type) from the visible page text.
    const bodyText = document.body.innerText;

    // Square footage, e.g. "12.5k SF" or "4000 SF"
    const sfMatch = bodyText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
    if (sfMatch) {
      info.squareFootage = sfMatch[0];
    }

    // Property type: first entry of this list that appears anywhere in the text.
    const typePatterns = ['Warehouse', 'Office Building', 'Retail Stores', 'Industrial', 'General Industrial', 'Medical Building', 'School', 'Religious', 'Supermarket', 'Financial Building'];
    for (const type of typePatterns) {
      if (bodyText.includes(type)) {
        info.propertyType = type;
        break;
      }
    }

    // Extract emails from mailto: links (dropping any "?subject=..." query).
    document.querySelectorAll('a[href^="mailto:"]').forEach((a) => {
      const email = a.href.replace(/^mailto:/i, '').split('?')[0].trim();
      if (email && email.length > 5 && !info.emails.includes(email)) {
        info.emails.push(email);
      }
    });

    // Also try email patterns in text
    const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
    for (const email of bodyText.match(emailRegex) ?? []) {
      if (!info.emails.includes(email)) {
        info.emails.push(email);
      }
    }

    // Phones are de-duplicated on their digits (ignoring a leading US "1") so
    // that "tel:+17325551234" and "(732) 555-1234" count as the same number.
    const seenPhoneKeys = new Set();
    const addPhone = (raw) => {
      const digits = raw.replace(/\D/g, '');
      const key = digits.length === 11 && digits.startsWith('1') ? digits.slice(1) : digits;
      if (seenPhoneKeys.has(key)) return;
      seenPhoneKeys.add(key);
      info.phones.push(raw);
    };

    // Extract phones from tel: links
    document.querySelectorAll('a[href^="tel:"]').forEach((a) => {
      const phone = a.href.replace(/^tel:/i, '');
      if (phone && phone.length >= 10) {
        addPhone(phone);
      }
    });

    // Also try phone patterns in text
    const phoneRegex = /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g;
    for (const phone of bodyText.match(phoneRegex) ?? []) {
      addPhone(phone);
    }

    // Extract owner names. A name is one or more Capitalized words on a single
    // line, optionally followed by a company suffix. Words are joined with
    // horizontal whitespace only so the match cannot run into the next line.
    const ownerName = String.raw`([A-Z][a-z]+(?:[^\S\r\n]+[A-Z][a-z]+)*(?:[^\S\r\n]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))?)`;
    const ownerPatterns = [
      new RegExp(String.raw`Owner:\s*${ownerName}`, 'g'),
      new RegExp(String.raw`Owns\s+\d+\s+propert(?:y|ies)\s*${ownerName}`, 'g'),
    ];
    for (const pattern of ownerPatterns) {
      for (const match of bodyText.matchAll(pattern)) {
        // Keep only the captured name, not the "Owner:" / "Owns N properties" label.
        const owner = match[1];
        if (owner && owner.length > 3 && !info.ownerNames.includes(owner)) {
          info.ownerNames.push(owner);
        }
      }
    }

    // Save sample for debugging
    info.bodyTextSample = bodyText.substring(0, 500);
    return info;
  });
}
/**
 * Extract property IDs from search results
 *
 * Collects every link whose href contains "/property/" and returns one entry
 * per distinct property id, in document order. Result cards commonly carry
 * several links to the same property; without de-duplication those repeats
 * would be scraped again and eat into the caller's per-run limit.
 *
 * @param {object} page - Puppeteer page (anything exposing `evaluate(fn)`).
 * @returns {Promise<Array<{id: string, url: string}>>} Unique ids with the
 *   href of the first link that referenced each one.
 */
async function extractPropertyIds(page) {
  return page.evaluate(() => {
    const seenIds = new Set();
    const ids = [];
    for (const link of document.querySelectorAll('a[href*="/property/"]')) {
      const href = link.href;
      const match = href.match(/property\/([a-f0-9-]+)/);
      if (!match || seenIds.has(match[1])) continue;
      seenIds.add(match[1]);
      ids.push({ id: match[1], url: href });
    }
    return ids;
  });
}
/**
 * Report whether the current page exposes any owner contact info.
 *
 * @param {object} page - Puppeteer page handed to extractOwnerTabData.
 * @returns {Promise<boolean>} True when at least one email or phone was extracted.
 */
async function hasContactDetails(page) {
  const { emails, phones } = await extractOwnerTabData(page);
  return emails.length > 0 || phones.length > 0;
}
/**
 * Find the first <button> whose text contains any of the given labels.
 *
 * `page.evaluateHandle` always resolves to a (truthy) JSHandle, even when the
 * in-page function returns nothing, so the handle is narrowed with
 * `asElement()` to get a real "not found" signal.
 *
 * @param {object} page - Puppeteer page.
 * @param {string[]} labels - Case-sensitive substrings to look for.
 * @returns {Promise<object|null>} ElementHandle of the button, or null.
 */
async function findButtonByText(page, labels) {
  const handle = await page.evaluateHandle((wanted) => {
    const buttons = Array.from(document.querySelectorAll('button'));
    return buttons.find((b) => wanted.some((label) => b.textContent.includes(label))) ?? null;
  }, labels);
  const element = handle.asElement();
  if (!element) {
    await handle.dispose();
  }
  return element;
}

/**
 * Try to switch on the filter whose label mentions `keyword`.
 *
 * Scans label/span/div elements whose lower-cased text contains the keyword
 * plus one of "available", "has" or "filter", then clicks the first unchecked
 * checkbox found in or next to such an element, or failing a checkbox, a
 * switch/toggle inside it.
 * NOTE(review): a switch that is already on would be clicked again here;
 * confirm against the real filter panel markup.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} keyword - Lower-case word to match, e.g. 'phone' or 'email'.
 * @returns {Promise<boolean>} True when something was clicked.
 */
async function enableContactFilter(page, keyword) {
  return page.evaluate((word) => {
    const candidates = Array.from(document.querySelectorAll('label, span, div')).filter((el) => {
      const text = el.textContent.toLowerCase();
      return text.includes(word) && (text.includes('available') || text.includes('has') || text.includes('filter'));
    });
    for (const label of candidates) {
      const checkbox = label.querySelector('input[type="checkbox"]') ||
        label.previousElementSibling?.querySelector('input[type="checkbox"]') ||
        label.parentElement?.querySelector('input[type="checkbox"]');
      if (checkbox && !checkbox.checked) {
        checkbox.click();
        return true;
      }
      // No checkbox at all: fall back to a switch/toggle inside the element.
      if (!checkbox) {
        const switchEl = label.querySelector('[role="switch"]') ||
          label.querySelector('.switch') ||
          label.querySelector('.toggle');
        if (switchEl) {
          switchEl.click();
          return true;
        }
      }
    }
    return false;
  }, keyword);
}

/**
 * Apply phone and email filters in advanced search > owner
 *
 * Best-effort: opens the advanced search panel, selects the Owner section,
 * tries to enable the phone and email filters and presses an apply button.
 * Missing UI elements are logged rather than thrown.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @returns {Promise<boolean>} True when at least one of the two filters was
 *   toggled; false when neither was, or the advanced search button was not found.
 */
async function applyContactFilters(page) {
  log('📍 Step 3b: Applying phone and email filters...');
  // Click on advanced search button
  log(' 🔘 Clicking advanced search...');
  // Try multiple selectors for advanced search button. (":contains()" is not
  // valid CSS, so text matching is done separately below.)
  const advancedSearchSelectors = [
    'button[title*="Advanced"]',
    'div[class*="advanced"] button',
    'button[class*="filter"]',
    'button[aria-label*="filter"]',
    'button[aria-label*="Filter"]',
  ];
  let advancedButton = null;
  for (const selector of advancedSearchSelectors) {
    try {
      advancedButton = await page.waitForSelector(selector, { timeout: 3000, visible: true });
      if (advancedButton) break;
    } catch (err) {
      // A timeout just means this selector is absent; anything else is worth noting.
      if (err.name !== 'TimeoutError') {
        log(` ⚠️ Selector "${selector}" failed: ${err.message}`);
      }
    }
  }
  // If no button found, try clicking by text content
  if (!advancedButton) {
    log(' 🔍 Looking for "Advanced" button by text...');
    advancedButton = await findButtonByText(page, ['Advanced', 'advanced']);
  }
  if (!advancedButton) {
    log(' ⚠️ Could not find advanced search button, continuing without filters');
    return false;
  }
  await advancedButton.click();
  await sleep(2000);
  log(' ✅ Advanced search opened');

  // Navigate to Owner tab in filters
  log(' 📋 Navigating to Owner section...');
  // Short text only, so that large containers merely mentioning "Owner" are skipped.
  const ownerTabClicked = await page.evaluate(() => {
    const tabs = Array.from(document.querySelectorAll('button, div[role="tab"], a[role="tab"]'));
    const ownerTab = tabs.find((t) => t.textContent.includes('Owner') && t.textContent.length < 20);
    if (ownerTab) {
      ownerTab.click();
      return true;
    }
    return false;
  });
  if (ownerTabClicked) {
    await sleep(1000);
    log(' ✅ Owner tab selected');
  }

  // Find and enable phone filter
  log(' 📞 Enabling phone filter...');
  const phoneFilterEnabled = await enableContactFilter(page, 'phone');
  if (phoneFilterEnabled) {
    log(' ✅ Phone filter enabled');
  } else {
    log(' ⚠️ Could not enable phone filter');
  }

  // Find and enable email filter
  log(' 📧 Enabling email filter...');
  const emailFilterEnabled = await enableContactFilter(page, 'email');
  if (emailFilterEnabled) {
    log(' ✅ Email filter enabled');
  } else {
    log(' ⚠️ Could not enable email filter');
  }

  // Apply filters
  log(' ✅ Applying filters...');
  // Look for apply/search button
  const applyButton = await findButtonByText(page, ['Apply', 'Search', 'Done']);
  if (applyButton) {
    await applyButton.click();
    await sleep(3000);
    log(' ✅ Filters applied');
  } else {
    log(' ⚠️ Could not find an apply button; filters may not have taken effect');
  }
  return phoneFilterEnabled || emailFilterEnabled;
}
/**
 * Wait for contact details (up to 30 seconds)
 *
 * Polls the page about once per second until at least one email or phone can
 * be extracted, or the timeout elapses. Each poll extracts the data once, so
 * the counts that are logged come from the same snapshot that passed the check.
 *
 * @param {object} page - Puppeteer page showing a property.
 * @param {number} [timeoutMs=30000] - Maximum time to keep polling, in ms.
 * @returns {Promise<boolean>} True when contact details appeared in time.
 */
async function waitForContactDetails(page, timeoutMs = 30000) {
  const startTime = Date.now();
  log(` ⏳ Waiting for contact details (up to ${timeoutMs/1000}s)...`);
  while (Date.now() - startTime < timeoutMs) {
    const { emails, phones } = await extractOwnerTabData(page);
    if (emails.length > 0 || phones.length > 0) {
      log(` ✅ Contact details found! (${emails.length} emails, ${phones.length} phones)`);
      return true;
    }
    await sleep(1000);
  }
  log(' ⚠️ No contact details found after timeout');
  return false;
}
/**
 * Write the collected leads and run metadata to OUTPUT_FILE as pretty-printed JSON.
 *
 * @param {object[]} leads - Lead records gathered so far.
 * @param {string} searchId - Search id taken from the results URL.
 * @param {{phone: boolean, email: boolean}} filters - Filter state to record.
 */
function writeLeadsFile(leads, searchId, filters) {
  const outputData = {
    scrapeDate: new Date().toISOString(),
    location: SEARCH_LOCATION,
    searchId: searchId,
    leadCount: leads.length,
    filters: filters,
    leads: leads,
  };
  fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
}

/**
 * Main scraper
 *
 * Logs into Reonomy, searches for SEARCH_LOCATION, tries to enable the owner
 * phone/email filters, then visits up to MAX_PROPERTIES results and records
 * one lead per property. Leads are written to OUTPUT_FILE. On failure, any
 * leads gathered so far are still saved, a screenshot is attempted and the
 * error is rethrown. The browser is always closed.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When login fails, the search id cannot be read from the URL,
 *   no properties are listed, or a Puppeteer call fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v10 (FILTERS + EXTENDED WAIT)...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080'],
  });
  const leads = [];
  // Declared outside the try so the catch block can save partial results and
  // take a screenshot, and so the browser is closed even if newPage() throws.
  let page = null;
  let searchId = '';
  let filters = { phone: false, email: false };
  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });
    await sleep(2000);
    // The form is rendered client-side; wait for it instead of assuming the
    // fixed delay above was long enough.
    await page.waitForSelector('input[type="email"]', { visible: true, timeout: 30000 });
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await sleep(10000);
    // Check if logged in
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    await sleep(3000);

    // Perform initial search
    log(`📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);
    const searchInput = await page.waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
      timeout: 10000,
    }).catch(() => {
      // Fall back to any text input when the placeholder-based lookup times out.
      return page.waitForSelector('input[type="text"]', { timeout: 5000 });
    });
    if (searchInput) {
      // Triple-click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      await sleep(5000);
    }

    // Apply phone and email filters. The helper reports a single boolean
    // (false means neither filter was toggled), so that is what gets recorded
    // instead of unconditionally claiming both were applied.
    const filtersOn = Boolean(await applyContactFilters(page));
    filters = { phone: filtersOn, email: filtersOn };

    // Extract search ID from URL
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs
    log('\n📍 Step 4: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found.');
      throw new Error('No properties found on search page.');
    }

    // Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 5: Processing ${propertiesToScrape.length} properties...`);
    for (let i = 0; i < propertiesToScrape.length; i++) {
      const prop = propertiesToScrape[i];
      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      // Click on property button (navigate to it). page.evaluate returns the
      // in-page function's value, so `clicked` is a real boolean here.
      log(` 🔗 Clicking property...`);
      const clicked = await page.evaluate((propId) => {
        const buttons = Array.from(document.querySelectorAll('button'));
        const target = buttons.find((b) => {
          const link = b.querySelector('a[href*="/property/"]');
          return link && link.href.includes(propId);
        });
        if (!target) return false;
        target.scrollIntoView({ behavior: 'smooth', block: 'center' });
        target.click();
        return true;
      }, prop.id).catch(() => false);
      if (clicked) {
        // Give the client-side router a moment to react to the click.
        await sleep(1000);
      }
      // Navigate directly when nothing was clicked or the click did not
      // actually land on this property's page.
      if (!clicked || !page.url().includes(prop.id)) {
        log(` ⚠️ Could not click property, trying to navigate directly...`);
        await page.goto(prop.url, {
          waitUntil: 'networkidle2',
          timeout: 30000,
        });
      }

      // Initial wait for property page to load
      log(` ⏳ Waiting for Owner tab to load...`);
      await sleep(3000);
      // Extended wait for contact details (up to 30 seconds)
      await waitForContactDetails(page, 30000);

      // Extract data from Owner tab
      log(` 📊 Extracting data from Owner tab...`);
      const propertyData = await extractOwnerTabData(page);
      log(` 📧 Emails: ${propertyData.emails.length} found`);
      log(` 📞 Phones: ${propertyData.phones.length} found`);
      log(` 👤 Owners: ${propertyData.ownerNames.length} found`);
      log(` 🏢 Address: ${propertyData.propertyAddress || 'N/A'}`);
      const lead = {
        scrapeDate: new Date().toISOString().split('T')[0],
        propertyId: propertyData.propertyId,
        propertyUrl: propertyData.pageTitle?.includes('property') ? `https://app.reonomy.com/#!/property/${propertyData.propertyId}` : page.url(),
        address: propertyData.propertyAddress || '',
        city: propertyData.city || '',
        state: propertyData.state || '',
        zip: propertyData.zip || '',
        squareFootage: propertyData.squareFootage || '',
        propertyType: propertyData.propertyType || '',
        ownerNames: propertyData.ownerNames.join('; ') || '',
        emails: propertyData.emails,
        phones: propertyData.phones,
        searchLocation: SEARCH_LOCATION,
        searchId: searchId,
        filtersApplied: { ...filters },
      };
      leads.push(lead);

      // Go back to search results for next property
      log(` 🔙 Going back to search results...`);
      await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
        waitUntil: 'networkidle2',
        timeout: 30000,
      });
      await sleep(3000);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      writeLeadsFile(leads, searchId, filters);
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // Keep whatever was collected before the failure instead of discarding it.
    if (leads.length > 0) {
      try {
        writeLeadsFile(leads, searchId, filters);
        log(`💾 Saved ${leads.length} partial lead(s) to: ${OUTPUT_FILE}`);
      } catch (saveError) {
        log(`⚠️ Could not save partial leads: ${saveError.message}`);
      }
    }
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v10-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v10-error.png');
      } catch (screenshotError) {
        // The screenshot is diagnostic only; never let it mask the real error.
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Run
// Entry point: start the scrape and map its outcome to the process exit code
// (0 on success, 1 on any failure).
(async () => {
  try {
    const result = await scrapeLeads();
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
})();