#!/usr/bin/env node

/**
 * Reonomy Scraper v8 - FULL EXTRACTION WITH CLICK-THROUGH
 *
 * Workflow:
 * 1. Login
 * 2. Search for location
 * 3. Apply advanced filters (Has Phone + Has Email)
 * 4. Extract property IDs
 * 5. For each property:
 *    - Click on property button
 *    - Wait for property page to fully load
 *    - Look for contact info tabs/sections
 *    - Click "View Contact" or "Ownership" if needed
 *    - Extract ALL data (emails, phones, owners, addresses, property details)
 *    - Go back to search results
 *    - Continue to next property
 */

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

// Configuration. The e-mail, password and search location can be overridden
// through the environment variables named on each line.
//
// SECURITY(review): the fallback e-mail/password below are credentials embedded
// in source. They are kept unchanged so existing invocations keep working, but
// they should be rotated and supplied only via REONOMY_EMAIL / REONOMY_PASSWORD.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// True only when HEADLESS is exactly the string "true"; any other value
// (including unset) yields false.
const HEADLESS = process.env.HEADLESS === 'true';
// Upper bound on how many properties are visited in one run.
const MAX_PROPERTIES = 20;

// Longer waits for full content loading
const AFTER_CLICK_WAIT_MS = 5000;
const AFTER_TAB_SWITCH_WAIT_MS = 3000;
const BACK_NAVIGATION_WAIT_MS = 3000;

// Output files (both are written next to this script)
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v8-full.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v8.log');

/**
 * Print a message to the console and append it, prefixed with an ISO-8601
 * timestamp, to LOG_FILE.
 *
 * @param {string} message - Text to record.
 */
function log(message) {
  const timestamp = new Date().toISOString();
  const logMessage = `[${timestamp}] ${message}\n`;
  console.log(message);
  // Synchronous append: the line is written before log() returns.
  fs.appendFileSync(LOG_FILE, logMessage);
}

/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Wait until `finder` (executed inside the page) returns an element and
 * resolve to its handle; resolve to null when nothing shows up in time.
 *
 * @param {object} page - Puppeteer page.
 * @param {Function} finder - Self-contained function evaluated in the page;
 *   must return an element or a falsy value.
 * @param {number} timeout - Maximum wait in milliseconds.
 * @param {...*} args - Serializable arguments forwarded to `finder`.
 * @returns {Promise<object|null>} Element handle or null.
 */
async function waitForFilterElement(page, finder, timeout, ...args) {
  // A timeout rejection simply means "not present", which callers treat as optional.
  const handle = await page.waitForFunction(finder, { timeout }, ...args).catch(() => null);
  return handle ? handle.asElement() : null;
}

/**
 * Locate one checkbox-style filter and make sure it is enabled.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} label - Visible filter label, e.g. "Has Phone".
 * @param {string} keyword - Lower-case keyword used for looser matching, e.g. "phone".
 * @returns {Promise<boolean>} True when the filter control was found.
 */
async function enableCheckboxFilter(page, label, keyword) {
  // Lookup order: a <label> containing the filter text, a checkbox inside a
  // <label> mentioning the keyword, a checkbox whose data-test/id mentions the
  // keyword, then a checkbox inside a .filter-item containing the filter text.
  const target = await waitForFilterElement(
    page,
    (labelText, key) => {
      const contains = (el, needle) =>
        (el.textContent || '').toLowerCase().includes(needle.toLowerCase());
      const checkboxIn = (el) => el.querySelector('input[type="checkbox"]');

      const labels = Array.from(document.querySelectorAll('label'));
      const exactLabel = labels.find((el) => contains(el, labelText));
      if (exactLabel) return exactLabel;

      const looseBox = labels
        .filter((el) => contains(el, key))
        .map(checkboxIn)
        .find(Boolean);
      if (looseBox) return looseBox;

      const attributeBox = document.querySelector(
        `input[type="checkbox"][data-test*="${key}"], input[type="checkbox"][id*="${key}"]`,
      );
      if (attributeBox) return attributeBox;

      const itemBox = Array.from(document.querySelectorAll('.filter-item'))
        .filter((el) => contains(el, labelText))
        .map(checkboxIn)
        .find(Boolean);
      return itemBox || false;
    },
    3000,
    label,
    keyword,
  );

  if (!target) {
    log(` ⚠️ "${label}" filter not found, skipping`);
    return false;
  }

  // Read the state from the real checkbox even when the click target is its
  // <label>. If the state cannot be read, fall back to "unchecked".
  const alreadyChecked = await target
    .evaluate((el) => {
      const box = el.matches('input[type="checkbox"]')
        ? el
        : el.control || el.querySelector('input[type="checkbox"]');
      return Boolean(box && box.checked);
    })
    .catch(() => false);

  if (alreadyChecked) {
    log(` ✅ "${label}" filter already enabled`);
    return true;
  }

  log(` ☑️ Checking "${label}" filter...`);
  await target.click();
  await sleep(500);
  return true;
}

/**
 * Apply advanced filters: open the "More Filters" panel when it exists and
 * enable the "Has Phone" and "Has Email" filters.
 *
 * Best effort: any error is logged and the function still resolves, so the
 * search can proceed unfiltered.
 *
 * @param {object} page - Puppeteer page showing the search view.
 * @returns {Promise<void>}
 */
async function applyAdvancedFilters(page) {
  log('🔍 Step 2.1: Applying advanced filters (Has Phone + Has Email)...');

  try {
    // Prefer a button labelled "More Filters", then one whose aria-label
    // mentions "Filters", then any button whose text mentions "Filters".
    const moreFiltersBtn = await waitForFilterElement(
      page,
      () => {
        const buttons = Array.from(document.querySelectorAll('button'));
        const byText = (needle) =>
          buttons.find((b) => (b.textContent || '').toLowerCase().includes(needle));
        return (
          byText('more filters') ||
          document.querySelector('button[aria-label*="Filters"]') ||
          byText('filters') ||
          false
        );
      },
      15000,
    );

    if (moreFiltersBtn) {
      log(' 📋 Clicking "More Filters"...');
      await moreFiltersBtn.click();
      await sleep(2000);
    }

    await enableCheckboxFilter(page, 'Has Phone', 'phone');
    await sleep(1000);
    await enableCheckboxFilter(page, 'Has Email', 'email');

    log('✅ Filters applied');
  } catch (error) {
    log(` ⚠️ Filter application had issues: ${error.message}`);
  }
}

/**
 * Extract ALL available data from the property page currently open in `page`.
 *
 * Reads headings, mailto:/tel: links and the visible page text, then applies
 * text heuristics (regular expressions) to recover address parts, property
 * type, size, owners, e-mail addresses and phone numbers.
 *
 * @param {object} page - Puppeteer page already showing a property.
 * @param {string} [propertyUrl] - Currently unused; kept so existing callers
 *   that pass it keep working.
 * @returns {Promise<object>} Plain object with: propertyId, address, city,
 *   state, zip, propertyType, squareFootage, ownerName, ownerLocation,
 *   propertyCount, emails[], phones[] (digits only), owners[], contacts[],
 *   hasContactButton, contactTabText, pageTitle, url, contactSectionSample.
 */
async function extractFullPropertyData(page, propertyUrl) {
  log(' 🔎 Extracting full property data...');

  // The callback is serialized and executed inside the browser page, so it
  // must be self-contained: it cannot reference anything from this module.
  const data = await page.evaluate(() => {
    const bodyText = (document.body && document.body.innerText) || '';

    const result = {
      propertyId: '',
      address: '',
      city: '',
      state: '',
      zip: '',
      propertyType: '',
      squareFootage: '',
      ownerName: '',
      ownerLocation: '',
      propertyCount: '',
      emails: [],
      phones: [],
      owners: [],
      contacts: [],
      hasContactButton: false,
      contactTabText: '',
      pageTitle: document.title,
      url: window.location.href,
    };

    // Property ID from the URL
    const propIdMatch = window.location.href.match(/property\/([a-f0-9-]+)/);
    if (propIdMatch) {
      result.propertyId = propIdMatch[1];
    }

    // Address: first heading on the page, else the first heading that
    // contains a street-type word.
    const streetWords = ['Highway', 'Avenue', 'Street', 'Rd', 'Dr', 'Way', 'Ln', 'Blvd', 'Rte'];
    const firstHeading = document.querySelector('h1, h2, h3, h4, h5, h6')?.textContent?.trim();
    const streetHeading = Array.from(document.querySelectorAll('[role="heading"], h1, h2, h3'))
      .map((h) => h.textContent?.trim())
      .find((t) => t && streetWords.some((word) => t.includes(word)));
    result.address =
      [firstHeading, streetHeading].find((t) => t && t.length > 10 && t.length < 200) || '';

    // City, state, zip from the ", City, ST 12345" tail of the address
    const addressMatch = result.address.match(/,\s*([A-Za-z\s]+),\s*([A-Z]{2})\s*(\d{5})/);
    if (addressMatch) {
      result.city = addressMatch[1].trim();
      result.state = addressMatch[2].trim();
      result.zip = addressMatch[3].trim();
    }

    // Property type: specific type names are tested before the bare unit
    // markers ('SF', 'Acre'), which occur on almost every page and would
    // otherwise always win.
    const typePatterns = [
      'General Industrial',
      'Retail Stores',
      'Warehouse',
      'Office Building',
      'Medical Building',
      'SF',
      'Acre',
    ];
    result.propertyType = typePatterns.find((type) => bodyText.includes(type)) || '';

    // Square footage, allowing thousands separators ("12,500 SF") and "k" suffixes
    const sfMatch = bodyText.match(/\d[\d,]*(?:\.\d+)?\s*k?\s*SF\b/i);
    if (sfMatch) {
      result.squareFootage = sfMatch[0];
    }

    // E-mails: mailto: links first (query string such as ?subject= removed),
    // then anything in the visible text that looks like an address.
    const emails = new Set();
    Array.from(document.querySelectorAll('a[href^="mailto:"]')).forEach((a) => {
      const email = a.href.replace(/^mailto:/i, '').split('?')[0].trim();
      if (email.length > 5) {
        emails.add(email);
      }
    });
    const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
    (bodyText.match(emailRegex) || []).forEach((email) => emails.add(email));
    result.emails = Array.from(emails);

    // Phones: normalized to digits only (a leading "1" on an 11-digit number
    // is dropped) so the same number from a tel: link and from the text is
    // stored once.
    const normalizePhone = (raw) => {
      const digits = raw.replace(/\D/g, '');
      return digits.length === 11 && digits.startsWith('1') ? digits.slice(1) : digits;
    };
    const phones = new Set();
    Array.from(document.querySelectorAll('a[href^="tel:"]')).forEach((a) => {
      const phone = normalizePhone(a.href.replace(/^tel:/i, ''));
      if (phone.length > 7) {
        phones.add(phone);
      }
    });
    // 10 digits with optional parentheses/separators, not embedded in a longer digit run
    const phoneRegex = /(?<!\d)\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)/g;
    (bodyText.match(phoneRegex) || []).forEach((match) => phones.add(normalizePhone(match)));
    result.phones = Array.from(phones);

    // Owner names: "Owner: <name>" on a single line, or "Word Word <entity suffix>"
    const ownerPatterns = [
      /Owner:[ \t]*([A-Za-z][A-Za-z .,'&-]*)/g,
      /\b([A-Z][a-z]+[ \t]+[A-Z][a-z]+[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))\b/g,
    ];
    const owners = new Set();
    for (const pattern of ownerPatterns) {
      for (const match of bodyText.matchAll(pattern)) {
        const owner = match[1].trim();
        if (owner.length > 3) {
          owners.add(owner);
        }
      }
    }
    result.owners = Array.from(owners);
    result.ownerName = result.owners[0] || '';

    // "Owns N properties in <place>" gives both the portfolio size and the
    // owner location; the place part is optional and limited to one line.
    const ownsMatch = bodyText.match(
      /Owns\s+(\d+)\s+propert(?:y|ies)(?:[ \t]+in[ \t]+([A-Za-z][A-Za-z .,'-]*))?/i,
    );
    if (ownsMatch) {
      result.propertyCount = ownsMatch[1];
      result.ownerLocation = (ownsMatch[2] || '').trim();
    }

    // Contact tabs/buttons, matched by visible text (case-insensitive)
    const tabSpecs = [
      ['button', ['View Contact', 'Contact']],
      ['button', ['Ownership', 'Owner']],
      ['[role="tab"]', ['Contact']],
    ];
    for (const [selector, texts] of tabSpecs) {
      const tab = Array.from(document.querySelectorAll(selector)).find((el) => {
        const text = (el.textContent || '').toLowerCase();
        return texts.some((needle) => text.includes(needle.toLowerCase()));
      });
      if (tab) {
        result.hasContactButton = true;
        result.contactTabText = (tab.textContent || '').trim();
        break;
      }
    }

    // First 1000 characters of the visible text (for debugging)
    result.contactSectionSample = bodyText.substring(0, 1000);

    return result;
  });

  log(` 📧 Emails: ${data.emails.length} found`);
  log(` 📞 Phones: ${data.phones.length} found`);
  log(` 👤 Owners: ${data.owners.length} found`);

  return data;
}

/**
 * Click the search-result entry for one property.
 *
 * The entry is located by the property ID inside a `/property/<id>` link
 * (clicking the button that wraps the link when there is one, otherwise the
 * link itself), falling back to a button whose text contains the ID.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @param {string} propertyId - ID as it appears in `/property/<id>` URLs.
 * @returns {Promise<{clicked: boolean}>} Whether a matching element was found
 *   and a click was scheduled. Resolves about two seconds after scheduling.
 */
async function clickAndNavigateToProperty(page, propertyId) {
  log(`\n🔗 Clicking property ${propertyId}...`);

  const clicked = await page
    .evaluate((propId) => {
      const idFromHref = (href) => {
        const match = (href || '').match(/property\/([a-f0-9-]+)/);
        return match ? match[1] : null;
      };

      // Preferred: the result link that carries this property ID.
      const link = Array.from(document.querySelectorAll('a[href*="/property/"]')).find(
        (a) => idFromHref(a.href) === propId,
      );
      let target = link ? link.closest('button') || link : null;

      // Fallback: a button whose text mentions the ID.
      if (!target) {
        target =
          Array.from(document.querySelectorAll('button')).find((b) =>
            (b.textContent || b.innerText || '').includes(propId),
          ) || null;
      }

      if (!target) {
        return { clicked: false };
      }

      target.scrollIntoView({ behavior: 'smooth', block: 'center' });
      // Click after this callback has returned, so a navigation triggered by
      // the click cannot interrupt the evaluate() call itself.
      setTimeout(() => {
        target.click();
      }, 100);
      return { clicked: true };
    }, propertyId) // pass the ID itself: the callback compares it as a string
    .catch((error) => {
      log(` ⚠️ Click attempt failed: ${error.message}`);
      return { clicked: false };
    });

  await sleep(2000);
  return clicked;
}

/**
 * Try to find and click the contact tab on a property page.
 *
 * Candidates are matched by visible text (case-insensitive), in priority
 * order: a "View Contact" button, a "Contact" button, an "Ownership" button,
 * then any role="tab" element mentioning "Contact". When one is clicked the
 * function waits AFTER_TAB_SWITCH_WAIT_MS before resolving.
 *
 * @param {object} page - Puppeteer page showing a property.
 * @returns {Promise<void>}
 */
async function clickViewContactTab(page) {
  log(' 📋 Looking for "View Contact" tab...');

  const clicked = await page
    .evaluate(() => {
      const candidates = [
        ['button', 'View Contact'],
        ['button', 'Contact'],
        ['button', 'Ownership'],
        ['[role="tab"]', 'Contact'],
      ];

      for (const [selector, text] of candidates) {
        const needle = text.toLowerCase();
        const tab = Array.from(document.querySelectorAll(selector)).find((el) =>
          (el.textContent || '').toLowerCase().includes(needle),
        );
        if (tab) {
          tab.scrollIntoView({ behavior: 'smooth', block: 'center' });
          // Deferred so the click runs after evaluate() has returned.
          setTimeout(() => {
            tab.click();
          }, 200);
          return { clicked: true };
        }
      }

      return { clicked: false };
    })
    .catch((error) => {
      log(` ⚠️ Contact tab lookup failed: ${error.message}`);
      return { clicked: false };
    });

  if (clicked && clicked.clicked) {
    log(' ✅ Clicked contact tab');
    await sleep(AFTER_TAB_SWITCH_WAIT_MS);
  } else {
    log(' ⚠️ No "View Contact" tab found');
  }
}

/**
 * Write the collected leads, together with run metadata, to OUTPUT_FILE.
 *
 * @param {object[]} leads - Lead records gathered so far.
 * @param {string} searchId - Reonomy search ID the leads came from.
 */
function saveLeads(leads, searchId) {
  const outputData = {
    scrapeDate: new Date().toISOString(),
    location: SEARCH_LOCATION,
    searchId: searchId,
    leadCount: leads.length,
    leads: leads,
  };

  fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
  log(`💾 Saved to: ${OUTPUT_FILE}`);
}

/**
 * Main scraper: log in, apply the contact filters, search SEARCH_LOCATION,
 * then visit up to MAX_PROPERTIES result pages and save one lead per page.
 *
 * A failure on a single property is logged and the run continues with the
 * next one. On a fatal error the leads gathered so far are still written to
 * OUTPUT_FILE before the error is rethrown. The browser is always closed.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When login fails, no search ID or properties are found, or
 *   any other step outside the per-property loop fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v8 (FULL EXTRACTION)...\n');

  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080'],
  });

  const leads = [];
  const rateDelay = 3000;
  let page = null;
  let searchId = '';

  try {
    // Created inside the try block so the browser is closed if this fails.
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Login
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });

    await sleep(2000);

    // Make sure the form has rendered before typing into it.
    await page.waitForSelector('input[type="email"]', { timeout: 30000 });
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');

    log(' ⏳ Waiting for login...');
    await sleep(10000);

    // Still on a login/auth URL after the wait means the login did not succeed.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Step 2: Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await sleep(3000);

    // Step 3: Apply advanced filters
    log('\n📍 Step 3: Applying filters for contact info...');
    await applyAdvancedFilters(page);

    // Step 4: Perform search
    log(`\n📍 Step 4: Searching for: ${SEARCH_LOCATION}...`);

    const searchInput = await page
      .waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
        timeout: 10000,
      })
      .catch(() => page.waitForSelector('input[type="text"]', { timeout: 5000 }));

    if (searchInput) {
      // Triple click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log(' ⏳ Searching...');
      await sleep(5000);
    }

    // Extract search ID
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Step 5: Extract property IDs (one entry per property, in page order)
    log('\n📍 Step 5: Extracting property IDs...');
    const propertyIds = await page.evaluate(() => {
      const seen = new Set();
      const ids = [];

      Array.from(document.querySelectorAll('a[href*="/property/"]')).forEach((link) => {
        const href = link.href;
        const match = href.match(/property\/([a-f0-9-]+)/);

        // Several links can point at the same property; keep only the first.
        if (match && !seen.has(match[1])) {
          seen.add(match[1]);
          ids.push({ id: match[1], url: href });
        }
      });

      return ids;
    });

    log(`✅ Found ${propertyIds.length} property IDs`);

    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found.');
      throw new Error('No properties found on search page.');
    }

    // Step 6: Click through properties
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);

    log(`\n📍 Step 6: Clicking through ${propertiesToScrape.length} properties...`);

    for (const [index, prop] of propertiesToScrape.entries()) {
      log(`\n[${index + 1}/${propertiesToScrape.length}] Property: ${prop.id}`);

      try {
        // Click on property button
        const clickResult = await clickAndNavigateToProperty(page, prop.id);

        if (!clickResult || !clickResult.clicked) {
          log(` ⚠️ Could not click property ${prop.id}`);
          continue;
        }

        // Wait for property page to load
        log(` ⏳ Waiting for property page to load...`);
        await sleep(AFTER_CLICK_WAIT_MS);

        // Try to click "View Contact" tab
        await clickViewContactTab(page);

        // Additional wait for dynamic content
        log(` ⏳ Waiting for dynamic content...`);
        await sleep(AFTER_TAB_SWITCH_WAIT_MS);

        // Extract ALL data
        const propertyData = await extractFullPropertyData(page, prop.url);
        const emails = propertyData.emails || [];
        const phones = propertyData.phones || [];
        const owners = propertyData.owners || [];

        log(` 📧 Emails found: ${emails.length}`);
        log(` 📞 Phones found: ${phones.length}`);
        log(` 👤 Owners found: ${owners.length}`);

        // Create lead object
        leads.push({
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: page.url(),
          address: propertyData.address || '',
          city: propertyData.city || '',
          state: propertyData.state || '',
          zip: propertyData.zip || '',
          propertyType: propertyData.propertyType || '',
          squareFootage: propertyData.squareFootage || '',
          ownerNames: owners.join(', '),
          ownerLocation: propertyData.ownerLocation || '',
          propertyCount: propertyData.propertyCount || '',
          emails: emails,
          phones: phones,
          pageTitle: propertyData.pageTitle,
          searchLocation: SEARCH_LOCATION,
          searchId: searchId,
          hasContactButton: propertyData.hasContactButton || false,
          contactTabText: propertyData.contactTabText || '',
        });
      } catch (error) {
        // One bad property page must not discard the rest of the run.
        log(` ⚠️ Failed to scrape property ${prop.id}: ${error.message}`);
      }

      // Go back to search results
      log(` 🔙 Going back to search results...`);
      await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
        waitUntil: 'networkidle2',
        timeout: 60000,
      });

      await sleep(BACK_NAVIGATION_WAIT_MS);

      // Rate limiting
      log(` ⏸ Rate limit: ${rateDelay}ms...`);
      await sleep(rateDelay);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      saveLeads(leads, searchId);
    } else {
      log('\n⚠️ No leads scraped.');
    }

    log('\n✅ Scraping complete!');

    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Keep whatever was collected before the failure.
    if (leads.length > 0) {
      try {
        log(`💾 Saving ${leads.length} lead(s) collected before the failure...`);
        saveLeads(leads, searchId);
      } catch (saveError) {
        log(` ⚠️ Could not save partial results: ${saveError.message}`);
      }
    }

    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v8-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v8-error.png');
      } catch (screenshotError) {
        log(` ⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }

    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}

// Run: start the scrape as soon as the script is loaded and map the outcome
// to the process exit status (0 on success, 1 on failure).
scrapeLeads()
  .then((result) => {
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  })
  .catch((error) => {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  });