clawdbot-workspace/reonomy-scraper-v8-full-extract.js

621 lines
19 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v8 - FULL EXTRACTION WITH CLICK-THROUGH
*
* Workflow:
* 1. Login
* 2. Open the search page
* 3. Apply advanced filters (Has Phone + Has Email)
* 4. Search for location
* 5. Extract property IDs
* 6. For each property (up to MAX_PROPERTIES):
* - Click on property button
* - Wait for property page to fully load
* - Look for contact info tabs/sections
* - Click "View Contact" or "Ownership" if needed
* - Extract ALL data (emails, phones, owners, addresses, property details)
* - Go back to search results
* - Continue to next property
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// Each setting below is read from an environment variable; the `||` fallback
// is used when the variable is unset or empty.
// SECURITY NOTE(review): the fallbacks for REONOMY_EMAIL / REONOMY_PASSWORD
// embed literal account credentials in source control. They should be
// rotated and supplied only through the environment — TODO confirm with the
// account owner before removing the defaults, since existing runs rely on them.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// True only when HEADLESS is exactly the string 'true'; any other value
// (including unset) launches a visible browser window.
const HEADLESS = process.env.HEADLESS === 'true';
// Upper bound on how many of the discovered properties are visited per run.
const MAX_PROPERTIES = 20;
// Longer waits for full content loading
// All waits are fixed delays in milliseconds passed to sleep().
const AFTER_CLICK_WAIT_MS = 5000;
const AFTER_TAB_SWITCH_WAIT_MS = 3000;
const BACK_NAVIGATION_WAIT_MS = 3000;
// Output files
// Both files are written next to this script (__dirname).
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v8-full.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v8.log');
/**
 * Report a message on stdout and append a timestamped copy to the log file.
 *
 * The console receives the bare message; the file entry is prefixed with the
 * current time in ISO-8601 form and terminated with a newline.
 *
 * @param {string} message - Text to report.
 */
function log(message) {
  const entry = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, entry);
}
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the delay has elapsed.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Wait for the button that opens the extended filter panel.
 *
 * Matching is done by text / aria-label inside the page because Puppeteer's
 * CSS engine does not understand Playwright's `:has-text()` pseudo-class.
 *
 * @param {object} page - Puppeteer page.
 * @param {number} [timeoutMs=15000] - How long to wait for the button.
 * @returns {Promise<object|null>} Element handle, or null when none appeared.
 */
async function findFiltersButton(page, timeoutMs = 15000) {
  const handle = await page
    .waitForFunction(
      () => {
        const buttons = Array.from(document.querySelectorAll('button'));
        const textOf = (b) => (b.textContent || '').toLowerCase();
        return (
          buttons.find((b) => textOf(b).includes('more filters')) ||
          buttons.find((b) => (b.getAttribute('aria-label') || '').includes('Filters')) ||
          buttons.find((b) => textOf(b).includes('filters')) ||
          null
        );
      },
      { timeout: timeoutMs },
    )
    // A timeout simply means the control is absent; the caller treats that as optional.
    .catch(() => null);
  return handle ? handle.asElement() : null;
}

/**
 * Wait for a filter checkbox identified by its label text or by a keyword in
 * its id / name / data-test attribute.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} labelText - Visible label, e.g. "Has Phone" (case-insensitive).
 * @param {string} keyword - Attribute keyword, e.g. "phone".
 * @param {number} [timeoutMs=3000] - How long to wait for the checkbox.
 * @returns {Promise<object|null>} Handle of the checkbox input, or null.
 */
async function findFilterCheckbox(page, labelText, keyword, timeoutMs = 3000) {
  const handle = await page
    .waitForFunction(
      (wantedLabel, wantedKeyword) => {
        const label = wantedLabel.toLowerCase();
        const key = wantedKeyword.toLowerCase();
        const boxes = Array.from(document.querySelectorAll('input[type="checkbox"]'));
        const textNear = (box) =>
          [...Array.from(box.labels || []), box.closest('label'), box.closest('.filter-item')]
            .filter(Boolean)
            .map((el) => el.textContent || '')
            .join(' ')
            .toLowerCase();
        const attrsOf = (box) =>
          [box.id, box.name, box.getAttribute('data-test')].filter(Boolean).join(' ').toLowerCase();
        return (
          boxes.find((box) => textNear(box).includes(label)) ||
          boxes.find((box) => attrsOf(box).includes(key)) ||
          boxes.find((box) => textNear(box).includes(key)) ||
          null
        );
      },
      { timeout: timeoutMs },
      labelText,
      keyword,
    )
    .catch(() => null);
  return handle ? handle.asElement() : null;
}

/**
 * Make sure one filter checkbox is ticked.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} labelText - Visible label of the filter.
 * @param {string} keyword - Attribute keyword of the filter.
 * @returns {Promise<boolean>} True when the checkbox exists (whether it was
 *   already ticked or had to be clicked), false when it could not be found.
 */
async function ensureFilterChecked(page, labelText, keyword) {
  const checkbox = await findFilterCheckbox(page, labelText, keyword);
  if (!checkbox) {
    return false;
  }
  // Pass the element handle itself so `el` is the DOM node inside the page.
  const isChecked = await page
    .evaluate((el) => Boolean(el.checked), checkbox)
    .catch(() => false);
  if (isChecked) {
    // Clicking again would switch the filter off.
    log(` ☑️ "${labelText}" filter already enabled`);
    return true;
  }
  log(` ☑️ Checking "${labelText}" filter...`);
  await checkbox.click();
  await sleep(500);
  return true;
}

/**
 * Apply advanced filters
 *
 * Opens the extended filter panel when its button is present, then enables
 * the "Has Phone" and "Has Email" filters. Best-effort: a missing control is
 * logged and skipped, and any unexpected error is logged rather than thrown.
 *
 * @param {object} page - Puppeteer page showing the search view.
 * @returns {Promise<void>}
 */
async function applyAdvancedFilters(page) {
  log('🔍 Step 2.1: Applying advanced filters (Has Phone + Has Email)...');
  try {
    const moreFiltersBtn = await findFiltersButton(page);
    if (moreFiltersBtn) {
      log(' 📋 Clicking "More Filters"...');
      await moreFiltersBtn.click();
      await sleep(2000);
    }
    if (!(await ensureFilterChecked(page, 'Has Phone', 'phone'))) {
      log(' ⚠️ "Has Phone" filter not found, skipping');
    }
    await sleep(1000);
    if (!(await ensureFilterChecked(page, 'Has Email', 'email'))) {
      log(' ⚠️ "Has Email" filter not found, skipping');
    }
    log('✅ Filters applied');
  } catch (error) {
    log(` ⚠️ Filter application had issues: ${error.message}`);
  }
}
/**
 * Turn a raw page snapshot into a structured property record.
 *
 * Runs in Node (not in the browser), so it can be exercised without a page.
 *
 * @param {object} snapshot - Raw values collected from the DOM: `pageTitle`,
 *   `url`, `bodyText`, `firstHeading`, `headingTexts`, `mailtoHrefs`,
 *   `telHrefs`, and `controls` (`{ isButton, isTab, text }` per element).
 * @param {string} [propertyUrl] - Used as the URL when the snapshot has none.
 * @returns {object} Property record (ids, address parts, emails, phones, owners…).
 */
function buildPropertyRecord(snapshot, propertyUrl) {
  const bodyText = snapshot.bodyText || '';
  const url = snapshot.url || propertyUrl || '';
  const result = {
    propertyId: '',
    address: '',
    city: '',
    state: '',
    zip: '',
    propertyType: '',
    squareFootage: '',
    ownerName: '',
    ownerLocation: '',
    propertyCount: '',
    emails: [],
    phones: [],
    contacts: [],
    owners: [],
    hasContactButton: false,
    contactTabText: '',
    pageTitle: snapshot.pageTitle || '',
    url,
    // First 1000 characters of the page text, kept for debugging.
    contactSectionSample: bodyText.substring(0, 1000),
  };
  const addUnique = (list, value) => {
    if (value && !list.includes(value)) {
      list.push(value);
    }
  };

  // Property ID from the URL.
  const idMatch = url.match(/property\/([a-f0-9-]+)/);
  if (idMatch) {
    result.propertyId = idMatch[1];
  }

  // Address: the first heading on the page, else the first heading that
  // mentions a street-type word.
  const streetWords = ['Highway', 'Avenue', 'Street', 'Rd', 'Dr', 'Way', 'Ln', 'Blvd', 'Rte'];
  const streetHeading = (snapshot.headingTexts || []).find(
    (text) => text && streetWords.some((word) => text.includes(word)),
  );
  for (const candidate of [snapshot.firstHeading, streetHeading]) {
    if (candidate && candidate.length > 10 && candidate.length < 200) {
      result.address = candidate;
      break;
    }
  }

  // City, state and zip from an address shaped like "..., City, ST 12345".
  const addressMatch = result.address.match(/,\s*([A-Za-z\s]+),\s*([A-Z]{2})\s*(\d{5})/);
  if (addressMatch) {
    result.city = addressMatch[1].trim();
    result.state = addressMatch[2].trim();
    result.zip = addressMatch[3].trim();
  }

  // Property type: descriptive names are checked before the bare unit markers,
  // which appear on almost every page and would otherwise always win.
  const typePatterns = [
    'General Industrial',
    'Retail Stores',
    'Warehouse',
    'Office Building',
    'Medical Building',
    'SF',
    'Acre',
  ];
  result.propertyType = typePatterns.find((type) => bodyText.includes(type)) || '';

  // Square footage, e.g. "12.5k SF".
  const sfMatch = bodyText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
  if (sfMatch) {
    result.squareFootage = sfMatch[0];
  }

  // Emails: mailto: links first (query string such as ?subject= removed),
  // then anything in the visible text that looks like an address.
  for (const href of snapshot.mailtoHrefs || []) {
    const email = href.replace(/^mailto:/i, '').split('?')[0].trim();
    if (email.length > 5) {
      addUnique(result.emails, email);
    }
  }
  for (const email of bodyText.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) || []) {
    addUnique(result.emails, email);
  }

  // Phones: tel: links are kept as written; numbers found in the text are
  // normalised to 10 digits and skipped when a tel: link already covers them.
  for (const href of snapshot.telHrefs || []) {
    const phone = href.replace(/^tel:/i, '').trim();
    if (phone.length > 7) {
      addUnique(result.phones, phone);
    }
  }
  const phoneRegex = /(?<!\d)\(?(\d{3})\)?[-. ]?(\d{3})[-. ]?(\d{4})(?!\d)/g;
  for (const match of bodyText.matchAll(phoneRegex)) {
    const digits = `${match[1]}${match[2]}${match[3]}`;
    const alreadyKnown = result.phones.some((known) => known.replace(/\D/g, '').endsWith(digits));
    if (!alreadyKnown) {
      result.phones.push(digits);
    }
  }

  // Owners: "Owner: <name>" lines and two-word company names with a
  // corporate suffix. Only the captured name is stored, not the whole match.
  const ownerPatterns = [
    /Owner:[ \t]*([A-Za-z][A-Za-z \t]*)/g,
    /([A-Z][a-z]+[ \t]+[A-Z][a-z]+[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))\b/g,
  ];
  for (const pattern of ownerPatterns) {
    for (const match of bodyText.matchAll(pattern)) {
      const owner = match[1].trim();
      if (owner.length > 3) {
        addUnique(result.owners, owner);
      }
    }
  }
  result.ownerName = result.owners[0] || '';

  // Number of properties owned, from "Owns N properties".
  const propCountMatch = bodyText.match(/Owns\s+(\d+)\s+properties/i);
  if (propCountMatch) {
    result.propertyCount = propCountMatch[1];
  }

  // Owner location: prefer the "Owns N properties in <place>" sentence and
  // fall back to the first " in <place>" phrase on the page.
  const locationMatch =
    bodyText.match(/Owns\s+\d+\s+properties\s+in\s+([A-Za-z][A-Za-z ,]*)/i) ||
    bodyText.match(/\sin\s+([A-Za-z][A-Za-z ,]*)/i);
  if (locationMatch) {
    result.ownerLocation = locationMatch[1].trim().replace(/,+$/, '');
  }

  // Contact / ownership control, matched by visible text.
  const controls = snapshot.controls || [];
  const mentions = (control, word) => (control.text || '').toLowerCase().includes(word);
  const contactControl =
    controls.find((c) => c.isButton && mentions(c, 'contact')) ||
    controls.find((c) => c.isButton && mentions(c, 'owner')) ||
    controls.find((c) => c.isTab && mentions(c, 'contact'));
  if (contactControl) {
    result.hasContactButton = true;
    result.contactTabText = contactControl.text;
  }

  return result;
}

/**
 * Extract ALL available data from property page
 *
 * Collects raw text, headings, mailto:/tel: links and button labels inside
 * the page, then parses them in Node. Keeping the in-page part trivial avoids
 * selectors and regexes failing silently in the browser context.
 *
 * @param {object} page - Puppeteer page currently showing a property.
 * @param {string} [propertyUrl] - Optional URL used when the page reports none.
 * @returns {Promise<object>} Record with `propertyId`, `address`, `city`,
 *   `state`, `zip`, `propertyType`, `squareFootage`, `ownerName`, `owners`,
 *   `ownerLocation`, `propertyCount`, `emails`, `phones`, `contacts`,
 *   `hasContactButton`, `contactTabText`, `pageTitle`, `url` and
 *   `contactSectionSample`.
 */
async function extractFullPropertyData(page, propertyUrl) {
  log(' 🔎 Extracting full property data...');
  const snapshot = await page.evaluate(() => {
    const textOf = (el) => (el.textContent || '').trim();
    const firstHeadingEl = document.querySelector('h1, h2, h3, h4, h5, h6');
    return {
      pageTitle: document.title,
      url: window.location.href,
      bodyText: document.body ? document.body.innerText || '' : '',
      firstHeading: firstHeadingEl ? textOf(firstHeadingEl) : '',
      headingTexts: Array.from(document.querySelectorAll('[role="heading"], h1, h2, h3')).map(textOf),
      mailtoHrefs: Array.from(document.querySelectorAll('a[href^="mailto:"]')).map((a) => a.href),
      telHrefs: Array.from(document.querySelectorAll('a[href^="tel:"]')).map((a) => a.href),
      controls: Array.from(document.querySelectorAll('button, [role="tab"]')).map((el) => ({
        isButton: el.tagName === 'BUTTON',
        isTab: el.getAttribute('role') === 'tab',
        text: textOf(el),
      })),
    };
  });
  const data = buildPropertyRecord(snapshot, propertyUrl);
  log(` 📧 Emails: ${data.emails.length} found`);
  log(` 📞 Phones: ${data.phones.length} found`);
  log(` 👤 Owners: ${data.owners.length} found`);
  return data;
}
/**
 * Click on property button and navigate to it
 *
 * Looks for, in order: a button wrapping a link to the property, a button
 * whose text contains the property ID, and finally the property link itself.
 * The click is deferred by 100 ms so the in-page call returns before any
 * navigation starts.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @param {string} propertyId - ID taken from a `/property/<id>` link.
 * @returns {Promise<{clicked: boolean}>} Whether a matching element was found
 *   and a click scheduled.
 */
async function clickAndNavigateToProperty(page, propertyId) {
  log(`\n🔗 Clicking property ${propertyId}...`);
  const clicked = await page
    .evaluate((propId) => {
      const idOf = (href) => {
        const match = (href || '').match(/property\/([a-f0-9-]+)/);
        return match ? match[1] : null;
      };
      const buttons = Array.from(document.querySelectorAll('button'));
      const linkButton = buttons.find((b) => {
        const link = b.querySelector('a[href*="/property/"]');
        return Boolean(link) && idOf(link.href) === propId;
      });
      const textButton = buttons.find((b) => (b.textContent || b.innerText || '').includes(propId));
      // The IDs were harvested from anchors, so the anchor itself is the last resort.
      const anchor = Array.from(document.querySelectorAll('a[href*="/property/"]')).find(
        (a) => idOf(a.href) === propId,
      );
      const target = linkButton || textButton || anchor;
      if (!target) {
        return { clicked: false };
      }
      target.scrollIntoView({ behavior: 'smooth', block: 'center' });
      setTimeout(() => {
        target.click();
      }, 100);
      return { clicked: true };
      // The ID is passed as a plain string; wrapping it in an object would
      // make every comparison inside the page fail.
    }, propertyId)
    .catch((error) => {
      log(` ⚠️ Click evaluation failed: ${error.message}`);
      return { clicked: false };
    });
  await sleep(2000);
  return clicked;
}
/**
 * Try to find and click "View Contact" tab
 *
 * Candidates are matched by visible text (case-insensitive), in this order:
 * a button containing "View Contact", a button containing "Contact", a button
 * containing "Ownership", then any `[role="tab"]` containing "Contact".
 * Text matching is done manually because `:has-text()` is not a selector the
 * browser's `querySelector` accepts.
 *
 * @param {object} page - Puppeteer page showing a property.
 * @returns {Promise<void>} Resolves after the click (and the tab-switch wait)
 *   or immediately when no matching control exists.
 */
async function clickViewContactTab(page) {
  log(' 📋 Looking for "View Contact" tab...');
  const clicked = await page
    .evaluate(() => {
      const textOf = (el) => (el.textContent || '').toLowerCase();
      const buttons = Array.from(document.querySelectorAll('button'));
      const tabs = Array.from(document.querySelectorAll('[role="tab"]'));
      const tab =
        buttons.find((b) => textOf(b).includes('view contact')) ||
        buttons.find((b) => textOf(b).includes('contact')) ||
        buttons.find((b) => textOf(b).includes('ownership')) ||
        tabs.find((t) => textOf(t).includes('contact'));
      if (!tab) {
        return { clicked: false };
      }
      tab.scrollIntoView({ behavior: 'smooth', block: 'center' });
      // Deferred so the in-page call returns before the UI reacts.
      setTimeout(() => {
        tab.click();
      }, 200);
      return { clicked: true };
    })
    .catch((error) => {
      log(` ⚠️ Contact tab lookup failed: ${error.message}`);
      return { clicked: false };
    });
  if (clicked && clicked.clicked) {
    log(' ✅ Clicked contact tab');
    await sleep(AFTER_TAB_SWITCH_WAIT_MS);
  } else {
    log(' ⚠️ No "View Contact" tab found');
  }
}
/**
 * Write the collected leads to OUTPUT_FILE as pretty-printed JSON.
 *
 * @param {object[]} leads - Lead records gathered so far.
 * @param {string} searchId - Search ID the leads belong to.
 */
function saveLeads(leads, searchId) {
  const outputData = {
    scrapeDate: new Date().toISOString(),
    location: SEARCH_LOCATION,
    searchId: searchId,
    leadCount: leads.length,
    leads: leads
  };
  fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
  log(`💾 Saved to: ${OUTPUT_FILE}`);
}

/**
 * Open one property from the results list and build its lead record.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @param {{id: string, url: string}} prop - Property reference.
 * @param {string} searchId - ID of the current search.
 * @returns {Promise<object|null>} Lead record, or null when the property
 *   could not be clicked (the page is then still on the results list).
 */
async function scrapeProperty(page, prop, searchId) {
  // Click on property button
  const clickResult = await clickAndNavigateToProperty(page, prop.id);
  if (!clickResult.clicked) {
    log(` ⚠️ Could not click property ${prop.id}`);
    return null;
  }
  // Wait for property page to load
  log(` ⏳ Waiting for property page to load...`);
  await sleep(AFTER_CLICK_WAIT_MS);
  // Try to click "View Contact" tab
  await clickViewContactTab(page);
  // Additional wait for dynamic content
  log(` ⏳ Waiting for dynamic content...`);
  await sleep(AFTER_TAB_SWITCH_WAIT_MS);
  // Extract ALL data
  const propertyData = await extractFullPropertyData(page, prop.url);
  const emails = propertyData.emails || [];
  const phones = propertyData.phones || [];
  const owners = propertyData.owners || [];
  log(` 📧 Emails found: ${emails.length}`);
  log(` 📞 Phones found: ${phones.length}`);
  log(` 👤 Owners found: ${owners.length}`);
  return {
    scrapeDate: new Date().toISOString().split('T')[0],
    propertyId: prop.id,
    propertyUrl: page.url(),
    address: propertyData.address || '',
    city: propertyData.city || '',
    state: propertyData.state || '',
    zip: propertyData.zip || '',
    propertyType: propertyData.propertyType || '',
    squareFootage: propertyData.squareFootage || '',
    ownerNames: owners.join(', '),
    ownerLocation: propertyData.ownerLocation || '',
    propertyCount: propertyData.propertyCount || '',
    emails: emails,
    phones: phones,
    pageTitle: propertyData.pageTitle,
    searchLocation: SEARCH_LOCATION,
    searchId: searchId,
    hasContactButton: propertyData.hasContactButton || false,
    contactTabText: propertyData.contactTabText || ''
  };
}

/**
 * Main scraper
 *
 * Logs in, applies the contact filters, searches SEARCH_LOCATION, collects
 * the unique property IDs on the results page and visits up to
 * MAX_PROPERTIES of them, writing the resulting leads to OUTPUT_FILE.
 *
 * A failure on a single property is logged and the run continues. If the run
 * itself fails, the leads gathered so far are still written before the error
 * is rethrown. The browser is always closed.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} On login failure, missing search ID, no properties found,
 *   or any navigation error outside a single property.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v8 (FULL EXTRACTION)...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  const leads = [];
  let page = null;
  let searchId = '';
  try {
    // Created inside the try so the browser is closed even if this fails.
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    // Step 1: Login
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log(' ⏳ Waiting for login...');
    await sleep(10000);
    // Check if logged in
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');
    // Step 2: Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);
    // Step 3: Apply advanced filters
    log('\n📍 Step 3: Applying filters for contact info...');
    await applyAdvancedFilters(page);
    // Step 4: Perform search
    log(`\n📍 Step 4: Searching for: ${SEARCH_LOCATION}...`);
    const searchInput = await page.waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
      timeout: 10000
    }).catch(() => {
      return page.waitForSelector('input[type="text"]', { timeout: 5000 });
    });
    if (searchInput) {
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log(' ⏳ Searching...');
      await sleep(5000);
    }
    // Extract search ID
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);
    // Step 5: Extract property IDs
    log('\n📍 Step 5: Extracting property IDs...');
    const propertyIds = await page.evaluate(() => {
      // Keyed by ID: a result card can contain several links to the same
      // property, and each property should be visited once.
      const byId = new Map();
      document.querySelectorAll('a[href*="/property/"]').forEach(link => {
        const href = link.href;
        const match = href.match(/property\/([a-f0-9-]+)/);
        if (match && !byId.has(match[1])) {
          byId.set(match[1], { id: match[1], url: href });
        }
      });
      return Array.from(byId.values());
    });
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found.');
      throw new Error('No properties found on search page.');
    }
    // Step 6: Click through properties
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 6: Clicking through ${propertiesToScrape.length} properties...`);
    for (let i = 0; i < propertiesToScrape.length; i++) {
      const prop = propertiesToScrape[i];
      log(`\n[${i + 1}/${propertiesToScrape.length}] Property: ${prop.id}`);
      try {
        const lead = await scrapeProperty(page, prop, searchId);
        if (!lead) {
          // Nothing was clicked, so the results list is still showing.
          continue;
        }
        leads.push(lead);
      } catch (propertyError) {
        // One bad property must not discard the leads already collected.
        log(` ⚠️ Failed to scrape property ${prop.id}: ${propertyError.message}`);
      }
      // Go back to search results
      log(` 🔙 Going back to search results...`);
      await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
        waitUntil: 'networkidle2',
        timeout: 60000
      });
      await sleep(BACK_NAVIGATION_WAIT_MS);
      // Rate limiting
      const rateDelay = 3000;
      log(` ⏸ Rate limit: ${rateDelay}ms...`);
      await sleep(rateDelay);
    }
    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      saveLeads(leads, searchId);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    if (leads.length > 0) {
      // Keep whatever was gathered before the failure.
      try {
        log(`\n⚠️ Saving ${leads.length} leads collected before the failure...`);
        saveLeads(leads, searchId);
      } catch (saveError) {
        log(`⚠️ Could not save partial results: ${saveError.message}`);
      }
    }
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v8-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v8-error.png');
      } catch (screenshotError) {
        log(`⚠️ Could not capture error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Run
// Entry point: run the scraper once, report the outcome, and exit with
// status 0 on success or 1 on failure.
(async () => {
  try {
    const outcome = await scrapeLeads();
    log(`\n🎉 Success! ${outcome.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${outcome.outputFile}`);
    process.exit(0);
  } catch (failure) {
    log(`\n💥 Scraper failed: ${failure.message}`);
    process.exit(1);
  }
})();