#!/usr/bin/env node
/**
 * Reonomy Scraper v11 - PUPPETEER (PROVEN BASE + EMAILS/PHONES)
 *
 * Based on v9 (Puppeteer) - proven working version
 * Adds email and phone extraction logic to v9
 * Uses direct ownership URLs (no property card clicking)
 *
 * Usage:
 *   SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v11-puppeteer.js
 *   Or set as environment variable
 */

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

// Configuration
//
// Credentials are read from the environment only. They are deliberately not
// defaulted in source: a login/password literal in this file would be
// readable by everyone with access to the repository.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
  throw new Error(
    'Missing credentials: set the REONOMY_EMAIL and REONOMY_PASSWORD environment variables.'
  );
}

// Label stored alongside the scraped leads in the output file.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';

// Upper bound on the number of properties processed per run. Anything that
// does not parse to a positive integer (unset, "abc", "0", "-5") falls back
// to 20 so the value is always safe to use as a slice length.
const requestedMaxProperties = Number.parseInt(process.env.MAX_PROPERTIES, 10);
const MAX_PROPERTIES = requestedMaxProperties > 0 ? requestedMaxProperties : 20;

// Headless unless HEADLESS is exactly the string "false".
const HEADLESS = process.env.HEADLESS !== 'false';

// Output and log files live next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v11-puppeteer.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v11.log');

/**
 * Print a message to stdout and append it to the log file.
 *
 * The console gets the bare message; the file gets the same message prefixed
 * with an ISO-8601 timestamp and terminated with a newline. The file write is
 * synchronous.
 *
 * @param {string} message - Text to log.
 * @returns {void}
 */
function log(message) {
  const timestamp = new Date().toISOString();
  console.log(message);
  fs.appendFileSync(LOG_FILE, `[${timestamp}] ${message}\n`);
}

/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Promise that resolves (with no value) once the
 *   timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Split a heading such as "123 Main St, Eatontown, NJ 07724" into its parts.
 *
 * @param {string} headingText - Trimmed text of a heading element.
 * @returns {{propertyAddress: string, city: string, state: string, zip: string}|null}
 *   The parsed parts, or null when the text does not look like an address.
 */
function parseAddressHeading(headingText) {
  const match = headingText.match(/^(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
  if (!match) {
    return null;
  }
  // Group 1 is the street; city, state and zip are groups 2, 3 and 4.
  return {
    propertyAddress: match[0],
    city: match[2].trim(),
    state: match[3].trim(),
    zip: match[4].trim(),
  };
}

/**
 * Find owner names in page text following "Owner:" or "Owns N properties".
 *
 * A name is one or more capitalised words on a single line, optionally
 * followed by an entity suffix (LLC, Inc, ...). Words are joined by spaces or
 * tabs only, so a name never runs on into the next line of the page text.
 *
 * @param {string} text - Visible text of the page.
 * @returns {string[]} Unique owner names (longer than 3 characters), in order
 *   of first appearance.
 */
function findOwnerNames(text) {
  const entitySuffix =
    'LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management';
  const ownerName = String.raw`[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:${entitySuffix}))?`;
  const patterns = [
    new RegExp(String.raw`Owner:\s*(${ownerName})`, 'g'),
    new RegExp(String.raw`[Oo]wns\s+\d+\s+[Pp]ropert(?:y|ies)\s*(${ownerName})`, 'g'),
  ];

  const names = new Set();
  for (const pattern of patterns) {
    for (const match of text.matchAll(pattern)) {
      // Only the captured name is kept, never the "Owner:" / "Owns N" prefix.
      const owner = match[1];
      if (owner.length > 3) {
        names.add(owner);
      }
    }
  }
  return [...names];
}

/**
 * Extract ALL data from Owner tab
 *
 * Reads the property ID from the page URL, the address from the first
 * h1/h2/h3 whose text parses as an address, square footage, property type,
 * owner names and e-mail addresses from the visible page text, e-mail
 * addresses from mailto: links, and phone numbers from the phone elements.
 * Fields that cannot be found are returned as '' (or an empty array).
 *
 * @param {object} page - Puppeteer page already showing a property's
 *   ownership view. Only `url()`, `$()` and `evaluate()` are used.
 * @returns {Promise<{propertyId: string, propertyAddress: string, city: string,
 *   state: string, zip: string, squareFootage: string, propertyType: string,
 *   ownerNames: string[], emails: string[], phones: string[]}>}
 */
async function extractOwnerTabData(page) {
  log('📊 Extracting Owner tab data...');

  // Extract property ID from URL
  const propIdMatch = page.url().match(/property\/([a-f0-9-]+)/);
  const propertyId = propIdMatch ? propIdMatch[1] : '';

  // Address: first of h1/h2/h3 whose text parses as "street, city, ST 12345".
  let address = { propertyAddress: '', city: '', state: '', zip: '' };
  for (const selector of ['h1', 'h2', 'h3']) {
    const heading = await page.$(selector);
    if (!heading) {
      continue;
    }
    const headingText = ((await page.evaluate((el) => el.textContent, heading)) || '').trim();
    const parsed = parseAddressHeading(headingText);
    if (parsed) {
      address = parsed;
      log(` 📍 Address: ${headingText}`);
      break;
    }
  }

  // innerText is plain text (not JSON), so it is used as-is.
  const bodyText = (await page.evaluate(() => document.body.innerText)) || '';

  // Square footage; the leading digit run may contain thousands separators.
  let squareFootage = '';
  const sfMatch = bodyText.match(/(\d[\d,]*\.?\d*\s*k?\s*SF)\b/i);
  if (sfMatch) {
    squareFootage = sfMatch[0];
    log(` 📐 Square Footage: ${squareFootage}`);
  }

  // Property type: first listed label found in the text. 'General Industrial'
  // must come before 'Industrial', otherwise the shorter label always wins.
  const typePatterns = [
    'Warehouse',
    'Office Building',
    'Retail Stores',
    'General Industrial',
    'Industrial',
    'Medical Building',
    'School',
    'Religious',
    'Supermarket',
    'Financial Building',
  ];
  const propertyType = typePatterns.find((type) => bodyText.includes(type)) || '';
  if (propertyType) {
    log(` 🏢 Property Type: ${propertyType}`);
  }

  const ownerNames = findOwnerNames(bodyText);
  log(` 👤 Owners found: ${ownerNames.length}`);

  // Phones: page.evaluate (not evaluateHandle) so the array itself comes back.
  // NOTE(review): the jssNNNN class names look auto-generated and may change
  // between site builds — confirm this selector still matches.
  const phoneTexts = await page.evaluate(
    (selector) =>
      Array.from(document.querySelectorAll(selector))
        .map((el) => el.textContent.trim())
        .filter((text) => text.length >= 10),
    'p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2'
  );
  const phones = [
    ...new Set(
      (phoneTexts || [])
        // Strip whitespace, dashes and parentheses.
        .map((text) => text.replace(/[\s\-\(\)]/g, ''))
        .filter((phone) => phone.length >= 10)
    ),
  ];
  log(` 📞 Phones found: ${phones.length}`);

  // Emails: mailto: links are read in the browser; the text scan runs here,
  // because variables on the Node side are not visible inside page.evaluate.
  const mailtoTargets = await page.evaluate(() =>
    Array.from(document.querySelectorAll('a[href^="mailto:"]')).map((a) =>
      a.href.replace('mailto:', '')
    )
  );
  const mailtoEmails = (mailtoTargets || []).map((target) => target.split('?')[0].trim());
  const textEmails = bodyText.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) || [];
  const emails = [...new Set([...mailtoEmails, ...textEmails])].filter(
    (email) => email.length > 5
  );
  log(` 📧 Emails found: ${emails.length}`);

  return {
    propertyId,
    propertyAddress: address.propertyAddress,
    city: address.city,
    state: address.state,
    zip: address.zip,
    squareFootage,
    propertyType,
    ownerNames,
    emails,
    phones,
  };
}

/**
 * Extract property IDs from search results
 *
 * Scans the current page for links whose href contains "/property/<id>" and
 * returns one entry per distinct ID, in document order.
 *
 * @param {object} page - Puppeteer page showing the search results; only
 *   `evaluate()` is used.
 * @returns {Promise<Array<{id: string, url: string}>>} `url` is the
 *   search-scoped property URL when the page URL contains a search ID,
 *   otherwise the link's own href.
 */
async function extractPropertyIds(page) {
  // The callback runs in the browser, so it must be fully self-contained.
  return page.evaluate(() => {
    // Take the search ID by pattern rather than by position in the URL:
    // splitting "https://host/#!/search/<id>" on "/" puts "search" at index 4.
    const searchMatch = window.location.href.match(/search\/([a-f0-9-]+)/);
    const searchId = searchMatch ? searchMatch[1] : null;

    const seen = new Set();
    const results = [];

    for (const link of document.querySelectorAll('a[href*="/property/"]')) {
      const match = link.href.match(/property\/([a-f0-9-]+)/);
      // A property is usually linked more than once; keep the first only.
      if (!match || seen.has(match[1])) {
        continue;
      }
      seen.add(match[1]);
      results.push({
        id: match[1],
        url: searchId
          ? `https://app.reonomy.com/#!/search/${searchId}/property/${match[1]}`
          : link.href,
      });
    }

    return results;
  });
}

/**
 * Main scraper function
 *
 * Launches a browser, logs in, opens the search page, collects property IDs
 * from it, visits each property's ownership page (up to MAX_PROPERTIES) and
 * writes the collected leads to OUTPUT_FILE as JSON.
 *
 * If the SEARCH_ID environment variable is set, that search is opened
 * directly; otherwise the search ID is read from the URL the app lands on.
 *
 * A property that fails to load or parse is logged and skipped so the leads
 * already collected are still saved. The browser is always closed before this
 * function settles; on failure an error screenshot is attempted first.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} If SEARCH_ID is malformed, login fails, no search ID can be
 *   determined, the search page has no property links, or every property
 *   fails to scrape.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v11 (PUPPETEER + EMAILS/PHONES)...\n');

  const requestedSearchId = (process.env.SEARCH_ID || '').trim();
  if (requestedSearchId && !/^[a-f0-9-]+$/.test(requestedSearchId)) {
    throw new Error(`Invalid SEARCH_ID: ${requestedSearchId}`);
  }

  const browser = await puppeteer.launch({
    headless: HEADLESS,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  let page = null;

  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Login to Reonomy
    log('\n🔐 Step 1: Logging into Reonomy...');

    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });

    await sleep(2000);

    // Make sure the form has rendered before typing into it.
    await page.waitForSelector('input[type="email"]', { timeout: 30000 });
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');

    log('⏳ Waiting for login...');
    await sleep(15000);

    // Still on a login/auth URL means the credentials were not accepted.
    const urlAfterLogin = page.url();
    if (urlAfterLogin.includes('login') || urlAfterLogin.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Step 2: Navigate to search and determine the search ID
    log('\n📍 Step 2: Navigating to search...');

    const searchUrl = requestedSearchId
      ? `https://app.reonomy.com/#!/search/${requestedSearchId}`
      : 'https://app.reonomy.com/#!/search';
    await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await sleep(3000);

    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    const searchId = urlMatch ? urlMatch[1] : requestedSearchId;
    if (!searchId) {
      throw new Error('Could not extract search ID from URL');
    }
    log(`✅ Search ID: ${searchId}`);

    // Step 3: Extract property IDs
    log('\n📍 Step 3: Extracting property IDs...');

    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);

    if (propertyIds.length === 0) {
      throw new Error('No properties found on search page.');
    }

    // Step 4: Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);

    log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);

    const leads = [];

    for (let i = 0; i < propertiesToScrape.length; i++) {
      const prop = propertiesToScrape[i];

      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      // Navigate directly to the ownership page instead of clicking cards.
      const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;

      try {
        log(` 🔗 Navigating to ownership page...`);
        await page.goto(ownershipUrl, {
          waitUntil: 'networkidle2',
          timeout: 30000,
        });

        // Wait for Owner tab to load
        log(` ⏳ Waiting for Owner tab to load...`);
        await sleep(8000);

        // Extract ALL data from Owner tab
        log(` 📊 Extracting data from Owner tab...`);
        const ownerData = await extractOwnerTabData(page);

        log(` 📧 Emails: ${ownerData.emails.length} found`);
        log(` 📞 Phones: ${ownerData.phones.length} found`);
        log(` 👤 Owners: ${ownerData.ownerNames.length} found`);
        log(` 📍 Address: ${ownerData.propertyAddress || 'N/A'}`);

        leads.push({
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: ownershipUrl,
          address: ownerData.propertyAddress || '',
          city: ownerData.city || '',
          state: ownerData.state || '',
          zip: ownerData.zip || '',
          squareFootage: ownerData.squareFootage || '',
          propertyType: ownerData.propertyType || '',
          ownerNames: ownerData.ownerNames.join('; '),
          emails: ownerData.emails,
          phones: ownerData.phones,
          searchLocation: SEARCH_LOCATION,
          searchId: searchId,
        });

        // Screenshot for debugging (first 3 properties only)
        if (i < 3) {
          const screenshotPath = `/tmp/reonomy-v11-property-${i + 1}.png`;
          await page.screenshot({ path: screenshotPath, fullPage: false });
          log(` 📸 Screenshot saved: ${screenshotPath}`);
        }
      } catch (propertyError) {
        // One bad property must not discard the leads collected so far.
        log(` ⚠️ Skipping property ${prop.id}: ${propertyError.message}`);
      }
    }

    // Nothing succeeded at all: report that as a failure, not an empty run.
    if (leads.length === 0) {
      throw new Error(`All ${propertiesToScrape.length} properties failed to scrape.`);
    }

    // Step 5: Save results
    log(`\n✅ Total leads scraped: ${leads.length}`);

    const outputData = {
      scrapeDate: new Date().toISOString(),
      searchId: searchId,
      searchLocation: SEARCH_LOCATION,
      leadCount: leads.length,
      leads: leads,
    };

    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
    log(`💾 Saved to: ${OUTPUT_FILE}`);

    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    // The page only exists in this scope, so the error-state screenshot has
    // to be taken here rather than by the caller.
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v11-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v11-error.png');
      } catch (screenshotError) {
        log('Could not save error screenshot');
      }
    }
    throw error;
  } finally {
    // Never let a failing close() mask the error that is already propagating.
    try {
      await browser.close();
      log('\n🔚 Browser closed');
    } catch (closeError) {
      log(`Could not close browser: ${closeError.message}`);
    }
  }
}

/**
 * Main execution
 *
 * Runs the scraper and maps its outcome to the process exit code: 0 when
 * scrapeLeads() resolves, 1 when it rejects (after logging the error message
 * and stack). The browser and page are local to scrapeLeads(), so they are
 * deliberately not touched here.
 */
(async () => {
  try {
    await scrapeLeads();
    process.exit(0);
  } catch (error) {
    const message = error && error.message ? error.message : String(error);
    log(`\n❌ Error: ${message}`);
    if (error && error.stack) {
      log(error.stack);
    }
    process.exit(1);
  }
})();