clawdbot-workspace/reonomy-scraper-v11-puppeteer.js

367 lines
11 KiB
JavaScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v11 - PUPPETEER (PROVEN BASE + EMAILS/PHONES)
*
* Based on v9 (Puppeteer) - proven working version
* Adds email and phone extraction logic to v9
* Uses direct ownership URLs (no property card clicking)
*
* Usage:
* SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v11-puppeteer.js
* Or set as environment variable
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// Every setting can be overridden through an environment variable of the same
// name (REONOMY_LOCATION for SEARCH_LOCATION).
//
// SECURITY(review): the e-mail/password fallbacks below are real-looking
// credentials committed to source control. They are kept only so existing
// invocations keep working; rotate them and supply REONOMY_EMAIL /
// REONOMY_PASSWORD via the environment instead.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// Parse as base 10 and fall back to 20 for missing, non-numeric, zero or
// negative values (a negative limit would make slice(0, n) drop items from the
// end of the list instead of limiting it).
const parsedMaxProperties = Number.parseInt(process.env.MAX_PROPERTIES, 10);
const MAX_PROPERTIES = parsedMaxProperties > 0 ? parsedMaxProperties : 20;
// Headless unless HEADLESS is set to exactly the string 'false'.
const HEADLESS = process.env.HEADLESS !== 'false';
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v11-puppeteer.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v11.log');
/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * The console receives the bare message; the log file receives
 * "[<ISO-8601 timestamp>] <message>" followed by a newline. The file write is
 * synchronous.
 *
 * @param {*} message - Value to log (string-interpolated for the file entry).
 */
function log(message) {
  const stamp = new Date().toISOString();
  console.log(message);
  fs.appendFileSync(LOG_FILE, `[${stamp}] ${message}\n`);
}
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<undefined>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Extract ALL data from Owner tab
 *
 * Reads the page that is currently loaded and collects:
 *  - the property ID from the page URL,
 *  - address / city / state / zip from the first h1/h2/h3 that looks like
 *    "<number> <street>, <city>, <ST> <zip>",
 *  - square footage, property type and owner names from the page text,
 *  - phone numbers from the phone paragraph elements,
 *  - e-mail addresses from mailto: links and from the page text.
 *
 * All DOM reads go through page.evaluate(), which returns serialized values
 * (strings / arrays); parsing happens on the Node side.
 *
 * @param {object} page - Puppeteer page; url(), $() and evaluate() are used.
 * @returns {Promise<{propertyId: string, propertyAddress: string, city: string,
 *   state: string, zip: string, squareFootage: string, propertyType: string,
 *   ownerNames: string[], emails: string[], phones: string[]}>}
 *   Fields that could not be found are '' (or an empty array).
 */
async function extractOwnerTabData(page) {
  log('📊 Extracting Owner tab data...');

  // Extract property ID from URL
  const propIdMatch = page.url().match(/property\/([a-f0-9-]+)/);
  const propertyId = propIdMatch ? propIdMatch[1] : '';

  // Address: the first heading whose text parses as an address wins.
  let address = { propertyAddress: '', city: '', state: '', zip: '' };
  for (const sel of ['h1', 'h2', 'h3']) {
    const heading = await page.$(sel);
    if (!heading) {
      continue;
    }
    const text = ((await page.evaluate(el => el.textContent, heading)) || '').trim();
    const parsed = parseOwnerTabAddress(text);
    if (parsed) {
      address = parsed;
      log(` 📍 Address: ${text}`);
      break;
    }
  }

  // page.evaluate() already hands back the plain innerText string; there is
  // no JSON envelope to unwrap.
  const bodyText = (await page.evaluate(() => document.body.innerText)) || '';

  // Square footage
  const sfMatch = bodyText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
  const squareFootage = sfMatch ? sfMatch[0] : '';
  if (squareFootage) {
    log(` 📐 Square Footage: ${squareFootage}`);
  }

  // Property type. More specific labels come before labels they contain
  // ('General Industrial' before 'Industrial'), otherwise they can never match.
  const typePatterns = ['Warehouse', 'Office Building', 'Retail Stores', 'General Industrial', 'Industrial', 'Medical Building', 'School', 'Religious', 'Supermarket', 'Financial Building'];
  const propertyType = typePatterns.find(type => bodyText.includes(type)) || '';
  if (propertyType) {
    log(` 🏢 Property Type: ${propertyType}`);
  }

  // Owner names
  const ownerNames = findOwnerTabOwnerNames(bodyText);
  log(` 👤 Owners found: ${ownerNames.length}`);

  // Phones. NOTE(review): the jss1797/jss1798 class names look auto-generated
  // and may change between site builds; re-verify the selector if phones stop
  // being found.
  const rawPhones = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2')).map(p => p.textContent.trim()).filter(text => text.length >= 10);
  });
  const phones = cleanOwnerTabPhones(rawPhones);
  log(` 📞 Phones found: ${phones.length}`);

  // Emails: mailto: links first, then anything e-mail shaped in the page text.
  const mailtoHrefs = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('a[href^="mailto:"]')).map(a => a.href);
  });
  const emails = collectOwnerTabEmails(mailtoHrefs, bodyText);
  log(` 📧 Emails found: ${emails.length}`);

  return {
    propertyId,
    propertyAddress: address.propertyAddress,
    city: address.city,
    state: address.state,
    zip: address.zip,
    squareFootage,
    propertyType,
    ownerNames,
    emails,
    phones
  };
}

/** Parse "<number> <street>, <city>, <ST> <zip>" heading text; null when it does not match. */
function parseOwnerTabAddress(text) {
  const match = text.match(/^(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
  if (!match) {
    return null;
  }
  // Group 1 is the street; city/state/zip are groups 2-4.
  return {
    propertyAddress: match[0],
    city: match[2].trim(),
    state: match[3],
    zip: match[4]
  };
}

/** Collect unique owner names that follow "Owner:" or "Owns N properties" in the page text. */
function findOwnerTabOwnerNames(bodyText) {
  // One or more capitalized words on a single line, optionally followed by a
  // company suffix. All-caps leading words (e.g. "ABC Holdings") are not matched.
  const name = String.raw`([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management)\b)?)`;
  const patterns = [
    new RegExp(String.raw`Owner:\s*${name}`, 'g'),
    new RegExp(String.raw`Owns\s+\d+\s+propert(?:y|ies)\s*${name}`, 'g')
  ];
  const owners = [];
  for (const pattern of patterns) {
    for (const match of bodyText.matchAll(pattern)) {
      const owner = match[1].trim();
      if (owner.length > 3 && !owners.includes(owner)) {
        owners.push(owner);
      }
    }
  }
  return owners;
}

/** Strip phone formatting characters and keep unique entries containing at least 10 digits. */
function cleanOwnerTabPhones(rawPhones) {
  const phones = [];
  for (const raw of Array.isArray(rawPhones) ? rawPhones : []) {
    const cleaned = String(raw).replace(/[\s\-().]/g, '');
    // Counting digits (not characters) keeps ordinary paragraph text out.
    const digitCount = cleaned.replace(/\D/g, '').length;
    if (digitCount >= 10 && !phones.includes(cleaned)) {
      phones.push(cleaned);
    }
  }
  return phones;
}

/** Merge addresses from mailto: hrefs and from free text into a unique list. */
function collectOwnerTabEmails(mailtoHrefs, bodyText) {
  const fromLinks = (Array.isArray(mailtoHrefs) ? mailtoHrefs : []).flatMap(href =>
    String(href)
      .replace(/^mailto:/i, '')
      .split('?')[0] // drop ?subject=... style parameters
      .split(',')
      .map(part => part.trim())
  );
  const fromText = bodyText.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) || [];
  return [...new Set([...fromLinks, ...fromText])].filter(
    email => email.length > 5 && email.includes('@')
  );
}
/**
 * Extract property IDs from search results
 *
 * Scans every link whose href contains "/property/" and returns one entry per
 * distinct property ID, in document order.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @returns {Promise<Array<{id: string, url: string}>>} `url` is the property
 *   URL under the current search; when the current location carries no search
 *   ID, the link's own href is returned instead.
 */
async function extractPropertyIds(page) {
  return page.evaluate(() => {
    // Runs inside the browser: only page globals are available here.
    // Take the search ID from the location by pattern; splitting the URL on
    // '/' and picking a fixed index returns the literal "search" segment for
    // https://app.reonomy.com/#!/search/<id>.
    const searchMatch = window.location.href.match(/search\/([a-f0-9-]+)/);
    const searchId = searchMatch ? searchMatch[1] : '';
    const seen = new Set();
    const ids = [];
    for (const link of document.querySelectorAll('a[href*="/property/"]')) {
      const match = link.href.match(/property\/([a-f0-9-]+)/);
      // Skip non-matching links and repeat links to the same property.
      if (!match || seen.has(match[1])) {
        continue;
      }
      seen.add(match[1]);
      ids.push({
        id: match[1],
        url: searchId
          ? `https://app.reonomy.com/#!/search/${searchId}/property/${match[1]}`
          : link.href
      });
    }
    return ids;
  });
}
/**
 * Main scraper function
 *
 * Launches a browser, logs in, opens the search page, collects the property
 * IDs listed there and visits each property's ownership page (up to
 * MAX_PROPERTIES) to extract lead data, then writes all leads to OUTPUT_FILE.
 *
 * If the SEARCH_ID environment variable is set it is opened directly;
 * otherwise the search ID is read from the URL the search page ends up on.
 *
 * The browser is always closed before this function settles. On failure a
 * best-effort screenshot is written to /tmp/reonomy-v11-error.png and the
 * original error is rethrown.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When login fails, no search ID can be determined, no
 *   properties are listed, or any navigation/extraction step throws.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v11 (PUPPETEER + EMAILS/PHONES)...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });
  let page = null;
  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Login to Reonomy
    log('\n🔐 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await sleep(15000);

    // Still being on a login/auth URL after the wait is treated as a failed login.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Step 2: Navigate to search (directly to SEARCH_ID when one is supplied)
    const requestedSearchId = (process.env.SEARCH_ID || '').trim();
    if (requestedSearchId && !/^[a-f0-9-]+$/.test(requestedSearchId)) {
      throw new Error(`Invalid SEARCH_ID: ${requestedSearchId}`);
    }
    const searchUrl = requestedSearchId
      ? `https://app.reonomy.com/#!/search/${requestedSearchId}`
      : 'https://app.reonomy.com/#!/search';
    log('\n📍 Step 2: Navigating to search...');
    await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);

    // Step 3: Extract search ID from URL
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    const searchId = urlMatch ? urlMatch[1] : requestedSearchId;
    if (!searchId) {
      throw new Error('Could not extract search ID from URL');
    }
    log(`✅ Search ID: ${searchId}`);

    // Step 4: Extract property IDs
    log('\n📍 Step 3: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      throw new Error('No properties found on search page.');
    }

    // Step 5: Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);
    const leads = [];
    for (let i = 0; i < propertiesToScrape.length; i++) {
      const prop = propertiesToScrape[i];
      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      // Navigate directly to the ownership page instead of clicking through cards.
      const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;
      log(` 🔗 Navigating to ownership page...`);
      await page.goto(ownershipUrl, {
        waitUntil: 'networkidle2',
        timeout: 30000
      });

      // Fixed wait for the Owner tab content to render.
      log(` ⏳ Waiting for Owner tab to load...`);
      await sleep(8000);

      log(` 📊 Extracting data from Owner tab...`);
      const ownerData = await extractOwnerTabData(page);
      log(` 📧 Emails: ${ownerData.emails.length} found`);
      log(` 📞 Phones: ${ownerData.phones.length} found`);
      log(` 👤 Owners: ${ownerData.ownerNames.length} found`);
      log(` 📍 Address: ${ownerData.propertyAddress || 'N/A'}`);

      leads.push({
        scrapeDate: new Date().toISOString().split('T')[0],
        propertyId: prop.id,
        propertyUrl: ownershipUrl,
        address: ownerData.propertyAddress || '',
        city: ownerData.city || '',
        state: ownerData.state || '',
        zip: ownerData.zip || '',
        squareFootage: ownerData.squareFootage || '',
        propertyType: ownerData.propertyType || '',
        ownerNames: ownerData.ownerNames.join('; ') || '',
        emails: ownerData.emails,
        phones: ownerData.phones,
        searchLocation: SEARCH_LOCATION,
        searchId: searchId
      });

      // Screenshot for debugging (first 3 properties only)
      if (i < 3) {
        const screenshotPath = `/tmp/reonomy-v11-property-${i + 1}.png`;
        await page.screenshot({ path: screenshotPath, fullPage: false });
        log(` 📸 Screenshot saved: ${screenshotPath}`);
      }
    }

    // Step 6: Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      const outputData = {
        scrapeDate: new Date().toISOString(),
        searchId: searchId,
        searchLocation: SEARCH_LOCATION,
        leadCount: leads.length,
        leads: leads
      };
      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    // The page only exists in this scope, so the error-state screenshot has
    // to be captured here. It is best-effort and must not mask the real error.
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v11-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v11-error.png');
      } catch (screenshotError) {
        log(`Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    // Always release the browser process, on success and on failure.
    try {
      await browser.close();
      log('\n🔚 Browser closed');
    } catch (closeError) {
      log(`Could not close browser: ${closeError.message}`);
    }
  }
}
/**
 * Main execution
 *
 * Runs the scraper and exits with status 0 on success. On failure the error
 * message and stack are logged and the process exits with status 1.
 *
 * The Puppeteer page and browser are local to scrapeLeads() and are not in
 * scope here, so this handler must not touch them: referencing them throws a
 * ReferenceError inside the catch block and prevents process.exit(1) from
 * being reached.
 */
(async () => {
  try {
    await scrapeLeads();
    process.exit(0);
  } catch (error) {
    const isError = error instanceof Error;
    log(`\n❌ Error: ${isError ? error.message : String(error)}`);
    if (isError && error.stack) {
      log(error.stack);
    }
    process.exit(1);
  }
})();