clawdbot-workspace/reonomy-scraper-v9.1-fixed.js

354 lines
11 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v9.1 - FIXED EDITION
*
* Critical fix: Moved email/phone extraction logic BEFORE return statement
* This ensures extraction code actually executes
*
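* Usage:
* REONOMY_SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v9.1-fixed.js
* Optional environment variables: REONOMY_EMAIL, REONOMY_PASSWORD, MAX_PROPERTIES, HEADLESS (set to "false" to show the browser)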
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
// SECURITY NOTE(review): the fallback credentials below are hard-coded in source.
// They are kept only so existing invocations keep working; they should be rotated
// and supplied exclusively through REONOMY_EMAIL / REONOMY_PASSWORD.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
// The usage text in the file header documents SEARCH_ID, so accept it as an alias
// of REONOMY_SEARCH_ID instead of silently ignoring it.
const SEARCH_ID =
  process.env.REONOMY_SEARCH_ID ||
  process.env.SEARCH_ID ||
  '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';
// Environment values are strings: parse once and fall back to 20 when the value
// is missing, non-numeric, or not a positive integer.
const parsedMaxProperties = Number.parseInt(process.env.MAX_PROPERTIES, 10);
const MAX_PROPERTIES =
  Number.isInteger(parsedMaxProperties) && parsedMaxProperties > 0 ? parsedMaxProperties : 20;
// Any value other than the exact string "false" keeps the browser headless.
const HEADLESS = process.env.HEADLESS !== 'false';
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v9.1-fixed.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v9.1-fixed.log');
/**
 * Print a message to stdout and append it to LOG_FILE.
 * The file entry is prefixed with the current ISO-8601 timestamp in square
 * brackets and terminated with a newline; the console output is the bare message.
 *
 * @param {*} message - Value to log; it is interpolated into the file entry.
 */
function log(message) {
  const entry = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, entry);
}
/**
 * Return a promise that resolves (with no value) after the given delay.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<undefined>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Find the first square-footage figure (e.g. "12,500 SF", "12.5k SF") in the text.
 * Returns '' when none is present.
 */
function parseSquareFootage(text) {
  const sfMatch = text.match(/\d[\d,]*(?:\.\d+)?\s*k?\s*SF\b/i);
  return sfMatch ? sfMatch[0] : '';
}

/**
 * Return the first known property type mentioned in the text, or ''.
 * More specific labels come before their substrings ('General Industrial'
 * before 'Industrial'), otherwise the specific label could never be reported.
 */
function detectPropertyType(text) {
  const typePatterns = [
    'Warehouse', 'Office Building', 'Retail Stores', 'General Industrial',
    'Industrial', 'Medical Building', 'School', 'Religious',
    'Supermarket', 'Financial Building', 'Residential', 'Vacant Land',
    'Tax Exempt', 'Mixed Use'
  ];
  return typePatterns.find((type) => text.includes(type)) || '';
}

/**
 * Collect unique owner names that follow an "Owns N properties" marker.
 * The pattern is a reconstruction of the original (syntactically invalid) regex:
 * capitalised words on one line, optionally ending in a company suffix.
 * NOTE(review): confirm against the live page that the name follows the marker.
 */
function parseOwnerNames(text) {
  // Created per call: a /g regex carries lastIndex state between exec() calls.
  const ownerPattern = /Owns\s+\d+\s+propert(?:y|ies)\s*([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management)\b)?)/g;
  const names = [];
  let match = ownerPattern.exec(text);
  while (match !== null) {
    const owner = match[1].trim();
    if (owner.length > 3 && !names.includes(owner)) {
      names.push(owner);
    }
    match = ownerPattern.exec(text);
  }
  return names;
}

/**
 * Merge addresses from mailto: hrefs with addresses found in free text,
 * de-duplicated, in first-seen order.
 */
function collectEmails(mailtoHrefs, text) {
  const emails = new Set();
  for (const href of mailtoHrefs) {
    // "mailto:a@b.com?subject=Hi" -> "a@b.com"
    const address = String(href).replace(/^mailto:/i, '').split('?')[0].trim();
    if (address) {
      emails.add(address);
    }
  }
  const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
  for (const found of text.match(emailRegex) || []) {
    emails.add(found);
  }
  return Array.from(emails);
}

/**
 * Pull phone numbers out of raw strings and normalise them to bare digits
 * (keeping a leading "+"). Only candidates with 10-15 digits are kept, so
 * ordinary label text is no longer mistaken for a phone number.
 */
function normalizePhones(rawTexts) {
  const unique = new Set();
  for (const raw of rawTexts) {
    const candidates = String(raw).match(/\+?\(?\d[\d\s().-]{8,}\d/g) || [];
    for (const candidate of candidates) {
      const digits = candidate.replace(/\D/g, '');
      if (digits.length >= 10 && digits.length <= 15) {
        unique.add(candidate.startsWith('+') ? `+${digits}` : digits);
      }
    }
  }
  return Array.from(unique);
}

/**
 * Extract ALL data from the Owner tab of the property page loaded in `page`.
 *
 * Fixes over the previous revision:
 * - the page title and body text are actually read inside the browser context
 *   (the old snapshot returned empty placeholders and was then JSON.parse'd);
 * - page.$$eval is given a page function that returns serialisable strings;
 * - the owner regex is syntactically valid and yields the name, not the whole match;
 * - document.title is no longer referenced from Node.
 *
 * @param {object} page - Puppeteer Page; uses url(), evaluate() and $$eval().
 * @returns {Promise<object>} Object with propertyId, propertyAddress, city, state,
 *   zip (the four address fields are always '' — nothing extracts them here),
 *   squareFootage, propertyType, ownerNames, emails, phones, pageTitle and
 *   bodyTextSample (first 500 characters of the body text).
 */
async function extractOwnerTabData(page) {
  log('📊 Extracting Owner tab data...');

  // Read everything that needs the DOM in one round trip to the browser.
  const snapshot = await page.evaluate(() => ({
    pageTitle: document.title,
    bodyText: document.body ? document.body.innerText : ''
  }));
  const pageTitle = (snapshot && snapshot.pageTitle) || '';
  const ownerBodyText = (snapshot && snapshot.bodyText) || '';

  // Extract property ID from URL
  const propIdMatch = page.url().match(/property\/([a-f0-9-]+)/);
  const propertyId = propIdMatch ? propIdMatch[1] : '';

  // Property details (SF, type) and owner names come from the body text.
  const squareFootage = parseSquareFootage(ownerBodyText);
  const propertyType = detectPropertyType(ownerBodyText);
  if (propertyType) {
    log(` 🏢 Property Type: ${propertyType}`);
  }
  const ownerNames = parseOwnerNames(ownerBodyText);

  // Emails: mailto links first, then anything that looks like an address in the text.
  const mailtoHrefs = await page.$$eval('a[href^="mailto:"]', (anchors) =>
    anchors.map((anchor) => anchor.getAttribute('href') || '')
  );
  const emails = collectEmails(mailtoHrefs, ownerBodyText);

  // Phones: the inspected CSS selector, plus tel: links as a fallback.
  // NOTE(review): the jssNNNN class names look auto-generated and may change
  // between site builds — confirm the selector still matches.
  const phoneTexts = await page.$$eval(
    'p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2',
    (nodes) => nodes.map((node) => (node.textContent || '').trim())
  );
  const telHrefs = await page.$$eval('a[href^="tel:"]', (anchors) =>
    anchors.map((anchor) => anchor.getAttribute('href') || '')
  );
  const phones = normalizePhones([...phoneTexts, ...telHrefs]);

  const info = {
    propertyId,
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    squareFootage,
    propertyType,
    ownerNames,
    emails,
    phones,
    pageTitle,
    bodyTextSample: ownerBodyText.substring(0, 500)
  };
  log(` 📧 Emails: ${info.emails.length} found`);
  log(` 📞 Phones: ${info.phones.length} found`);
  log(` 👤 Owners: ${info.ownerNames.length} found`);
  return info;
}
/**
 * Extract property IDs from search results.
 *
 * Scans every link whose href contains "/property/" and returns one entry per
 * distinct property ID, in first-seen order. Duplicate links to the same
 * property are collapsed so a property is not scraped more than once.
 *
 * @param {object} page - Puppeteer Page showing a search results view.
 * @returns {Promise<Array<{id: string, url: string}>>} `url` is rebuilt from the
 *   search ID found in the page location; when the location has no search ID
 *   the link's own href is used instead.
 */
async function extractPropertyIds(page) {
  // The callback runs in the browser context, so it must be self-contained.
  return page.evaluate(() => {
    // The old code used href.split('/')[4], which is the literal "search"
    // segment for ".../#!/search/<id>" URLs; match the ID explicitly instead.
    const searchMatch = window.location.href.match(/search\/([a-f0-9-]+)/);
    const currentSearchId = searchMatch ? searchMatch[1] : '';
    const seen = new Set();
    const ids = [];
    document.querySelectorAll('a[href*="/property/"]').forEach((link) => {
      const match = link.href.match(/property\/([a-f0-9-]+)/);
      if (!match || seen.has(match[1])) {
        return;
      }
      seen.add(match[1]);
      ids.push({
        id: match[1],
        url: currentSearchId
          ? `https://app.reonomy.com/#!/search/${currentSearchId}/property/${match[1]}`
          : link.href
      });
    });
    return ids;
  });
}
/**
 * Main scraper function.
 *
 * Logs in, opens the saved search, collects property IDs, visits each
 * property's ownership page (up to MAX_PROPERTIES) and writes the collected
 * leads to OUTPUT_FILE as JSON.
 *
 * The browser is closed on both success and failure. On any error the
 * function logs it, tries to save an error screenshot, and terminates the
 * process with exit code 1.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Summary of the run
 *   (only reached on success).
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v9.1 (FIXED EDITION)...\n');
  // Step 1: Login to Reonomy
  log('\n🔐 Step 1: Logging in to Reonomy...');
  let browser;
  let page;
  let failed = false;
  try {
    // Launch inside the try so a launch failure is logged like any other error.
    browser = await puppeteer.launch({
      headless: HEADLESS,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await sleep(15000);
    // Still being on a login/auth URL after the wait means the login did not go through.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Step 2: Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto(`https://app.reonomy.com/#!/search/${SEARCH_ID}`, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);
    // Re-read the search ID from the URL the app actually ended up on.
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Step 3: Extract property IDs
    log('\n📍 Step 3: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      throw new Error('No properties found on search page.');
    }

    // Step 4: Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);
    // SEARCH_LOCATION is not defined in the visible file; referencing it directly
    // threw a ReferenceError on the first property. Use it only if it exists.
    const searchLocation = typeof SEARCH_LOCATION === 'undefined' ? '' : SEARCH_LOCATION;
    const leads = [];
    for (let i = 0; i < propertiesToScrape.length; i++) {
      const prop = propertiesToScrape[i];
      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);
      // Navigate directly to ownership page
      log(` 🔗 Navigating to ownership page...`);
      const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;
      await page.goto(ownershipUrl, {
        waitUntil: 'networkidle2',
        timeout: 30000
      });
      // Wait for Owner tab to load
      log(` ⏳ Waiting for Owner tab to load...`);
      await sleep(8000);
      // Extract data from Owner tab
      log(` 📊 Extracting data from Owner tab...`);
      const ownerData = await extractOwnerTabData(page);
      const lead = {
        scrapeDate: new Date().toISOString().split('T')[0],
        propertyId: prop.id,
        propertyUrl: ownershipUrl,
        address: ownerData.propertyAddress || '',
        city: ownerData.city || '',
        state: ownerData.state || '',
        zip: ownerData.zip || '',
        squareFootage: ownerData.squareFootage || '',
        propertyType: ownerData.propertyType || '',
        ownerNames: ownerData.ownerNames.join('; ') || '',
        emails: ownerData.emails,
        phones: ownerData.phones,
        searchLocation: searchLocation,
        searchId: searchId
      };
      log(` 📧 Emails: ${lead.emails.length}`);
      log(` 📞 Phones: ${lead.phones.length}`);
      // lead.ownerNames is a joined string, so count owners from the source array.
      log(` 👤 Owners: ${ownerData.ownerNames.length}`);
      // The lead stores the address under `address`, not `propertyAddress`.
      log(` 📍 Address: ${lead.address || 'N/A'}`);
      leads.push(lead);
      // Screenshot for debugging (first 3 properties only)
      if (i < 3) {
        const screenshotPath = `/tmp/reonomy-v9.1-property-${i + 1}.png`;
        await page.screenshot({ path: screenshotPath, fullPage: false });
        log(` 📸 Screenshot saved: ${screenshotPath}`);
      }
    }

    // Step 5: Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      const outputData = {
        scrapeDate: new Date().toISOString(),
        searchId: searchId,
        leadCount: leads.length,
        leads: leads
      };
      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    failed = true;
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // Take screenshot of error state (only possible once a page exists)
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v9.1-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v9.1-error.png');
      } catch (e) {
        log('Could not save error screenshot');
      }
    }
  } finally {
    // Close the browser on success too; the original left it open on the success path.
    if (browser) {
      try {
        await browser.close();
        log('\n🔚 Browser closed');
      } catch (closeError) {
        log(`Could not close browser: ${closeError.message}`);
      }
    }
    // Exit only after cleanup so the failure path keeps its exit code of 1.
    if (failed) {
      process.exit(1);
    }
  }
}
// Run
// Attach a rejection handler so a failure that escapes scrapeLeads() is reported
// and exits with code 1 instead of surfacing as an unhandled promise rejection.
scrapeLeads().catch((error) => {
  console.error(`Fatal: ${error && error.stack ? error.stack : error}`);
  process.exit(1);
});