clawdbot-workspace/reonomy-scraper-v12-fresh.js

355 lines
11 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v12 - FRESH START - CLEAN SLATE
*
* Proven foundation from v9 (Puppeteer)
* Fixed email/phone extraction (no complex regex)
* Extracts from BOTH Builder and Lot AND Owner tabs
* Uses direct ownership URLs (from research)
*
* Key improvements over v9:
* - Moved email/phone extraction BEFORE return statement (now executes!)
* - Simplified regex patterns (avoids syntax errors)
* - Added Builder and Lot tab extraction
* - Uses your CSS selector for phones: p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2
* - Uses direct ownership URL navigation (no property card clicking)
*
* Usage:
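* REONOMY_SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v12-fresh.js
* Or export REONOMY_SEARCH_ID beforehand. Other optional environment variables:
* REONOMY_EMAIL, REONOMY_PASSWORD, MAX_PROPERTIES (default 20), HEADLESS ("false" shows the browser).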
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration (each value can be overridden through an environment variable)
//
// SECURITY NOTE(review): the fallback credentials below are committed in plain
// text. Prefer supplying REONOMY_EMAIL / REONOMY_PASSWORD through the
// environment, and consider rotating the password and dropping these defaults.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
// REONOMY_SEARCH_ID is the primary variable; SEARCH_ID is honoured as a shorter
// alias so that either spelling selects the saved search.
const SEARCH_ID =
  process.env.REONOMY_SEARCH_ID ||
  process.env.SEARCH_ID ||
  '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';
// Environment values are always strings, so parse the limit into a real integer
// and fall back to 20 when it is missing, non-numeric or negative.
const MAX_PROPERTIES = (() => {
  const parsed = Number.parseInt(process.env.MAX_PROPERTIES, 10);
  return Number.isInteger(parsed) && parsed >= 0 ? parsed : 20;
})();
// Headless unless HEADLESS is exactly the string "false".
const HEADLESS = process.env.HEADLESS !== 'false';
// Output and log files are written next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v12-fresh.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v12.log');
/**
 * Report a message on the console and append the same message, prefixed with
 * an ISO-8601 timestamp in square brackets, to the file at LOG_FILE.
 *
 * @param {string} message - Text to report.
 */
function log(message) {
  const stamp = new Date().toISOString();
  console.log(message);
  // The file copy carries the timestamp; the console copy does not.
  fs.appendFileSync(LOG_FILE, `[${stamp}] ${message}\n`);
}
/**
 * Pause for a given duration.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Extract data from Builder and Lot tab
 *
 * Reads the visible text of the current page and pulls out the first
 * square-footage figure and the first known property type it contains.
 *
 * @param {object} page - Puppeteer page that is already showing the property.
 * @returns {Promise<{squareFootage: string, propertyType: string}>} Extracted
 *   values; each field is an empty string when nothing matched.
 */
async function extractBuilderLotData(page) {
  log('📊 Extracting Builder and Lot data...');
  const data = await page.evaluate(() => {
    const result = {
      squareFootage: '',
      propertyType: ''
    };
    // Get page text
    const bodyText = document.body.innerText;

    // Extract square footage. Thousands separators are accepted so that
    // "12,500 SF" is captured whole instead of being truncated to "500 SF".
    const sfMatch = bodyText.match(/\d[\d,]*(?:\.\d*)?\s*k?\s*SF/i);
    if (sfMatch) {
      result.squareFootage = sfMatch[0];
    }

    // Extract property type (simple substring patterns, first hit wins).
    // 'General Industrial' must be tested before 'Industrial': any text that
    // contains the longer label also contains the shorter one, so the reverse
    // order would make 'General Industrial' unreachable.
    const typePatterns = [
      'Warehouse', 'Office Building', 'Retail Stores', 'General Industrial',
      'Industrial', 'Medical Building', 'School', 'Religious',
      'Supermarket', 'Financial Building', 'Residential', 'Vacant Land',
      'Tax Exempt', 'Mixed Use'
    ];
    const matchedType = typePatterns.find((type) => bodyText.includes(type));
    if (matchedType) {
      result.propertyType = matchedType;
    }
    return result;
  });
  log(` 📐 Square Footage: ${data.squareFootage}`);
  log(` 🏢 Property Type: ${data.propertyType}`);
  return data;
}
/**
 * Extract data from Owner tab (CRITICAL - emails + phones)
 *
 * Collects e-mail addresses (from mailto: links and from the visible text),
 * phone numbers (from elements matching a fixed CSS selector) and owner names
 * (from "Owns N properties ..." lines) on the current page.
 *
 * @param {object} page - Puppeteer page that is already showing the ownership view.
 * @returns {Promise<{emails: string[], phones: string[], ownerNames: string[]}>}
 *   De-duplicated values in the order they were found.
 */
async function extractOwnerTabData(page) {
  log('👤 Extracting Owner tab data...');
  const data = await page.evaluate(() => {
    const result = {
      emails: [],
      phones: [],
      ownerNames: []
    };

    // E-mail addresses are compared case-insensitively so the same address
    // found in a link and in the text is stored only once (first spelling wins).
    const seenEmails = new Set();
    const addEmail = (raw) => {
      const email = raw.trim();
      const key = email.toLowerCase();
      if (email.length > 5 && !seenEmails.has(key)) {
        seenEmails.add(key);
        result.emails.push(email);
      }
    };

    // Extract emails from mailto: links. The href may carry a "?subject=..."
    // query, several comma-separated recipients and percent-encoding, none of
    // which belong in the stored address.
    const mailtoLinks = Array.from(document.querySelectorAll('a[href^="mailto:"]'));
    mailtoLinks.forEach(a => {
      const recipients = a.href.replace(/^mailto:/i, '').split('?')[0];
      recipients.split(',').forEach(recipient => {
        let decoded = recipient;
        try {
          decoded = decodeURIComponent(recipient);
        } catch (e) {
          // Malformed escape sequence: keep the raw text.
        }
        addEmail(decoded);
      });
    });

    // Also try email patterns in text
    const bodyText = document.body.innerText;
    const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
    const emailMatches = bodyText.match(emailPattern);
    if (emailMatches) {
      emailMatches.forEach(addEmail);
    }

    // Extract phones using the inspected CSS selector.
    // NOTE(review): the jssNNNN class names look auto-generated and may change
    // between site builds - confirm the selector still matches.
    const phoneElements = Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2'));
    phoneElements.forEach(p => {
      const text = p.textContent.trim();
      // Clean phone numbers (remove spaces, dashes and parentheses)
      const cleanPhone = text.replace(/[\s\-\(\)]/g, '');
      // Require at least 10 digits so ordinary text that happens to use the
      // same styling is not recorded as a phone number.
      const digitCount = cleanPhone.replace(/\D/g, '').length;
      if (digitCount >= 10 && !result.phones.includes(cleanPhone)) {
        result.phones.push(cleanPhone);
      }
    });

    // Extract owner names from lines such as "Owns 12 properties Acme LLC".
    // The name is the text after "property"/"properties"; the number before it
    // is the portfolio size and is not a name.
    // NOTE(review): assumes the name follows on the same line - verify against
    // the live page layout.
    const ownerPattern = /Owns\s+\d+\s+propert(?:y|ies)\s*([A-Za-z][A-Za-z0-9 .,&'-]*)/i;
    const ownerLines = bodyText.split('\n');
    for (const line of ownerLines) {
      const ownerMatch = line.match(ownerPattern);
      if (ownerMatch) {
        const owner = ownerMatch[1].trim();
        if (owner && owner.length > 3 && !result.ownerNames.includes(owner)) {
          result.ownerNames.push(owner);
        }
      }
    }
    return result;
  });
  log(` 📧 Emails: ${data.emails.length} found`);
  log(` 📞 Phones: ${data.phones.length} found`);
  log(` 👤 Owners: ${data.ownerNames.length} found`);
  return data;
}
/**
 * Main scraper
 *
 * Launches Puppeteer, logs into Reonomy, opens the configured search, collects
 * the property IDs linked from the results page, then visits each property's
 * ownership page and gathers building/lot and owner details. The collected
 * leads are written to OUTPUT_FILE as JSON.
 *
 * A property that fails to load or extract is logged and skipped so that the
 * leads already gathered are still saved.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   scraped and the path of the JSON output file.
 * @throws {Error} When login fails, the search ID cannot be read back from the
 *   URL, no property links are found, or every property fails. The error is
 *   logged (with a best-effort screenshot) and rethrown; choosing the process
 *   exit code is left to the caller.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v12 (FRESH START)...\n');
  // Launch browser
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  // Declared outside the try so the error handler can attempt a screenshot;
  // created inside it so the browser is still closed if page setup throws.
  let page;
  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Login to Reonomy
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await sleep(15000);
    // Still being on a login/auth URL is treated as a failed login.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Step 2: Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto(`https://app.reonomy.com/#!/search/${SEARCH_ID}`, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);
    // Read the search ID back from the URL the page actually ended up on.
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Step 3: Extract property IDs
    log('\n📍 Step 3: Extracting property IDs...');
    // The callback runs inside the browser page, where Node-side variables do
    // not exist, so the search ID has to be handed over as an argument.
    const propertyIds = await page.evaluate((activeSearchId) => {
      const ids = [];
      const seen = new Set();
      const links = document.querySelectorAll('a[href*="/property/"]');
      links.forEach(link => {
        const match = link.href.match(/property\/([a-f0-9-]+)/);
        // Several links can point at the same property; keep each ID once.
        if (match && !seen.has(match[1])) {
          seen.add(match[1]);
          ids.push({
            id: match[1],
            url: `https://app.reonomy.com/#!/search/${activeSearchId}/property/${match[1]}`
          });
        }
      });
      return ids;
    }, searchId);
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      throw new Error('No properties found on search page.');
    }

    // Step 4: Process each property
    // MAX_PROPERTIES may be a string when it comes from the environment.
    const propertiesToScrape = propertyIds.slice(0, Number(MAX_PROPERTIES));
    log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);
    const leads = [];
    let failedCount = 0;
    for (const [i, prop] of propertiesToScrape.entries()) {
      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);
      // Navigate directly to ownership page (from research - no clicking property cards)
      const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;
      try {
        log(` 🔗 Navigating to ownership page...`);
        await page.goto(ownershipUrl, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });
        // Wait for page to load
        log(` ⏳ Waiting for Owner tab to load...`);
        await sleep(5000);
        // Extract from Builder and Lot tab
        log(` 📊 Extracting Builder and Lot data...`);
        const builderLotData = await extractBuilderLotData(page);
        // Wait a bit before extracting from Owner tab
        await sleep(1000);
        // Extract from Owner tab (CRITICAL: emails + phones)
        log(` 👤 Extracting Owner tab data...`);
        const ownerData = await extractOwnerTabData(page);
        const lead = {
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: ownershipUrl,
          ...builderLotData,
          ...ownerData
        };
        log(` 📧 Emails: ${lead.emails.length} found`);
        log(` 📞 Phones: ${lead.phones.length} found`);
        log(` 👤 Owners: ${lead.ownerNames.length} found`);
        // NOTE(review): nothing above sets lead.address, so this prints N/A.
        log(` 📍 Address: ${lead.address || 'N/A'}`);
        log(` 🏢 Property Type: ${lead.propertyType || 'N/A'}`);
        log(` 📐 Square Footage: ${lead.squareFootage || 'N/A'}`);
        leads.push(lead);
        // Screenshot for debugging (first 3 properties only); a screenshot
        // failure must not discard the lead that was just collected.
        if (i < 3) {
          const screenshotPath = `/tmp/reonomy-v12-property-${i + 1}.png`;
          try {
            await page.screenshot({ path: screenshotPath, fullPage: false });
            log(` 📸 Screenshot saved: ${screenshotPath}`);
          } catch (screenshotError) {
            log(` ⚠ Could not save screenshot: ${screenshotError.message}`);
          }
        }
      } catch (propertyError) {
        // Keep going: one bad property should not throw away earlier leads.
        failedCount += 1;
        log(` ❌ Property ${prop.id} failed: ${propertyError.message}`);
      }
    }
    // If nothing succeeded the run as a whole has failed.
    if (leads.length === 0 && failedCount > 0) {
      throw new Error(`All ${failedCount} properties failed to scrape.`);
    }

    // Step 5: Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      const outputData = {
        scrapeDate: new Date().toISOString(),
        searchId: searchId,
        leadCount: leads.length,
        leads: leads
      };
      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // Take screenshot of error state
    try {
      await page.screenshot({ path: '/tmp/reonomy-v12-error.png', fullPage: true });
      log('📸 Error screenshot saved: /tmp/reonomy-v12-error.png');
    } catch (e) {
      log('Could not save error screenshot');
    }
    throw error;
  } finally {
    // No process.exit() here: exiting inside finally would end the process
    // with status 0 even after a failure and before the caller's handlers run.
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Run: start the scraper and turn its outcome into a process exit code
// (0 when it resolves, 1 when it rejects or the success reporting throws).
(async () => {
  try {
    const result = await scrapeLeads();
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
})();