#!/usr/bin/env node
/**
 * Reonomy Scraper v12 - FRESH START - CLEAN SLATE
 *
 * Logs into Reonomy with Puppeteer, opens a saved search, collects the
 * property IDs linked from the search page, visits each property's ownership
 * URL directly (no property-card clicking) and extracts:
 *   - square footage and property type (page text)
 *   - emails (mailto: links + text pattern)
 *   - phones (CSS selector p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2)
 *   - owner names ("Owns N properties <Name>" lines)
 *
 * Results are written to reonomy-leads-v12-fresh.json next to this script and
 * progress is appended to reonomy-scraper-v12.log.
 *
 * Usage:
 *   REONOMY_SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v12-fresh.js
 *
 * Environment variables:
 *   REONOMY_EMAIL, REONOMY_PASSWORD  - login credentials
 *   REONOMY_SEARCH_ID (or SEARCH_ID) - saved search to scrape
 *   MAX_PROPERTIES                   - positive integer, defaults to 20
 *   HEADLESS                         - set to "false" to show the browser
 */

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

// Configuration
// SECURITY FIXME: the credential fallbacks below are committed in source. They
// are kept only so existing invocations keep working; rotate them and supply
// REONOMY_EMAIL / REONOMY_PASSWORD through the environment instead.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';

// The documented usage sets SEARCH_ID while older invocations set
// REONOMY_SEARCH_ID, so both names are honoured.
const SEARCH_ID =
  process.env.REONOMY_SEARCH_ID ||
  process.env.SEARCH_ID ||
  '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';

// Environment values are strings; only accept a positive integer so that a
// typo cannot silently turn into "scrape nothing" via Array.prototype.slice.
const DEFAULT_MAX_PROPERTIES = 20;
const parsedMaxProperties = Number.parseInt(process.env.MAX_PROPERTIES, 10);
const MAX_PROPERTIES =
  Number.isInteger(parsedMaxProperties) && parsedMaxProperties > 0
    ? parsedMaxProperties
    : DEFAULT_MAX_PROPERTIES;

const HEADLESS = process.env.HEADLESS !== 'false';
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v12-fresh.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v12.log');

/**
 * Print a message to stdout and append it, timestamped, to LOG_FILE.
 *
 * @param {string} message - Text to log.
 */
function log(message) {
  const timestamp = new Date().toISOString();
  const logMessage = `[${timestamp}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, logMessage);
}

/**
 * Resolve after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Extract square footage and property type from the text of the current page.
 *
 * @param {object} page - Puppeteer page positioned on a property view.
 * @returns {Promise<{squareFootage: string, propertyType: string}>} Empty
 *   strings are returned for values that were not found.
 */
async function extractBuilderLotData(page) {
  log('šŸ“Š Extracting Builder and Lot data...');

  const data = await page.evaluate(() => {
    const result = {
      squareFootage: '',
      propertyType: ''
    };

    // Get page text
    const bodyText = document.body.innerText;

    // Extract square footage. Commas are allowed so "12,500 SF" is not
    // truncated to "500 SF".
    const sfMatch = bodyText.match(/\d[\d,]*\.?\d*\s*k?\s*SF/i);
    if (sfMatch) {
      result.squareFootage = sfMatch[0];
    }

    // Extract property type (simple substring patterns). More specific labels
    // must precede the labels they contain, otherwise "General Industrial"
    // would always be reported as "Industrial".
    const typePatterns = [
      'Warehouse',
      'Office Building',
      'Retail Stores',
      'General Industrial',
      'Industrial',
      'Medical Building',
      'School',
      'Religious',
      'Supermarket',
      'Financial Building',
      'Residential',
      'Vacant Land',
      'Tax Exempt',
      'Mixed Use'
    ];

    for (const type of typePatterns) {
      if (bodyText.includes(type)) {
        result.propertyType = type;
        break;
      }
    }

    return result;
  });

  log(` šŸ“ Square Footage: ${data.squareFootage}`);
  log(` šŸ¢ Property Type: ${data.propertyType}`);

  return data;
}

/**
 * Extract contact data (emails, phones, owner names) from the current page.
 *
 * @param {object} page - Puppeteer page positioned on a property ownership view.
 * @returns {Promise<{emails: string[], phones: string[], ownerNames: string[]}>}
 *   De-duplicated lists; each list is empty when nothing was found.
 */
async function extractOwnerTabData(page) {
  log('šŸ‘¤ Extracting Owner tab data...');

  const data = await page.evaluate(() => {
    const result = {
      emails: [],
      phones: [],
      ownerNames: []
    };

    // Emails from mailto: links. The href may carry a "?subject=..." query and
    // percent-encoding, neither of which belongs in the address itself.
    const mailtoLinks = Array.from(document.querySelectorAll('a[href^="mailto:"]'));
    mailtoLinks.forEach((a) => {
      const raw = a.href.replace(/^mailto:/i, '').split('?')[0];
      let email = raw;
      try {
        email = decodeURIComponent(raw);
      } catch (e) {
        // Malformed escape sequence: fall back to the raw text.
      }
      email = email.trim();
      if (email && email.length > 5 && !result.emails.includes(email)) {
        result.emails.push(email);
      }
    });

    // Also try email patterns in the visible text
    const bodyText = document.body.innerText;
    const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
    const emailMatches = bodyText.match(emailPattern);
    if (emailMatches) {
      emailMatches.forEach((email) => {
        if (!result.emails.includes(email)) {
          result.emails.push(email);
        }
      });
    }

    // Phones via the CSS selector taken from manual inspection.
    // NOTE(review): the jss#### class names look build-generated and may
    // change between site releases - confirm the selector still matches.
    const phoneElements = Array.from(
      document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2')
    );
    phoneElements.forEach((p) => {
      const text = p.textContent.trim();
      // Clean phone numbers (remove spaces and common formatting characters)
      const cleanPhone = text.replace(/[\s\-\(\)]/g, '');
      // Require at least 10 digits so ordinary text paragraphs that happen to
      // match the selector are not recorded as phone numbers.
      const digitCount = (cleanPhone.match(/\d/g) || []).length;
      if (digitCount >= 10 && !result.phones.includes(cleanPhone)) {
        result.phones.push(cleanPhone);
      }
    });

    // Owner names from lines shaped like "Owns <count> properties <Name>".
    // Group 1 is the property count; the name is group 2.
    // NOTE(review): the pattern captures only the first word after
    // "properties" - widen it once the real page format is confirmed.
    const ownerLines = bodyText.split('\n');
    for (const line of ownerLines) {
      const ownerMatch = line.match(/Owns\s+(\d+)\s+properties?\s*([A-Z][a-z]+)/i);
      if (ownerMatch) {
        const owner = ownerMatch[2].trim();
        if (owner && owner.length > 3 && !result.ownerNames.includes(owner)) {
          result.ownerNames.push(owner);
        }
      }
    }

    return result;
  });

  log(` šŸ“§ Emails: ${data.emails.length} found`);
  log(` šŸ“ž Phones: ${data.phones.length} found`);
  log(` šŸ‘¤ Owners: ${data.ownerNames.length} found`);

  return data;
}

/**
 * Main scraper: log in, open the saved search, visit each property's
 * ownership page and write the collected leads to OUTPUT_FILE.
 *
 * A failure on a single property is logged and skipped so the leads already
 * collected are still saved. Any other failure is logged, an error screenshot
 * is attempted, and the error is rethrown. The browser is always closed.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When login fails, the search ID cannot be read from the URL,
 *   no property links are found, or navigation to the search fails.
 */
async function scrapeLeads() {
  log('šŸš€ Starting Reonomy Scraper v12 (FRESH START)...\n');

  // Launch browser
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });

  const page = await browser.newPage();
  await page.setViewport({ width: 1920, height: 1080 });

  try {
    // Step 1: Login to Reonomy
    log('\nšŸ“ Step 1: Logging into Reonomy...');

    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    await sleep(2000);

    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');

    log('ā³ Waiting for login...');
    await sleep(15000);

    // Still being on a login/auth URL after the wait is treated as a failed login.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('āœ… Successfully logged in!');

    // Step 2: Navigate to search
    log('\nšŸ“ Step 2: Navigating to search...');

    await page.goto(`https://app.reonomy.com/#!/search/${SEARCH_ID}`, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    await sleep(3000);

    // Read the search ID back from the URL the app actually landed on.
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`āœ… Search ID: ${searchId}`);

    // Step 3: Extract property IDs
    log('\nšŸ“ Step 3: Extracting property IDs...');

    // The callback runs inside the browser, where Node-side variables do not
    // exist, so the search ID has to be passed in as an explicit argument.
    const propertyIds = await page.evaluate((currentSearchId) => {
      const seen = new Set();
      const ids = [];
      const links = document.querySelectorAll('a[href*="/property/"]');

      links.forEach((link) => {
        const match = link.href.match(/property\/([a-f0-9-]+)/);
        // Several links can point at the same property; keep the first only.
        if (match && !seen.has(match[1])) {
          seen.add(match[1]);
          ids.push({
            id: match[1],
            url: `https://app.reonomy.com/#!/search/${currentSearchId}/property/${match[1]}`
          });
        }
      });

      return ids;
    }, searchId);

    log(`āœ… Found ${propertyIds.length} property IDs`);

    if (propertyIds.length === 0) {
      throw new Error('No properties found on search page.');
    }

    // Step 4: Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\nšŸ“ Step 4: Processing ${propertiesToScrape.length} properties...\n`);

    const leads = [];

    for (let i = 0; i < propertiesToScrape.length; i++) {
      const prop = propertiesToScrape[i];

      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      try {
        // Navigate directly to the ownership page (no clicking property cards)
        const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;
        log(` šŸ”— Navigating to ownership page...`);

        await page.goto(ownershipUrl, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });

        // Wait for page to load
        log(` ā³ Waiting for Owner tab to load...`);
        await sleep(5000);

        // NOTE(review): both extractions read the ownership page; no separate
        // Builder and Lot URL is visited - confirm that page shows these fields.
        log(` šŸ“Š Extracting Builder and Lot data...`);
        const builderLotData = await extractBuilderLotData(page);

        // Wait a bit before extracting from Owner tab
        await sleep(1000);

        // Extract from Owner tab (emails + phones)
        log(` šŸ‘¤ Extracting Owner tab data...`);
        const ownerData = await extractOwnerTabData(page);

        const lead = {
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: ownershipUrl,
          ...builderLotData,
          ...ownerData
        };

        log(` šŸ“§ Emails: ${lead.emails.length} found`);
        log(` šŸ“ž Phones: ${lead.phones.length} found`);
        log(` šŸ‘¤ Owners: ${lead.ownerNames.length} found`);
        // Neither extractor sets `address`, so this currently logs N/A.
        log(` šŸ“ Address: ${lead.address || 'N/A'}`);
        log(` šŸ¢ Property Type: ${lead.propertyType || 'N/A'}`);
        log(` šŸ“ Square Footage: ${lead.squareFootage || 'N/A'}`);

        leads.push(lead);

        // Screenshot for debugging (first 3 properties only)
        if (i < 3) {
          const screenshotPath = `/tmp/reonomy-v12-property-${i + 1}.png`;
          await page.screenshot({ path: screenshotPath, fullPage: false });
          log(` šŸ“ø Screenshot saved: ${screenshotPath}`);
        }
      } catch (propertyError) {
        // One slow or broken property page must not discard the leads that
        // were already collected; record the failure and move on.
        log(` Skipping property ${prop.id}: ${propertyError.message}`);
      }
    }

    // Step 5: Save results
    if (leads.length > 0) {
      log(`\nāœ… Total leads scraped: ${leads.length}`);

      const outputData = {
        scrapeDate: new Date().toISOString(),
        searchId: searchId,
        leadCount: leads.length,
        leads: leads
      };

      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`šŸ’¾ Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\nāš ļø No leads scraped.');
    }

    log('\nāœ… Scraping complete!');

    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\nāŒ Error: ${error.message}`);
    log(error.stack);

    // Take screenshot of error state (best effort)
    try {
      await page.screenshot({ path: '/tmp/reonomy-v12-error.png', fullPage: true });
      log('šŸ“ø Error screenshot saved: /tmp/reonomy-v12-error.png');
    } catch (e) {
      log('Could not save error screenshot');
    }

    throw error;
  } finally {
    // Do not call process.exit() here: it would force exit code 0 even when
    // an error is propagating and would skip the handlers at the bottom of
    // this file, which choose the exit code.
    await browser.close();
    log('\nšŸ”š Browser closed');
  }
}

// Run
scrapeLeads()
  .then((result) => {
    log(`\nšŸŽ‰ Success! ${result.leadCount} leads scraped.`);
    console.log(`\nšŸ’¾ View your leads at: ${result.outputFile}`);
    process.exit(0);
  })
  .catch((error) => {
    log(`\nšŸ’„ Scraper failed: ${error.message}`);
    process.exit(1);
  });