#!/usr/bin/env node

/**
 * Reonomy Lead Scraper v2
 *
 * Improved scraper with better data extraction from dashboard
 * and search results.
 *
 * Flow: log into Reonomy with Puppeteer, read property and owner links off
 * the dashboard, then write one row per lead either to a Google Sheet (via
 * the `gog` CLI) or, when no sheet is available, to a local JSON file.
 *
 * NOTE(review): the emoji prefixes in the log strings below look mis-encoded
 * (mojibake). They are runtime output and were left untouched - confirm the
 * file's encoding before "fixing" them.
 */

const puppeteer = require('puppeteer');
const { execFileSync } = require('child_process');
const fs = require('fs');
const path = require('path');

// Configuration from environment variables
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
const SHEET_ID = process.env.REONOMY_SHEET_ID;
const SHEET_TITLE = process.env.REONOMY_SHEET_TITLE || 'Reonomy Leads';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'New York, NY';
// Headless only when explicitly requested; any other value shows the browser.
const HEADLESS = process.env.HEADLESS === 'true';

// Validate credentials
if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
  console.error('āŒ Error: REONOMY_EMAIL and REONOMY_PASSWORD environment variables are required.');
  console.error(' Set them like: REONOMY_EMAIL="..." REONOMY_PASSWORD="..." node reonomy-scraper-v2.js');
  process.exit(1);
}

// Log file
const LOG_FILE = path.join(__dirname, 'reonomy-scraper.log');

// Leads collected in memory when no Google Sheet is available.
const jsonLeads = [];

/**
 * Print a message to stdout and append it, timestamped, to LOG_FILE.
 *
 * @param {string} message - Text to log.
 */
function log(message) {
  const timestamp = new Date().toISOString();
  const logMessage = `[${timestamp}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, logMessage);
}

/**
 * Resolve after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Execute a gog CLI command.
 *
 * Arguments are handed to the process directly (no shell), so values that
 * contain quotes, `$`, backticks or spaces - e.g. scraped page text or the
 * sheet title - are passed through verbatim instead of being re-parsed.
 *
 * @param {string[]} args - Arguments for `gog`, one array entry per argument.
 * @returns {string} Trimmed stdout of the command.
 * @throws {TypeError} If `args` is not an array.
 * @throws {Error} If the command fails without usable stdout.
 */
function gogCommand(args) {
  if (!Array.isArray(args)) {
    throw new TypeError('gogCommand expects an array of arguments');
  }

  const account = process.env.GOG_ACCOUNT;
  const argv = account ? ['--account', account, ...args.map(String)] : args.map(String);

  try {
    const output = execFileSync('gog', argv, {
      encoding: 'utf-8',
      timeout: 30000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    return (output || '').trim();
  } catch (error) {
    if (error.status === 0) {
      throw error;
    }

    const stderr = error.stderr ? error.stderr.toString() : '';
    const stdout = error.stdout ? error.stdout.toString() : '';
    const stderrReportsError = stderr.includes('error') || stderr.includes('Error');

    // A non-zero exit that still produced stdout, with nothing error-like on
    // stderr, is treated as success.
    if (stdout.trim() && !stderrReportsError) {
      return stdout.trim();
    }
    if (stderrReportsError) {
      throw new Error(`gog command failed: ${stderr}`);
    }
    // error.message covers spawn failures (e.g. gog not installed, timeout)
    // where both streams are empty.
    throw new Error(`gog command failed: ${stderr || stdout || error.message || 'Unknown error'}`);
  }
}

/**
 * Get or create Google Sheet.
 *
 * Uses REONOMY_SHEET_ID when set; otherwise asks gog to create a sheet named
 * SHEET_TITLE and extracts its id from the output.
 *
 * @returns {Promise<string|null>} The sheet id, or null when a sheet could
 *   not be created (leads are then saved to a JSON file instead).
 */
async function getOrCreateSheet() {
  log('šŸ“Š Checking Google Sheets...');

  if (SHEET_ID) {
    log(`āœ… Using existing sheet: ${SHEET_ID}`);
    return SHEET_ID;
  }

  try {
    log('šŸ“ Creating new Google Sheet...');
    const output = gogCommand(['sheets', 'create', SHEET_TITLE, '--json']);

    let newSheetId;
    try {
      const result = JSON.parse(output);
      newSheetId = result.spreadsheetId || result.id;
    } catch (parseError) {
      // Output was not the expected JSON object; fall through to the
      // pattern-based fallback below.
    }

    if (!newSheetId) {
      // Fallback: take the first long id-looking token from the raw output.
      const match = output.match(/([0-9A-Za-z_-]{20,})/);
      if (match) {
        newSheetId = match[1];
      }
    }

    if (!newSheetId) {
      throw new Error('Could not parse sheet ID from gog output');
    }

    log(`āœ… Created new sheet: ${newSheetId}`);
    return newSheetId;
  } catch (error) {
    log(`āš ļø Could not create Google Sheet: ${error.message}`);
    log('šŸ’¾ Leads will be saved to JSON file instead');
    return null;
  }
}

/**
 * Initialize sheet with headers.
 *
 * Failures are logged and not rethrown.
 *
 * @param {string} sheetId - Target spreadsheet id.
 * @returns {Promise<void>}
 */
async function initializeSheet(sheetId) {
  log('šŸ“‹ Initializing sheet headers...');

  // Order must match the key order of the lead objects built by the
  // extract* functions, because rows are written with Object.values().
  const headers = [
    'Scrape Date',
    'Owner Name',
    'Property Address',
    'City',
    'State',
    'ZIP',
    'Property Type',
    'Square Footage',
    'Owner Location',
    'Property Count',
    'Property URL',
    'Owner URL',
    'Email',
    'Phone',
  ];

  try {
    gogCommand(['sheets', 'update', sheetId, 'Sheet1!A1', ...headers]);
    log('āœ… Sheet headers initialized');
  } catch (error) {
    log(`āš ļø Could not set headers: ${error.message}`);
  }
}

/**
 * Append row to Google Sheet or save to JSON file.
 *
 * With a sheet id, the row's values are appended to Sheet1!A:N; without one,
 * the row is pushed onto the in-memory jsonLeads list. Sheet errors are
 * logged and not rethrown.
 *
 * @param {string|null|undefined} sheetId - Target spreadsheet id, if any.
 * @param {Object} rowData - Lead record; values are written in key order.
 * @returns {Promise<void>}
 */
async function appendToSheet(sheetId, rowData) {
  if (!sheetId) {
    jsonLeads.push(rowData);
    log(`āœ… Collected: ${rowData.ownerName || 'N/A'} - ${rowData.propertyAddress}`);
    return;
  }

  // Every cell is its own argument, including empty ones, so a missing value
  // can never shift the remaining columns to the left.
  // NOTE(review): assumes gog takes each cell as a separate positional
  // argument, as the previous shell form implied - confirm against gog.
  const values = Object.values(rowData).map((v) => (v == null ? '' : String(v)));

  try {
    gogCommand(['sheets', 'append', sheetId, 'Sheet1!A:N', ...values]);
    log(`āœ… Added: ${rowData.ownerName || 'N/A'} - ${rowData.propertyAddress}`);
  } catch (error) {
    log(`āŒ Error appending to sheet: ${error.message}`);
  }
}

/**
 * Save leads to JSON file (reonomy-leads.json next to this script).
 *
 * @param {Object[]} leads - Lead records to persist.
 * @returns {string|null} Path of the written file, or null if writing failed.
 */
function saveToJsonFile(leads) {
  const filename = path.join(__dirname, 'reonomy-leads.json');
  const data = {
    scrapeDate: new Date().toISOString(),
    leadCount: leads.length,
    location: SEARCH_LOCATION,
    leads,
  };

  try {
    fs.writeFileSync(filename, JSON.stringify(data, null, 2));
    log(`šŸ’¾ Saved ${leads.length} leads to ${filename}`);
    return filename;
  } catch (error) {
    log(`āŒ Error saving to JSON: ${error.message}`);
    return null;
  }
}

/**
 * Extract property addresses and details from dashboard.
 *
 * Reads every link whose href contains "/property/" and keeps those whose
 * text starts with "<number...>, <city>, <ST> <zip>".
 *
 * @param {Object} page - Puppeteer page showing the dashboard.
 * @returns {Promise<Object[]>} Lead records with the property fields filled.
 */
async function extractPropertiesFromDashboard(page) {
  log('šŸ” Extracting property data from dashboard...');

  // Runs in the browser context; must not reference Node-side variables.
  const properties = await page.evaluate(() => {
    const results = [];

    // Find all property links
    const propertyLinks = Array.from(document.querySelectorAll('a[href*="/property/"]'));

    propertyLinks.forEach((link) => {
      const text = (link.innerText || link.textContent || '').trim();

      // Look for address patterns (starts with number, has comma)
      const addressMatch = text.match(/^(\d+.+),\s*([A-Za-z\s]+),\s*([A-Z]{2})\s*(\d{5})/);

      if (addressMatch) {
        results.push({
          fullText: text,
          address: addressMatch[1].trim(),
          city: addressMatch[2].trim(),
          state: addressMatch[3].trim(),
          zip: addressMatch[4].trim(),
          url: link.href,
          remainingText: text.substring(addressMatch[0].length).trim(),
        });
      }
    });

    return results;
  });

  const scrapeDate = new Date().toISOString().split('T')[0];

  const leads = properties.map((prop) => {
    // Extract property type and square footage from remaining text
    const sqFtMatch = prop.remainingText.match(/(\d+\.?\d*)\s*k?\s*SF/i);
    const sqFt = sqFtMatch ? sqFtMatch[0] : '';
    const propertyType = prop.remainingText.replace(sqFt, '').trim() || '';

    return {
      scrapeDate,
      ownerName: '',
      propertyAddress: prop.address,
      city: prop.city,
      state: prop.state,
      zip: prop.zip,
      propertyType,
      squareFootage: sqFt,
      ownerLocation: '',
      propertyCount: '',
      propertyUrl: prop.url,
      ownerUrl: '',
      email: '',
      phone: '',
    };
  });

  log(`āœ… Extracted ${leads.length} properties`);
  return leads;
}

/**
 * Extract owner data from dashboard.
 *
 * Reads every link whose href contains "/person/" and keeps those whose text
 * has at least two non-empty lines (name first).
 *
 * @param {Object} page - Puppeteer page showing the dashboard.
 * @returns {Promise<Object[]>} Lead records with the owner fields filled.
 */
async function extractOwnersFromDashboard(page) {
  log('šŸ” Extracting owner data from dashboard...');

  // Runs in the browser context; must not reference Node-side variables.
  const owners = await page.evaluate(() => {
    const results = [];

    const ownerLinks = Array.from(document.querySelectorAll('a[href*="/person/"]'));

    ownerLinks.forEach((link) => {
      const text = (link.innerText || link.textContent || '').trim();

      // Pattern: Owner name\nOwns X properties Location
      const lines = text.split('\n').map((l) => l.trim()).filter((l) => l);

      if (lines.length >= 2) {
        const ownerName = lines[0];
        const location = lines.find((l) => l.includes(',')) || '';
        const propertyCountMatch = text.match(/(\d+)\s*propert/i);
        const propertyCount = propertyCountMatch ? propertyCountMatch[1] : '';

        results.push({
          ownerName,
          location,
          propertyCount,
          url: link.href,
          fullText: text,
        });
      }
    });

    return results;
  });

  const scrapeDate = new Date().toISOString().split('T')[0];

  const leads = owners.map((owner) => {
    // Parse location more carefully - extract city and state
    // Format is: "Owns X properties City, State" or just "City, State"
    let city = '';
    let state = '';

    if (owner.location.includes(',')) {
      const parts = owner.location.split(',').map((p) => p.trim());
      const lastPart = parts[parts.length - 1];

      // If the last part is a state (2 uppercase letters), use it
      if (parts.length >= 2 && /^[A-Z]{2}$/.test(lastPart)) {
        state = lastPart;
        // The city is the second-to-last part, but we need to remove
        // the "Owns X properties" prefix: keep the trailing capitalized words.
        const cityWithPrefix = parts[parts.length - 2];
        const cityMatch = cityWithPrefix.match(/(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)$/);
        city = cityMatch ? cityMatch[1] : '';
      } else if (parts.length === 2) {
        city = parts[0];
        state = parts[1];
      }
    }

    return {
      scrapeDate,
      ownerName: owner.ownerName,
      propertyAddress: '',
      city,
      state,
      zip: '',
      propertyType: '',
      squareFootage: '',
      ownerLocation: owner.location,
      propertyCount: owner.propertyCount,
      propertyUrl: '',
      ownerUrl: owner.url,
      email: '',
      phone: '',
    };
  });

  log(`āœ… Extracted ${leads.length} owners`);
  return leads;
}

/**
 * Main scraper function.
 *
 * Launches the browser, prepares the output sheet, logs in, extracts property
 * and owner leads from the dashboard and stores them. The browser is always
 * closed, including when page setup or scraping fails.
 *
 * @returns {Promise<{sheetId: (string|null), leadCount: number}>}
 * @throws {Error} Rethrows any failure after logging it and attempting an
 *   error screenshot.
 */
async function scrapeLeads() {
  log('šŸš€ Starting Reonomy Lead Scraper v2...\n');

  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080'],
  });

  let page;
  let sheetId;

  try {
    // Page setup lives inside the try so the finally block closes the
    // browser even if it fails.
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Setup Google Sheet
    sheetId = await getOrCreateSheet();

    if (sheetId) {
      try {
        const existingData = gogCommand(['sheets', 'get', sheetId, 'Sheet1!A1:N1', '--plain']);
        if (!existingData.includes('Owner Name')) {
          await initializeSheet(sheetId);
        }
      } catch (error) {
        // Header row could not be read; try writing it instead.
        await initializeSheet(sheetId);
      }
    } else {
      log('šŸ’¾ Will save leads to: reonomy-leads.json');
    }

    // Login to Reonomy
    log('\nšŸ“ Step 1: Logging into Reonomy...');

    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });

    await sleep(2000);

    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');

    log('ā³ Logging in...');
    await sleep(8000);

    // Still being on a login/auth URL after the wait is treated as failure.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('āœ… Successfully logged in!');

    // Navigate to home/dashboard to extract recent data
    log('\nšŸ“ Step 2: Navigating to dashboard...');

    await page.goto('https://app.reonomy.com/#!/home', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await sleep(3000);
    log('āœ… On dashboard');

    // Extract leads
    log('\nšŸ“ Step 3: Extracting lead data...');

    const properties = await extractPropertiesFromDashboard(page);
    const owners = await extractOwnersFromDashboard(page);
    const allLeads = [...properties, ...owners];

    log(`\nāœ… Total leads extracted: ${allLeads.length}`);

    let savedJsonFile = null;

    if (allLeads.length === 0) {
      log('\nāš ļø No leads found. Taking screenshot for debugging...');
      await page.screenshot({ path: '/tmp/reonomy-no-leads.png', fullPage: true });
      log('šŸ“ø Screenshot saved: /tmp/reonomy-no-leads.png');
    } else {
      // Save leads
      log('\nšŸ“ Step 4: Saving leads...');

      for (const lead of allLeads) {
        await appendToSheet(sheetId, lead);
        await sleep(500);
      }

      if (!sheetId && jsonLeads.length > 0) {
        savedJsonFile = saveToJsonFile(jsonLeads);
      }
    }

    log('\nāœ… Scraping complete!');
    if (sheetId) {
      log(`šŸ“Š Google Sheet: https://docs.google.com/spreadsheets/d/${sheetId}`);
    } else if (savedJsonFile) {
      // Only claim the file exists when it was actually written.
      log('šŸ’¾ Leads saved to: reonomy-leads.json');
    }
    log(`šŸ“ Log file: ${LOG_FILE}`);

    return { sheetId, leadCount: allLeads.length };
  } catch (error) {
    log(`\nāŒ Error: ${error.message}`);
    log(error.stack);

    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-error.png', fullPage: true });
        log('šŸ“ø Error screenshot saved: /tmp/reonomy-error.png');
      } catch (screenshotError) {
        // Best effort only: the original error below is what matters.
      }
    }

    throw error;
  } finally {
    await browser.close();
    log('\nšŸ”š Browser closed');
  }
}

// Run scraper
scrapeLeads()
  .then((result) => {
    log(`\nšŸŽ‰ Success! ${result.leadCount} leads scraped.`);
    if (result.sheetId) {
      console.log(`\nšŸ“Š View your leads at: https://docs.google.com/spreadsheets/d/${result.sheetId}`);
    }
    process.exit(0);
  })
  .catch((error) => {
    log(`\nšŸ’„ Scraper failed: ${error.message}`);
    process.exit(1);
  });