1110 lines
31 KiB
JavaScript
1110 lines
31 KiB
JavaScript
#!/usr/bin/env node
|
||
|
||
/**
 * Reonomy Lead Scraper
 *
 * Scrapes property and owner leads from Reonomy and exports to Google Sheets.
 *
 * Usage:
 *   node reonomy-scraper.js [options]
 *
 * Environment Variables:
 *   REONOMY_EMAIL       - Reonomy login email
 *   REONOMY_PASSWORD    - Reonomy login password
 *   REONOMY_SHEET_ID    - Google Sheet ID (optional, will create new sheet if not provided)
 *   REONOMY_SHEET_TITLE - Title used when a new sheet is created (optional, defaults to "Reonomy Leads")
 *   REONOMY_LOCATION    - Search location (e.g., "New York, NY", which is also the default)
 *   GOG_ACCOUNT         - Account passed to the gog CLI via --account (optional)
 *   HEADLESS            - Set to "true" for headless mode
 */
|
||
|
||
const puppeteer = require('puppeteer');
|
||
const { execSync } = require('child_process');
|
||
const fs = require('fs');
|
||
const path = require('path');
|
||
|
||
// Configuration from environment variables
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
const SHEET_ID = process.env.REONOMY_SHEET_ID;
// `||` (not `??`) is deliberate: an empty-string variable should also fall back to the default.
const SHEET_TITLE = process.env.REONOMY_SHEET_TITLE || 'Reonomy Leads';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'New York, NY';
const HEADLESS = process.env.HEADLESS === 'true';
const MAX_PROPERTIES = 20; // Cap for property pages; not read anywhere in this file (property pages are skipped: no contact info there)
const MAX_OWNERS = 2; // Limit number of owners to scrape to avoid rate limiting
const PAGE_DELAY_MS = 3000; // Delay between page visits for rate limiting
|
||
|
||
// Validate credentials: both are needed for the login step, so fail fast
// with exit code 1 before any browser is launched.
if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
  console.error('❌ Error: REONOMY_EMAIL and REONOMY_PASSWORD environment variables are required.');
  console.error(' Set them like: REONOMY_EMAIL="..." REONOMY_PASSWORD="..." node reonomy-scraper.js');
  process.exit(1);
}

// Log file, written next to this script
const LOG_FILE = path.join(__dirname, 'reonomy-scraper.log');
|
||
|
||
/**
 * Print a message to stdout and append it, prefixed with an ISO timestamp,
 * to LOG_FILE.
 *
 * Writing the log file is best-effort: this function is called from error
 * handlers and `finally` blocks, so a failed write is reported on stderr
 * instead of being thrown.
 *
 * @param {string} message - Text to log.
 */
function log(message) {
  const timestamp = new Date().toISOString();
  console.log(message);

  try {
    fs.appendFileSync(LOG_FILE, `[${timestamp}] ${message}\n`);
  } catch (error) {
    console.error(`⚠️ Could not write to log file: ${error.message}`);
  }
}
|
||
|
||
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||
|
||
/**
 * Execute a gog CLI command synchronously through the shell.
 *
 * When the GOG_ACCOUNT environment variable is set, `--account <value>` is
 * inserted before the command. The `command` string itself is handed to the
 * shell unchanged, so callers are responsible for quoting their arguments.
 *
 * @param {string} command - Everything that follows `gog` on the command line.
 * @returns {string} Trimmed stdout. If the command fails but still printed
 *   stdout and its stderr does not mention "error", that stdout is returned.
 * @throws {Error} `gog command failed: ...` when the command fails without
 *   usable stdout (including hitting the 30 s timeout).
 */
function gogCommand(command) {
  const account = process.env.GOG_ACCOUNT;
  // Single-quote the account so shell metacharacters inside it stay inert.
  const accountFlag = account ? `--account '${account.replace(/'/g, `'\\''`)}' ` : '';
  const fullCommand = `gog ${accountFlag}${command}`;

  try {
    // stderr is piped too, but only surfaces on the error object; the
    // return value is stdout alone.
    const output = execSync(fullCommand, {
      encoding: 'utf-8',
      timeout: 30000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    return (output || '').trim();
  } catch (error) {
    const stderr = error.stderr ? error.stderr.toString() : '';
    const stdout = error.stdout ? error.stdout.toString().trim() : '';
    const stderrReportsError = /error/i.test(stderr);

    // If we got useful output in stdout despite the failure, return it.
    if (stdout && !stderrReportsError) {
      return stdout;
    }

    const detail = stderr.trim() || stdout || error.message || 'Unknown error';
    throw new Error(`gog command failed: ${detail}`, { cause: error });
  }
}
|
||
|
||
/**
 * Get or create the Google Sheet that leads are written to.
 *
 * Returns SHEET_ID when it is configured. Otherwise runs
 * `gog sheets create <title> --json` and extracts the new sheet's ID from the
 * output. Any failure is logged and reported as `null`, which callers treat
 * as "save to JSON instead".
 *
 * @returns {Promise<string|null>} Sheet ID, or null when no sheet is available.
 */
async function getOrCreateSheet() {
  log('📊 Checking Google Sheets...');

  if (SHEET_ID) {
    log(`✅ Using existing sheet: ${SHEET_ID}`);
    return SHEET_ID;
  }

  try {
    log('📝 Creating new Google Sheet...');
    // Single-quote the title so the shell passes it through verbatim.
    const quotedTitle = `'${String(SHEET_TITLE).replace(/'/g, `'\\''`)}'`;
    const output = gogCommand(`sheets create ${quotedTitle} --json`);

    const newSheetId = parseSheetIdFromOutput(output);
    if (!newSheetId) {
      throw new Error('Could not parse sheet ID from gog output');
    }

    log(`✅ Created new sheet: ${newSheetId}`);
    return newSheetId;
  } catch (error) {
    log(`⚠️ Could not create Google Sheet: ${error.message}`);
    log('💾 Leads will be saved to JSON file instead');
    return null;
  }
}

/**
 * Pull a sheet ID out of gog output: `spreadsheetId`/`id` when the output is
 * JSON, otherwise the first run of 20+ ID-like characters in the text.
 * Returns null when nothing usable is found.
 */
function parseSheetIdFromOutput(output) {
  let parsed;
  try {
    parsed = JSON.parse(output);
  } catch (error) {
    // Not JSON: try to extract an ID from the text output instead.
    const match = String(output).match(/([0-9A-Za-z_-]{20,})/);
    return match ? match[1] : null;
  }

  // Valid JSON without a known ID field is a failure, not "sheet undefined".
  return (parsed && (parsed.spreadsheetId || parsed.id)) || null;
}
|
||
|
||
/**
 * Initialize a sheet by writing the header row at Sheet1!A1.
 *
 * The 14 headers are in the same order as the fields of the lead objects
 * built elsewhere in this file. A failure is logged, not thrown.
 *
 * @param {string} sheetId - ID of the sheet to update.
 * @returns {Promise<void>}
 */
async function initializeSheet(sheetId) {
  log('📋 Initializing sheet headers...');

  const headers = [
    'Scrape Date',
    'Owner Name',
    'Property Address',
    'City',
    'State',
    'ZIP',
    'Property Type',
    'Square Footage',
    'Owner Location',
    'Property Count',
    'Property URL',
    'Owner URL',
    'Email',
    'Phone',
  ];

  // Each header becomes one double-quoted shell argument.
  const headerString = headers.map((header) => `"${header}"`).join(' ');

  try {
    gogCommand(`sheets update ${sheetId} "Sheet1!A1" ${headerString}`);
    log('✅ Sheet headers initialized');
  } catch (error) {
    log(`⚠️ Could not set headers: ${error.message}`);
  }
}
|
||
|
||
/**
 * Append one lead to the Google Sheet, or collect it for the JSON file.
 *
 * With a sheet ID, the values of `rowData` (in key insertion order) are
 * passed to `gog sheets append` as one shell argument per cell. Without a
 * sheet ID the row is pushed onto the global `jsonLeads` array.
 * Append failures are logged, not thrown.
 *
 * @param {string|null|undefined} sheetId - Target sheet, or falsy to collect for JSON.
 * @param {object} rowData - Lead record; its values become the row's cells.
 * @returns {Promise<void>}
 */
async function appendToSheet(sheetId, rowData) {
  if (!sheetId) {
    // Save to JSON file later
    jsonLeads.push(rowData);
    log(`✅ Collected: ${rowData.ownerName} - ${rowData.propertyAddress}`);
    return;
  }

  const values = Object.values(rowData).map(quoteCellForShell).join(' ');

  try {
    gogCommand(`sheets append ${sheetId} "Sheet1!A:N" ${values}`);
    log(`✅ Added: ${rowData.ownerName} - ${rowData.propertyAddress}`);
  } catch (error) {
    log(`❌ Error appending to sheet: ${error.message}`);
  }
}

/**
 * Turn a cell value into a single POSIX-shell argument.
 *
 * Cell text is scraped from web pages, so it is wrapped in single quotes
 * (with embedded single quotes escaped) to keep `$()`, backticks and quotes
 * from being interpreted by the shell. null/undefined become an explicit
 * empty argument so later cells stay in their columns.
 */
function quoteCellForShell(value) {
  const text = value === null || value === undefined ? '' : String(value);
  return `'${text.replace(/'/g, `'\\''`)}'`;
}
|
||
|
||
/**
 * Save leads to `reonomy-leads.json` next to this script.
 *
 * The file wraps the leads with the scrape timestamp, the lead count and the
 * configured search location.
 *
 * @param {object[]} leads - Lead records to persist.
 * @returns {string|null} Path of the written file, or null if writing failed
 *   (the error is logged).
 */
function saveToJsonFile(leads) {
  const filename = path.join(__dirname, 'reonomy-leads.json');
  const data = {
    scrapeDate: new Date().toISOString(),
    leadCount: leads.length,
    location: SEARCH_LOCATION,
    leads,
  };

  try {
    fs.writeFileSync(filename, JSON.stringify(data, null, 2));
  } catch (error) {
    log(`❌ Error saving to JSON: ${error.message}`);
    return null;
  }

  log(`💾 Saved ${leads.length} leads to ${filename}`);
  return filename;
}
|
||
|
||
// Global array to store leads when not using Google Sheets.
// appendToSheet pushes onto it; scrapeLeads hands it to saveToJsonFile.
let jsonLeads = [];
|
||
|
||
/**
 * Extract contact info from a property detail page.
 *
 * Navigates `page` to `propertyUrl`, then reads email, phone, owner name,
 * address, property type and square footage from the DOM using lists of
 * candidate selectors. Email and phone fall back to a scan of the page text
 * (labelled "Email:" / "Phone:" / "Tel:" values first, then any match).
 * `city`, `state` and `zip` are part of the result shape but are never
 * filled in here.
 *
 * @param {object} page - Puppeteer page; `goto` and `evaluate` are used.
 * @param {string} propertyUrl - URL of the property detail page.
 * @returns {Promise<object>} Record with string fields; every field is ''
 *   when nothing was found or when navigation/extraction failed (the error
 *   is logged, not thrown).
 */
async function extractPropertyContactInfo(page, propertyUrl) {
  log(` 🏠 Visiting property: ${propertyUrl}`);

  try {
    await page.goto(propertyUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await sleep(2000); // Wait for dynamic content to load

    // This callback runs inside the browser page, so it must be
    // self-contained: nothing from the enclosing Node scope is available.
    const contactInfo = await page.evaluate(() => {
      const EMAIL_RE = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;
      const LABELED_EMAIL_RE = /Email[:\s]*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i;
      const PHONE_RE = /(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/;
      const LABELED_PHONE_RE = /(?:Phone|Tel)[:\s]*((?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})/i;

      const bodyText = (document.body && document.body.innerText) || '';
      const textOf = (el) => (el.innerText || el.textContent || '').trim();

      // Text of the first selector whose element passes `accept`.
      const firstText = (selectors, accept) => {
        for (const selector of selectors) {
          const el = document.querySelector(selector);
          if (!el) continue;
          const text = textOf(el);
          if (accept(text)) return text;
        }
        return '';
      };

      // First `pattern` match found in an element's text or, failing that,
      // in its href (covers mailto:/tel: links whose text is just a label).
      const firstMatch = (selectors, pattern) => {
        for (const selector of selectors) {
          const el = document.querySelector(selector);
          if (!el) continue;
          const href = (el.getAttribute && el.getAttribute('href')) || '';
          for (const candidate of [textOf(el), href]) {
            const found = candidate.match(pattern);
            if (found) return found[0].trim();
          }
        }
        return '';
      };

      // Page-text fallback: prefer a labelled value over any bare match.
      const fromBodyText = (labeledPattern, barePattern) => {
        const labeled = bodyText.match(labeledPattern);
        if (labeled) return labeled[1].trim();
        const bare = bodyText.match(barePattern);
        return bare ? bare[0].trim() : '';
      };

      // Specific IDs first, generic selectors last
      const emailSelectors = [
        '#people-contact-email-id',
        '[data-person-id="people-contact-email-id"]',
        'a[href^="mailto:"]',
        '[data-test*="email"]',
        '[data-testid*="email"]',
        '.email-address',
        '.owner-email',
        '.contact-info [data-test*="email"]',
      ];
      const phoneSelectors = [
        '#people-contact-phone-1',
        '#people-contact-phone-2',
        '#people-contact-phone-3',
        '[data-person-id="people-contact-phone-1"]',
        '[data-person-id="people-contact-phone-2"]',
        '[data-person-id="people-contact-phone-3"]',
        'a[href^="tel:"]',
        '[data-test*="phone"]',
        '[data-testid*="phone"]',
        '.phone-number',
        '.contact-info [data-test*="phone"]',
        '.owner-phone',
      ];
      const ownerSelectors = [
        '[data-test*="owner"]',
        '[data-testid*="owner"]',
        '.owner-name',
        '.owner',
        'h1',
        'h2',
      ];
      const addressSelectors = [
        '[data-test*="address"]',
        '[data-testid*="address"]',
        '.property-address',
        '.address',
        'h1',
        'h2',
      ];
      const typeSelectors = [
        '[data-test*="type"]',
        '[data-testid*="type"]',
        '.property-type',
        '.type',
      ];
      const sfSelectors = [
        '[data-test*="sf"]',
        '[data-testid*="sf"]',
        '.square-footage',
        '.sf',
        '.sqft',
      ];

      const isNonEmpty = (text) => text.length > 0;

      return {
        email: firstMatch(emailSelectors, EMAIL_RE) || fromBodyText(LABELED_EMAIL_RE, EMAIL_RE),
        phone: firstMatch(phoneSelectors, PHONE_RE) || fromBodyText(LABELED_PHONE_RE, PHONE_RE),
        // Names are expected to be short; very long text is likely a container.
        ownerName: firstText(ownerSelectors, (text) => text.length > 2 && text.length < 100),
        // An address is only accepted if it contains a digit.
        propertyAddress: firstText(addressSelectors, (text) => /\d+/.test(text)),
        city: '',
        state: '',
        zip: '',
        propertyType: firstText(typeSelectors, isNonEmpty),
        squareFootage: firstText(sfSelectors, isNonEmpty),
      };
    });

    log(` 📧 Email: ${contactInfo.email || 'Not found'}`);
    log(` 📞 Phone: ${contactInfo.phone || 'Not found'}`);

    return contactInfo;
  } catch (error) {
    log(` ⚠️ Error extracting from property page: ${error.message}`);
    return {
      email: '',
      phone: '',
      ownerName: '',
      propertyAddress: '',
      city: '',
      state: '',
      zip: '',
      propertyType: '',
      squareFootage: '',
    };
  }
}
|
||
|
||
/**
 * Extract contact info from an owner detail page.
 *
 * Navigates `page` to `ownerUrl`, saves a debug screenshot and HTML dump
 * under /tmp (best-effort), then reads email, phone, owner name, location
 * and property count from the DOM using lists of candidate selectors.
 * Email and phone fall back to a scan of the page text (labelled values
 * first, then any match); property count falls back to "<n> properties"
 * in the page text.
 *
 * @param {object} page - Puppeteer page; `goto`, `screenshot`, `content` and
 *   `evaluate` are used.
 * @param {string} ownerUrl - URL of the owner detail page.
 * @returns {Promise<object>} Record with string fields `email`, `phone`,
 *   `ownerName`, `ownerLocation`, `propertyCount`; all '' when nothing was
 *   found or when navigation/extraction failed (the error is logged).
 */
async function extractOwnerContactInfo(page, ownerUrl) {
  log(` 👤 Visiting owner: ${ownerUrl}`);

  try {
    await page.goto(ownerUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await sleep(2000); // Wait for dynamic content to load

    // DEBUG artifacts are best-effort: failing to write them must not cost
    // us the contact data for this owner.
    try {
      const ownerMatch = ownerUrl.match(/person\/([a-zA-Z0-9_-]+)/);
      const ownerId = ownerMatch ? ownerMatch[1] : 'unknown';

      const debugPath = `/tmp/reonomy-owner-${ownerId}.png`;
      await page.screenshot({ path: debugPath, fullPage: true });
      log(` 📸 Debug screenshot saved: ${debugPath}`);

      const htmlPath = `/tmp/reonomy-owner-${ownerId}.html`;
      const htmlContent = await page.content();
      fs.writeFileSync(htmlPath, htmlContent);
      log(` 📄 Debug HTML saved: ${htmlPath}`);
    } catch (debugError) {
      log(` ⚠️ Could not save debug artifacts: ${debugError.message}`);
    }

    // This callback runs inside the browser page, so it must be
    // self-contained: nothing from the enclosing Node scope is available.
    const contactInfo = await page.evaluate(() => {
      const EMAIL_RE = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;
      const LABELED_EMAIL_RE = /Email[:\s]*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i;
      const PHONE_RE = /(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/;
      const LABELED_PHONE_RE = /(?:Phone|Tel)[:\s]*((?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})/i;

      const bodyText = (document.body && document.body.innerText) || '';
      const textOf = (el) => (el.innerText || el.textContent || '').trim();

      // Text of the first selector whose element passes `accept`.
      const firstText = (selectors, accept) => {
        for (const selector of selectors) {
          const el = document.querySelector(selector);
          if (!el) continue;
          const text = textOf(el);
          if (accept(text)) return text;
        }
        return '';
      };

      // First `pattern` match found in an element's text or, failing that,
      // in its href (covers mailto:/tel: links whose text is just a label).
      const firstMatch = (selectors, pattern) => {
        for (const selector of selectors) {
          const el = document.querySelector(selector);
          if (!el) continue;
          const href = (el.getAttribute && el.getAttribute('href')) || '';
          for (const candidate of [textOf(el), href]) {
            const found = candidate.match(pattern);
            if (found) return found[0].trim();
          }
        }
        return '';
      };

      // Page-text fallback: prefer a labelled value over any bare match.
      const fromBodyText = (labeledPattern, barePattern) => {
        const labeled = bodyText.match(labeledPattern);
        if (labeled) return labeled[1].trim();
        const bare = bodyText.match(barePattern);
        return bare ? bare[0].trim() : '';
      };

      // Specific IDs first, generic selectors last
      const emailSelectors = [
        '#people-contact-email-id',
        '[data-person-id="people-contact-email-id"]',
        'a[href^="mailto:"]',
        '[data-test*="email"]',
        '[data-testid*="email"]',
        '.email-address',
        '.owner-email',
        '.contact-info [data-test*="email"]',
      ];
      const phoneSelectors = [
        '#people-contact-phone-1',
        '#people-contact-phone-2',
        '#people-contact-phone-3',
        '[data-person-id="people-contact-phone-1"]',
        '[data-person-id="people-contact-phone-2"]',
        '[data-person-id="people-contact-phone-3"]',
        'a[href^="tel:"]',
        '[data-test*="phone"]',
        '[data-testid*="phone"]',
        '.phone-number',
        '.contact-info [data-test*="phone"]',
        '.owner-phone',
      ];
      const nameSelectors = [
        '[data-test*="name"]',
        '[data-testid*="name"]',
        '.owner-name',
        '.person-name',
        'h1',
        'h2',
      ];
      const locationSelectors = [
        '[data-test*="location"]',
        '[data-testid*="location"]',
        '.location',
        '.owner-location',
        '.city-state',
      ];
      const countSelectors = [
        '[data-test*="property-count"]',
        '[data-testid*="property-count"]',
        '.property-count',
        '.properties-owned',
        '.total-properties',
      ];

      let propertyCount = firstText(countSelectors, (text) => /\d+/.test(text));
      if (!propertyCount) {
        // Also try to extract property count from text
        const countMatch = bodyText.match(/(\d+)\s*propert(?:y|ies)/i);
        if (countMatch) propertyCount = countMatch[1];
      }

      return {
        email: firstMatch(emailSelectors, EMAIL_RE) || fromBodyText(LABELED_EMAIL_RE, EMAIL_RE),
        phone: firstMatch(phoneSelectors, PHONE_RE) || fromBodyText(LABELED_PHONE_RE, PHONE_RE),
        // Names are expected to be short; very long text is likely a container.
        ownerName: firstText(nameSelectors, (text) => text.length > 2 && text.length < 100),
        // A location is only accepted in "City, State"-like form (has a comma).
        ownerLocation: firstText(locationSelectors, (text) => text.includes(',')),
        propertyCount,
      };
    });

    log(` 📧 Email: ${contactInfo.email || 'Not found'}`);
    log(` 📞 Phone: ${contactInfo.phone || 'Not found'}`);

    return contactInfo;
  } catch (error) {
    log(` ⚠️ Error extracting from owner page: ${error.message}`);
    return {
      email: '',
      phone: '',
      ownerName: '',
      ownerLocation: '',
      propertyCount: '',
    };
  }
}
|
||
|
||
/**
 * Main scraper function.
 *
 * Launches a browser, prepares the output sheet (or JSON fallback), logs in
 * to Reonomy, searches for SEARCH_LOCATION, collects owner links from the
 * results page, visits up to MAX_OWNERS owner pages for contact info and
 * saves the resulting leads. The browser is always closed, also on failure.
 *
 * @returns {Promise<{sheetId: (string|null|undefined), leadCount: number, jsonFile: (string|null)}>}
 *   Summary of the run; `jsonFile` is 'reonomy-leads.json' whenever no sheet is used.
 * @throws Rethrows any error from setup, login or navigation after logging it
 *   and attempting an error screenshot.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Lead Scraper...\n');

  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080'],
  });

  let page;
  let sheetId;

  try {
    // Created inside the try so the browser is closed even if this fails.
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Setup Google Sheet
    sheetId = await getOrCreateSheet();

    if (sheetId) {
      // Check if sheet has headers by trying to get them
      try {
        const existingData = gogCommand(`sheets get ${sheetId} "Sheet1!A1:N1" --plain`);
        if (!existingData.includes('Owner Name')) {
          await initializeSheet(sheetId);
        }
      } catch (error) {
        // Sheet might be empty, initialize it
        await initializeSheet(sheetId);
      }
    } else {
      // No sheet available, prepare to save to file
      log('💾 Will save leads to: reonomy-leads.json');
    }

    // Step 2: Login to Reonomy
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });

    await sleep(2000);

    // Fill credentials
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });

    // Submit login
    await page.click('button[type="submit"]');
    log('⏳ Logging in...');

    await sleep(8000);

    // Still being on a login/auth URL after the wait is treated as a failed login
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Step 3: Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await sleep(3000);
    log('✅ On search page');

    // Step 4: Enter search query
    log(`\n📍 Step 3: Searching for: ${SEARCH_LOCATION}`);
    const searchInput = await page
      .waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
        timeout: 10000,
      })
      // Try alternative selector; resolve to null (instead of rejecting) if
      // that fails too, so the warning branch below is actually reachable.
      .catch(() => page.waitForSelector('input[type="text"]', { timeout: 5000 }).catch(() => null));

    if (searchInput) {
      await searchInput.click({ clickCount: 3 }); // Select all
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);

      // Press Enter to search
      await page.keyboard.press('Enter');
      log('⏳ Searching...');

      // Wait for results to load
      await sleep(5000);
    } else {
      log('⚠️ Could not find search input, trying alternative method...');
    }

    // Step 5: Extract leads from the page
    log('\n📍 Step 4: Finding owner links (contact info is on owner pages)...');

    // Property links are returned too but are not used: contact info lives on owner pages.
    const { ownerLinks } = await extractLinksFromPage(page);

    log(`👤 Found ${ownerLinks.length} owner links`);

    const leads = [];
    const scrapeDate = new Date().toISOString().split('T')[0];

    // Skip property pages - no contact info there
    log('\n📍 Step 5: Skipping property pages (no contact info)...');

    // Step 6: Visit owner pages to extract contact info
    log('\n📍 Step 6: Extracting contact info from owner pages...');
    const ownersToScrape = ownerLinks.slice(0, MAX_OWNERS);

    for (let i = 0; i < ownersToScrape.length; i++) {
      log(`\n[${i + 1}/${ownersToScrape.length}]`);

      const ownerUrl = ownersToScrape[i];
      const contactInfo = await extractOwnerContactInfo(page, ownerUrl);

      // Parse owner ID from URL; used as the name when the page yields none
      const ownerMatch = ownerUrl.match(/person\/([^/]+)/);
      const ownerId = ownerMatch ? ownerMatch[1] : '';

      leads.push({
        scrapeDate,
        ownerName: contactInfo.ownerName || ownerId,
        propertyAddress: '',
        city: '',
        state: '',
        zip: '',
        propertyType: '',
        squareFootage: '',
        ownerLocation: contactInfo.ownerLocation || '',
        propertyCount: contactInfo.propertyCount || '',
        propertyUrl: '',
        ownerUrl,
        email: contactInfo.email || '',
        phone: contactInfo.phone || '',
      });

      // Rate limiting between page visits
      if (i < ownersToScrape.length - 1) {
        await sleep(PAGE_DELAY_MS);
      }
    }

    log(`\n✅ Found ${leads.length} total leads`);

    if (leads.length === 0) {
      log('\n⚠️ No leads extracted. The page structure may have changed.');
      log(' Please check the screenshot and logs for details.');

      // Save screenshot for debugging
      await page.screenshot({ path: '/tmp/reonomy-no-leads.png', fullPage: true });
      log('📸 Screenshot saved: /tmp/reonomy-no-leads.png');
    } else {
      // Step 7: Save leads
      log('\n📍 Step 7: Saving leads...');

      for (const lead of leads) {
        await appendToSheet(sheetId, lead);
        await sleep(500); // Rate limiting
      }

      // If no sheet, save to JSON
      if (!sheetId && jsonLeads.length > 0) {
        saveToJsonFile(jsonLeads);
      }
    }

    log('\n✅ Scraping complete!');
    if (sheetId) {
      log(`📊 Google Sheet: https://docs.google.com/spreadsheets/d/${sheetId}`);
    } else if (leads.length > 0) {
      // Only claim a saved file when there was something to save.
      log('💾 Leads saved to: reonomy-leads.json');
    }
    log(`📝 Log file: ${LOG_FILE}`);

    return { sheetId, leadCount: leads.length, jsonFile: sheetId ? null : 'reonomy-leads.json' };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Save error screenshot (best-effort; there may be no page yet)
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-error.png');
      } catch (e) {
        // Ignore screenshot errors
      }
    }

    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
|
||
|
||
/**
 * Extract property and owner links from the current page.
 *
 * Scans every anchor on the page. Hrefs containing `/property/` are
 * normalized to `https://app.reonomy.com/#!/property/<id>`; hrefs containing
 * `/person/` or `/owner/` are normalized to
 * `https://app.reonomy.com/#!/person/<id>`. Duplicates are removed.
 *
 * @param {object} page - Puppeteer page; only `evaluate` is used.
 * @returns {Promise<{propertyLinks: string[], ownerLinks: string[]}>}
 *   Both lists are empty if extraction fails (the error is logged).
 */
async function extractLinksFromPage(page) {
  try {
    // Runs inside the browser page; must not reference the Node scope.
    const links = await page.evaluate(() => {
      const propertyUrls = new Set();
      const ownerUrls = new Set();

      for (const anchor of Array.from(document.querySelectorAll('a'))) {
        const href = anchor.href || '';

        if (href.includes('/property/')) {
          // Extract the property ID and reconstruct the full URL
          const propertyMatch = href.match(/property\/([a-zA-Z0-9_-]+)/);
          if (propertyMatch) {
            propertyUrls.add(`https://app.reonomy.com/#!/property/${propertyMatch[1]}`);
          }
        }

        if (href.includes('/person/') || href.includes('/owner/')) {
          // Extract the person ID and reconstruct the full URL
          const ownerMatch = href.match(/(?:person|owner)\/([a-zA-Z0-9_-]+)/);
          if (ownerMatch) {
            ownerUrls.add(`https://app.reonomy.com/#!/person/${ownerMatch[1]}`);
          }
        }
      }

      // Sets keep first-seen order, so this is an order-preserving dedupe.
      return { propertyUrls: [...propertyUrls], ownerUrls: [...ownerUrls] };
    });

    return { propertyLinks: links.propertyUrls, ownerLinks: links.ownerUrls };
  } catch (error) {
    log(`⚠️ Error extracting links: ${error.message}`);
    return { propertyLinks: [], ownerLinks: [] };
  }
}
|
||
|
||
/**
 * Extract leads from search results page (legacy, kept for compatibility).
 *
 * Collects the text of property cards using the first candidate selector
 * that matches anything; if none match, falls back to address-looking
 * strings in the page text. At most 50 texts are parsed with
 * parsePropertyData, and only leads with a property address are kept.
 *
 * @param {object} page - Puppeteer page; only `evaluate` is used.
 * @returns {Promise<object[]>} Leads found; on error, whatever was collected
 *   before the failure (the error is logged).
 */
async function extractLeadsFromPage(page) {
  const leads = [];

  try {
    // Runs inside the browser page; must not reference the Node scope.
    const properties = await page.evaluate(() => {
      const results = [];

      // Look for property cards - various possible selectors
      const selectors = [
        '[data-test*="property"]',
        '[data-testid*="property"]',
        '.property-card',
        '.listing-card',
        '.search-result',
        '.result-item',
      ];

      for (const selector of selectors) {
        const elements = document.querySelectorAll(selector);
        if (elements.length > 0) {
          elements.forEach((el) => {
            // Always push a string so one odd element cannot break parsing later.
            results.push(el.innerText || el.textContent || '');
          });
          break;
        }
      }

      // If no structured cards, try to extract from the whole page
      if (results.length === 0) {
        const bodyText = (document.body && document.body.innerText) || '';

        // Look for patterns that might be addresses
        const addressPattern = /\d+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s*\d{5}/g;
        results.push(...(bodyText.match(addressPattern) || []));
      }

      return results.slice(0, 50); // Limit results
    });

    // Parse extracted data into lead objects
    const scrapeDate = new Date().toISOString().split('T')[0];

    for (const propertyText of properties) {
      const lead = parsePropertyData(propertyText, scrapeDate);
      if (lead && lead.propertyAddress) {
        leads.push(lead);
      }
    }
  } catch (error) {
    log(`⚠️ Error extracting from page: ${error.message}`);
  }

  return leads;
}
|
||
|
||
/**
 * Extract leads from dashboard (legacy, kept for compatibility).
 *
 * Reads up to 20 `/property/` links and up to 20 `/person/` links from the
 * current page. Link text is parsed with parsePropertyData / parseOwnerData;
 * the link URL is attached as `propertyUrl` / `ownerUrl`. Entries without a
 * property address (or owner name) are dropped.
 *
 * @param {object} page - Puppeteer page; only `evaluate` is used.
 * @returns {Promise<object[]>} Property leads followed by owner leads; on
 *   error, whatever was collected before the failure (the error is logged).
 */
async function extractLeadsFromDashboard(page) {
  const leads = [];
  const scrapeDate = new Date().toISOString().split('T')[0];

  try {
    // Extract recently viewed properties (callback runs inside the browser page)
    const properties = await page.evaluate(() =>
      Array.from(document.querySelectorAll('a[href*="/property/"]'))
        .map((link) => ({ text: link.innerText || link.textContent || '', url: link.href }))
        .slice(0, 20)
    );

    for (const property of properties) {
      const lead = parsePropertyData(property.text, scrapeDate);
      if (lead && lead.propertyAddress) {
        lead.propertyUrl = property.url;
        leads.push(lead);
      }
    }

    // Extract recently viewed owners (callback runs inside the browser page)
    const owners = await page.evaluate(() =>
      Array.from(document.querySelectorAll('a[href*="/person/"]'))
        .map((link) => ({ text: link.innerText || link.textContent || '', url: link.href }))
        .slice(0, 20)
    );

    for (const owner of owners) {
      const ownerLead = parseOwnerData(owner.text, scrapeDate);
      if (ownerLead && ownerLead.ownerName) {
        ownerLead.ownerUrl = owner.url;
        leads.push(ownerLead);
      }
    }
  } catch (error) {
    log(`⚠️ Error extracting from dashboard: ${error.message}`);
  }

  return leads;
}
|
||
|
||
/**
 * Parse property data from text.
 *
 * The first non-empty line is taken as the property address; the first line
 * mentioning "SF", "Industrial" or "Office" is taken as the property type;
 * square footage comes from extractSquareFootage. All other fields are ''.
 *
 * @param {string|null|undefined} text - Raw text of a property card or link.
 * @param {string} scrapeDate - Date string stored on the lead as-is.
 * @returns {object} Lead record with the 14 standard fields.
 */
function parsePropertyData(text, scrapeDate) {
  // Scraped text can be missing; treat that as empty instead of throwing.
  const source = text == null ? '' : String(text);
  const lines = source
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);
  const typeKeywords = ['SF', 'Industrial', 'Office'];
  const typeLine = lines.find((line) => typeKeywords.some((keyword) => line.includes(keyword)));

  return {
    scrapeDate,
    ownerName: '',
    propertyAddress: lines[0] || '',
    city: '',
    state: '',
    zip: '',
    propertyType: typeLine || '',
    squareFootage: extractSquareFootage(source),
    ownerLocation: '',
    propertyCount: '',
    propertyUrl: '',
    ownerUrl: '',
    email: '',
    phone: '',
  };
}
|
||
|
||
/**
 * Parse owner data from text.
 *
 * The first non-empty line is taken as the owner name; the first line
 * containing a comma is taken as the owner location; the property count
 * comes from extractPropertyCount. All other fields are ''.
 *
 * @param {string|null|undefined} text - Raw text of an owner card or link.
 * @param {string} scrapeDate - Date string stored on the lead as-is.
 * @returns {object} Lead record with the 14 standard fields.
 */
function parseOwnerData(text, scrapeDate) {
  // Scraped text can be missing; treat that as empty instead of throwing.
  const source = text == null ? '' : String(text);
  const lines = source
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);
  const locationLine = lines.find((line) => line.includes(','));

  return {
    scrapeDate,
    ownerName: lines[0] || '',
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    propertyType: '',
    squareFootage: '',
    ownerLocation: locationLine || '',
    propertyCount: extractPropertyCount(source),
    propertyUrl: '',
    ownerUrl: '',
    email: '',
    phone: '',
  };
}
|
||
|
||
/**
 * Extract square footage from text.
 *
 * Finds the first "<number> SF" or "<number>k SF" (case-insensitive) and
 * returns it normalized as "<number> SF" or "<number>k SF". The number may
 * contain thousands separators and a decimal part.
 *
 * @param {string|null|undefined} text - Text to search.
 * @returns {string} Normalized square footage, or '' when none is found.
 */
function extractSquareFootage(text) {
  const source = text == null ? '' : String(text);
  // Group 1: the number (commas allowed, so "12,500" is not cut down to "500").
  // Group 2: an optional k/K multiplier, captured so upper case is kept too.
  const match = source.match(/(\d[\d,]*\.?\d*)\s*(k)?\s*SF/i);
  if (!match) {
    return '';
  }
  return match[2] ? `${match[1]}k SF` : `${match[1]} SF`;
}
|
||
|
||
/**
 * Extract property count from text.
 *
 * Looks for the first "<digits> property" / "<digits> properties"
 * (case-insensitive) and returns the digits.
 *
 * @param {string|null|undefined} text - Text to search.
 * @returns {string} The count as a string of digits, or '' when none is found.
 */
function extractPropertyCount(text) {
  const source = text == null ? '' : String(text);
  const match = source.match(/(\d+)\s*propert(?:y|ies)/i);
  return match ? match[1] : '';
}
|
||
|
||
// Run scraper: exit 0 on success, 1 on failure. scrapeLeads has already
// logged the stack and closed the browser by the time the catch handler runs.
scrapeLeads()
  .then((result) => {
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    if (result.sheetId) {
      console.log(`\n📊 View your leads at: https://docs.google.com/spreadsheets/d/${result.sheetId}`);
    }
    process.exit(0);
  })
  .catch((error) => {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  });
|