clawdbot-workspace/reonomy-scraper.js.bak

1110 lines
31 KiB
JavaScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Lead Scraper
*
* Scrapes property and owner leads from Reonomy and exports to Google Sheets.
*
* Usage:
* node reonomy-scraper.js [options]
*
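* Environment Variables:
* REONOMY_EMAIL - Reonomy login email (required)
* REONOMY_PASSWORD - Reonomy login password (required)
* REONOMY_SHEET_ID - Google Sheet ID (optional, will create new sheet if not provided)
* REONOMY_SHEET_TITLE - Title for a newly created sheet (optional, defaults to "Reonomy Leads")
* REONOMY_LOCATION - Search location (optional, defaults to "New York, NY")
* GOG_ACCOUNT - Account passed to the gog CLI via --account (optional)
* HEADLESS - Set to "true" for headless mode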
*/
const puppeteer = require('puppeteer');
const { execSync } = require('child_process');
const fs = require('fs');
const path = require('path');
// Configuration from environment variables
// Reonomy account credentials; both are mandatory (checked below).
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
// Existing Google Sheet to write to; when unset, getOrCreateSheet() tries to create one.
const SHEET_ID = process.env.REONOMY_SHEET_ID;
// Title used when a new sheet has to be created.
const SHEET_TITLE = process.env.REONOMY_SHEET_TITLE || 'Reonomy Leads';
// Text typed into the Reonomy search box.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'New York, NY';
// Headless only when HEADLESS is exactly the string "true"; any other value shows the browser.
const HEADLESS = process.env.HEADLESS === 'true';
const MAX_PROPERTIES = 20; // Property-page cap; not referenced by the code visible here, which skips property pages (no contact info there)
const MAX_OWNERS = 2; // Limit number of owners to scrape to avoid rate limiting
const PAGE_DELAY_MS = 3000; // Delay between page visits for rate limiting
// Validate credentials
// Fail fast (exit code 1) at load time if either credential is missing.
if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
console.error('❌ Error: REONOMY_EMAIL and REONOMY_PASSWORD environment variables are required.');
console.error(' Set them like: REONOMY_EMAIL="..." REONOMY_PASSWORD="..." node reonomy-scraper.js');
process.exit(1);
}
// Log file: lives next to this script and is only ever appended to.
const LOG_FILE = path.join(__dirname, 'reonomy-scraper.log');

/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * The console output is the bare message; only the file copy carries the
 * ISO-8601 timestamp prefix. The file write is synchronous.
 *
 * @param {string} message - Text to log.
 */
function log(message) {
  const isoNow = new Date().toISOString();
  const fileLine = `[${isoNow}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, fileLine);
}
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Run the `gog` CLI through the shell and return its trimmed stdout.
 *
 * When the GOG_ACCOUNT environment variable is set, `--account "<value>"` is
 * inserted before the command. The call is synchronous with a 30 second
 * timeout. Only stdout is returned; stderr is consulted solely to decide how
 * to treat a failed invocation.
 *
 * @param {string} command - Arguments appended verbatim after `gog`.
 * @returns {string} Trimmed stdout. On a failed run, stdout is still returned
 *   if it is non-blank and stderr mentions neither "error" nor "Error".
 * @throws {Error} "gog command failed: ..." carrying stderr, else stdout, else
 *   "Unknown error".
 */
function gogCommand(command) {
  try {
    const account = process.env.GOG_ACCOUNT;
    const fullCommand = account
      ? `gog --account "${account}" ${command}`
      : `gog ${command}`;
    const stdout = execSync(fullCommand, {
      encoding: 'utf-8',
      timeout: 30000,
      stdio: ['pipe', 'pipe', 'pipe']
    });
    return (stdout || '').trim();
  } catch (error) {
    // An exit status of exactly 0 is not a CLI failure: surface the error untouched.
    if (error.status === 0) {
      throw error;
    }
    const stderrText = error.stderr ? error.stderr.toString() : '';
    const stdoutText = error.stdout ? error.stdout.toString() : '';
    const stderrReportsError = stderrText.includes('error') || stderrText.includes('Error');
    // Some runs exit non-zero yet still print usable output; keep it when
    // stderr does not look like a real error report.
    if (!stderrReportsError && stdoutText && stdoutText.trim()) {
      return stdoutText.trim();
    }
    throw new Error(`gog command failed: ${stderrText || stdoutText || 'Unknown error'}`);
  }
}
/**
 * Resolve the Google Sheet that leads should be written to.
 *
 * Uses SHEET_ID when configured; otherwise asks the gog CLI to create a sheet
 * titled SHEET_TITLE and extracts the new sheet's ID from the CLI output
 * (JSON `spreadsheetId`/`id`, or the first 20+ character ID-like token when
 * the output is not JSON).
 *
 * @returns {Promise<string|null>} The sheet ID, or null when no sheet could be
 *   created or identified (callers then fall back to the JSON file).
 */
async function getOrCreateSheet() {
  log('📊 Checking Google Sheets...');
  if (SHEET_ID) {
    log(`✅ Using existing sheet: ${SHEET_ID}`);
    return SHEET_ID;
  }
  try {
    // Create a new sheet
    log('📝 Creating new Google Sheet...');
    const output = gogCommand(`sheets create "${SHEET_TITLE}" --json`);

    let parsed;
    let isJson = true;
    try {
      parsed = JSON.parse(output);
    } catch (parseError) {
      isJson = false;
    }

    let newSheetId;
    if (isJson) {
      // Valid JSON without an ID must not be reported as success: previously
      // this logged "Created new sheet: undefined" and returned undefined.
      newSheetId = parsed ? (parsed.spreadsheetId || parsed.id) : undefined;
    } else {
      // Plain-text output: take the first token that looks like a sheet ID.
      const match = output.match(/([0-9A-Za-z_-]{20,})/);
      newSheetId = match ? match[1] : undefined;
    }

    if (!newSheetId) {
      throw new Error('Could not parse sheet ID from gog output');
    }
    log(`✅ Created new sheet: ${newSheetId}`);
    return newSheetId;
  } catch (error) {
    log(`⚠️ Could not create Google Sheet: ${error.message}`);
    log('💾 Leads will be saved to JSON file instead');
    return null;
  }
}
/**
 * Write the header row into cell range Sheet1!A1 of the given sheet.
 *
 * Failures are logged and swallowed, so this never rejects because of a gog
 * error.
 *
 * @param {string} sheetId - Target Google Sheet ID.
 * @returns {Promise<void>}
 */
async function initializeSheet(sheetId) {
  log('📋 Initializing sheet headers...');
  const columnTitles = [
    'Scrape Date',
    'Owner Name',
    'Property Address',
    'City',
    'State',
    'ZIP',
    'Property Type',
    'Square Footage',
    'Owner Location',
    'Property Count',
    'Property URL',
    'Owner URL',
    'Email',
    'Phone'
  ];
  // Each title becomes one double-quoted CLI argument.
  const quotedTitles = [];
  for (const title of columnTitles) {
    quotedTitles.push(`"${title}"`);
  }
  const headerString = quotedTitles.join(' ');
  try {
    gogCommand(`sheets update ${sheetId} "Sheet1!A1" ${headerString}`);
    log('✅ Sheet headers initialized');
  } catch (error) {
    log(`⚠️ Could not set headers: ${error.message}`);
  }
}
/**
 * Append one lead as a row to the Google Sheet, or queue it for the JSON file.
 *
 * With a sheet ID, the values of `rowData` (in property order, which must
 * match the sheet's column order) are passed to `gog sheets append` as one
 * shell argument per cell. Without a sheet ID the lead is pushed onto the
 * module-level `jsonLeads` array instead.
 *
 * Cell values come from scraped web pages, so every cell is wrapped in POSIX
 * single quotes (embedded `'` becomes `'\''`). This keeps `$()`, backticks,
 * `"` and similar characters literal instead of letting the shell interpret
 * them. NOTE(review): single-quote escaping assumes a POSIX shell, which
 * matches the /tmp paths used elsewhere in this script.
 *
 * @param {string|null|undefined} sheetId - Target sheet; falsy selects the JSON fallback.
 * @param {object} rowData - Lead record; reads ownerName and propertyAddress for logging.
 * @returns {Promise<void>} Append errors are logged, not thrown.
 */
async function appendToSheet(sheetId, rowData) {
  if (!sheetId) {
    // Save to JSON file
    jsonLeads.push(rowData);
    log(`✅ Collected: ${rowData.ownerName} - ${rowData.propertyAddress}`);
    return;
  }

  // null/undefined still produce an (empty) argument so later columns do not shift left.
  const shellQuote = (value) => {
    const text = value === null || value === undefined ? '' : String(value);
    return `'${text.replace(/'/g, "'\\''")}'`;
  };
  const values = Object.values(rowData).map(shellQuote).join(' ');

  try {
    gogCommand(`sheets append ${sheetId} "Sheet1!A:N" ${values}`);
    log(`✅ Added: ${rowData.ownerName} - ${rowData.propertyAddress}`);
  } catch (error) {
    log(`❌ Error appending to sheet: ${error.message}`);
  }
}
/**
 * Write the collected leads to reonomy-leads.json beside this script.
 *
 * The file holds the write timestamp, the lead count, the configured search
 * location and the leads themselves, pretty-printed with 2-space indent.
 *
 * @param {Array<object>} leads - Lead records to persist.
 * @returns {string|null} Path of the written file, or null if writing failed
 *   (the failure is logged).
 */
function saveToJsonFile(leads) {
  const filename = path.join(__dirname, 'reonomy-leads.json');
  const payload = {
    scrapeDate: new Date().toISOString(),
    leadCount: leads.length,
    location: SEARCH_LOCATION,
    leads
  };
  let savedPath = filename;
  try {
    const serialized = JSON.stringify(payload, null, 2);
    fs.writeFileSync(filename, serialized);
    log(`💾 Saved ${leads.length} leads to ${filename}`);
  } catch (error) {
    log(`❌ Error saving to JSON: ${error.message}`);
    savedPath = null;
  }
  return savedPath;
}
// Global array to store leads when not using Google Sheets.
// appendToSheet() pushes onto it when called without a sheet ID, and
// scrapeLeads() hands it to saveToJsonFile() once all leads are processed.
let jsonLeads = [];
/**
 * Visit a property detail page and scrape contact and property details.
 *
 * Each field is looked up through an ordered list of candidate selectors
 * (most specific first). Email and phone additionally fall back to a regex
 * sweep over the page's visible text, which is best-effort and may pick up
 * any address/number shown on the page.
 *
 * Compared with a naive "first element wins" lookup this:
 *  - skips elements whose text is empty or unusable and keeps trying the
 *    remaining selectors;
 *  - reads the address/number from a mailto:/tel: href when the link text
 *    itself (e.g. "Email") does not contain one;
 *  - trims surrounding whitespace from every extracted value.
 *
 * @param {object} page - Puppeteer page used for navigation and evaluation.
 * @param {string} propertyUrl - URL of the property detail page.
 * @returns {Promise<object>} Object with email, phone, ownerName,
 *   propertyAddress, city, state, zip, propertyType and squareFootage.
 *   Missing fields are empty strings; city/state/zip are never filled in
 *   here. Navigation or evaluation errors are logged and yield an all-empty
 *   object rather than a rejection.
 */
async function extractPropertyContactInfo(page, propertyUrl) {
  log(` 🏠 Visiting property: ${propertyUrl}`);
  try {
    await page.goto(propertyUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(2000); // Wait for dynamic content to load

    const contactInfo = await page.evaluate(() => {
      // This callback runs inside the browser page, so every helper it uses
      // has to be declared within it.
      const info = {
        email: '',
        phone: '',
        ownerName: '',
        propertyAddress: '',
        city: '',
        state: '',
        zip: '',
        propertyType: '',
        squareFootage: ''
      };

      // Trimmed visible text of an element ('' when it has none).
      const textOf = (el) => (el.innerText || el.textContent || '').trim();

      // Text of the first selector whose element passes `accept`, else ''.
      const firstText = (selectors, accept) => {
        for (const selector of selectors) {
          const el = document.querySelector(selector);
          if (!el) {
            continue;
          }
          const text = textOf(el);
          if (accept(text)) {
            return text;
          }
        }
        return '';
      };

      // Like firstText, for contact elements: strips a leading scheme from
      // the text and, if the text is not a plausible value, falls back to
      // the target of a matching mailto:/tel: href.
      const firstContact = (selectors, scheme, looksValid) => {
        for (const selector of selectors) {
          const el = document.querySelector(selector);
          if (!el) {
            continue;
          }
          let text = textOf(el);
          if (text.startsWith(scheme)) {
            text = text.slice(scheme.length).trim();
          }
          if (looksValid(text)) {
            return text;
          }
          const href = el.getAttribute('href') || '';
          if (href.startsWith(scheme)) {
            // Drop any "?subject=..." style parameters after the address.
            const target = href.slice(scheme.length).split('?')[0].trim();
            if (looksValid(target)) {
              return target;
            }
          }
        }
        return '';
      };

      // Extract email - multiple possible selectors (specific IDs first)
      info.email = firstContact([
        '#people-contact-email-id',
        '[data-person-id="people-contact-email-id"]',
        'a[href^="mailto:"]',
        '[data-test*="email"]',
        '[data-testid*="email"]',
        '.email-address',
        '.owner-email',
        '.contact-info [data-test*="email"]'
      ], 'mailto:', (value) => value.includes('@'));

      // Extract phone - multiple possible selectors (specific IDs first)
      info.phone = firstContact([
        '#people-contact-phone-1',
        '#people-contact-phone-2',
        '#people-contact-phone-3',
        '[data-person-id="people-contact-phone-1"]',
        '[data-person-id="people-contact-phone-2"]',
        '[data-person-id="people-contact-phone-3"]',
        'a[href^="tel:"]',
        '[data-test*="phone"]',
        '[data-testid*="phone"]',
        '.phone-number',
        '.contact-info [data-test*="phone"]',
        '.owner-phone'
      ], 'tel:', (value) => /\d/.test(value));

      // Also try to extract from text content by regex
      const bodyText = document.body.innerText;

      // Email regex patterns
      const emailPatterns = [
        /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
        /Email[:\s]*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i
      ];
      if (!info.email) {
        for (const pattern of emailPatterns) {
          const match = bodyText.match(pattern);
          if (match && match[0]) {
            info.email = match[0].replace(/^email[:\s]*/i, '');
            break;
          }
        }
      }

      // Phone regex patterns
      const phonePatterns = [
        /\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})/g,
        /\+?1?[-.\s]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})/g,
        /Phone[:\s]*[+]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})/i,
        /Tel[:\s]*[+]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})/i
      ];
      if (!info.phone) {
        for (const pattern of phonePatterns) {
          const matches = bodyText.match(pattern);
          if (matches) {
            // Use the first phone-shaped match found
            info.phone = matches[0].replace(/^phone[:\s]*/i, '').replace(/^tel[:\s]*/i, '');
            break;
          }
        }
      }

      // Extract owner name from property page (headings are the last resort)
      info.ownerName = firstText([
        '[data-test*="owner"]',
        '[data-testid*="owner"]',
        '.owner-name',
        '.owner',
        'h1',
        'h2'
      ], (text) => text.length > 2 && text.length < 100);

      // Extract property address: only text containing a digit qualifies
      info.propertyAddress = firstText([
        '[data-test*="address"]',
        '[data-testid*="address"]',
        '.property-address',
        '.address',
        'h1',
        'h2'
      ], (text) => /\d+/.test(text));

      // Extract property type
      info.propertyType = firstText([
        '[data-test*="type"]',
        '[data-testid*="type"]',
        '.property-type',
        '.type'
      ], (text) => text.length > 0);

      // Extract square footage
      info.squareFootage = firstText([
        '[data-test*="sf"]',
        '[data-testid*="sf"]',
        '.square-footage',
        '.sf',
        '.sqft'
      ], (text) => text.length > 0);

      return info;
    });

    log(` 📧 Email: ${contactInfo.email || 'Not found'}`);
    log(` 📞 Phone: ${contactInfo.phone || 'Not found'}`);
    return contactInfo;
  } catch (error) {
    log(` ⚠️ Error extracting from property page: ${error.message}`);
    return {
      email: '',
      phone: '',
      ownerName: '',
      propertyAddress: '',
      city: '',
      state: '',
      zip: '',
      propertyType: '',
      squareFootage: ''
    };
  }
}
/**
 * Visit an owner (person) detail page and scrape contact details.
 *
 * Before extraction, a full-page screenshot and the page HTML are written to
 * /tmp/reonomy-owner-<id>.png/.html for debugging. Saving those artifacts is
 * best-effort: a failure is logged and extraction continues (previously it
 * aborted the whole owner and returned empty data).
 *
 * Each field is looked up through an ordered list of candidate selectors
 * (most specific first). Email and phone additionally fall back to a regex
 * sweep over the page's visible text, and the property count falls back to an
 * "N properties" match in that text. Elements with empty or unusable text are
 * skipped, mailto:/tel: hrefs are used when the link text carries no value,
 * and extracted values are trimmed.
 *
 * @param {object} page - Puppeteer page used for navigation and evaluation.
 * @param {string} ownerUrl - URL of the owner page; the ID after "person/"
 *   names the debug files ("unknown" if absent).
 * @returns {Promise<object>} Object with email, phone, ownerName,
 *   ownerLocation and propertyCount; missing fields are empty strings.
 *   Navigation or evaluation errors are logged and yield an all-empty object.
 */
async function extractOwnerContactInfo(page, ownerUrl) {
  log(` 👤 Visiting owner: ${ownerUrl}`);
  try {
    await page.goto(ownerUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(2000); // Wait for dynamic content to load

    // DEBUG: save screenshot and HTML; never let this block the scrape itself.
    try {
      const ownerMatch = ownerUrl.match(/person\/([a-zA-Z0-9_-]+)/);
      const ownerId = ownerMatch ? ownerMatch[1] : 'unknown';
      const debugPath = `/tmp/reonomy-owner-${ownerId}.png`;
      await page.screenshot({ path: debugPath, fullPage: true });
      log(` 📸 Debug screenshot saved: ${debugPath}`);
      const htmlPath = `/tmp/reonomy-owner-${ownerId}.html`;
      const htmlContent = await page.content();
      fs.writeFileSync(htmlPath, htmlContent);
      log(` 📄 Debug HTML saved: ${htmlPath}`);
    } catch (debugError) {
      log(` ⚠️ Could not save debug artifacts: ${debugError.message}`);
    }

    const contactInfo = await page.evaluate(() => {
      // This callback runs inside the browser page, so every helper it uses
      // has to be declared within it.
      const info = {
        email: '',
        phone: '',
        ownerName: '',
        ownerLocation: '',
        propertyCount: ''
      };

      // Trimmed visible text of an element ('' when it has none).
      const textOf = (el) => (el.innerText || el.textContent || '').trim();

      // Text of the first selector whose element passes `accept`, else ''.
      const firstText = (selectors, accept) => {
        for (const selector of selectors) {
          const el = document.querySelector(selector);
          if (!el) {
            continue;
          }
          const text = textOf(el);
          if (accept(text)) {
            return text;
          }
        }
        return '';
      };

      // Like firstText, for contact elements: strips a leading scheme from
      // the text and, if the text is not a plausible value, falls back to
      // the target of a matching mailto:/tel: href.
      const firstContact = (selectors, scheme, looksValid) => {
        for (const selector of selectors) {
          const el = document.querySelector(selector);
          if (!el) {
            continue;
          }
          let text = textOf(el);
          if (text.startsWith(scheme)) {
            text = text.slice(scheme.length).trim();
          }
          if (looksValid(text)) {
            return text;
          }
          const href = el.getAttribute('href') || '';
          if (href.startsWith(scheme)) {
            // Drop any "?subject=..." style parameters after the address.
            const target = href.slice(scheme.length).split('?')[0].trim();
            if (looksValid(target)) {
              return target;
            }
          }
        }
        return '';
      };

      // Extract email - multiple possible selectors (specific IDs first)
      info.email = firstContact([
        '#people-contact-email-id',
        '[data-person-id="people-contact-email-id"]',
        'a[href^="mailto:"]',
        '[data-test*="email"]',
        '[data-testid*="email"]',
        '.email-address',
        '.owner-email',
        '.contact-info [data-test*="email"]'
      ], 'mailto:', (value) => value.includes('@'));

      // Extract phone - multiple possible selectors (specific IDs first)
      info.phone = firstContact([
        '#people-contact-phone-1',
        '#people-contact-phone-2',
        '#people-contact-phone-3',
        '[data-person-id="people-contact-phone-1"]',
        '[data-person-id="people-contact-phone-2"]',
        '[data-person-id="people-contact-phone-3"]',
        'a[href^="tel:"]',
        '[data-test*="phone"]',
        '[data-testid*="phone"]',
        '.phone-number',
        '.contact-info [data-test*="phone"]',
        '.owner-phone'
      ], 'tel:', (value) => /\d/.test(value));

      // Also try to extract from text content by regex
      const bodyText = document.body.innerText;

      // Email regex patterns
      const emailPatterns = [
        /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
        /Email[:\s]*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i
      ];
      if (!info.email) {
        for (const pattern of emailPatterns) {
          const match = bodyText.match(pattern);
          if (match && match[0]) {
            info.email = match[0].replace(/^email[:\s]*/i, '');
            break;
          }
        }
      }

      // Phone regex patterns
      const phonePatterns = [
        /\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})/g,
        /\+?1?[-.\s]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})/g,
        /Phone[:\s]*[+]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})/i,
        /Tel[:\s]*[+]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})/i
      ];
      if (!info.phone) {
        for (const pattern of phonePatterns) {
          const matches = bodyText.match(pattern);
          if (matches) {
            // Use the first phone-shaped match found
            info.phone = matches[0].replace(/^phone[:\s]*/i, '').replace(/^tel[:\s]*/i, '');
            break;
          }
        }
      }

      // Extract owner name (headings are the last resort)
      info.ownerName = firstText([
        '[data-test*="name"]',
        '[data-testid*="name"]',
        '.owner-name',
        '.person-name',
        'h1',
        'h2'
      ], (text) => text.length > 2 && text.length < 100);

      // Extract owner location: only "City, ST"-like text with a comma qualifies
      info.ownerLocation = firstText([
        '[data-test*="location"]',
        '[data-testid*="location"]',
        '.location',
        '.owner-location',
        '.city-state'
      ], (text) => text.includes(','));

      // Extract property count: the element text must contain a number
      info.propertyCount = firstText([
        '[data-test*="property-count"]',
        '[data-testid*="property-count"]',
        '.property-count',
        '.properties-owned',
        '.total-properties'
      ], (text) => /\d+/.test(text));

      // Also try to extract property count from text
      if (!info.propertyCount) {
        const countMatch = bodyText.match(/(\d+)\s*propert(?:y|ies)/i);
        if (countMatch) {
          info.propertyCount = countMatch[1];
        }
      }

      return info;
    });

    log(` 📧 Email: ${contactInfo.email || 'Not found'}`);
    log(` 📞 Phone: ${contactInfo.phone || 'Not found'}`);
    return contactInfo;
  } catch (error) {
    log(` ⚠️ Error extracting from owner page: ${error.message}`);
    return {
      email: '',
      phone: '',
      ownerName: '',
      ownerLocation: '',
      propertyCount: ''
    };
  }
}
/**
 * Main scraper function.
 *
 * Flow: resolve/prepare the Google Sheet (or fall back to a JSON file), log
 * into Reonomy, run a search for SEARCH_LOCATION, collect owner links from
 * the results, visit up to MAX_OWNERS owner pages (PAGE_DELAY_MS apart) to
 * extract contact info, then save each lead. Property pages are deliberately
 * not visited.
 *
 * @returns {Promise<{sheetId: (string|null|undefined), leadCount: number, jsonFile: (string|null)}>}
 *   Summary of the run.
 * @throws {Error} Any failure during the run (e.g. login failure) is logged,
 *   an error screenshot is attempted, and the error is rethrown. The browser
 *   is always closed.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Lead Scraper...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  let page;
  let sheetId;
  try {
    // Created inside the try so the finally block closes the browser even if
    // page setup itself fails.
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Setup Google Sheet
    sheetId = await getOrCreateSheet();
    // If we have a sheet, initialize headers
    if (sheetId) {
      // Check if sheet has headers by trying to get them
      try {
        const existingData = gogCommand(`sheets get ${sheetId} "Sheet1!A1:N1" --plain`);
        if (!existingData.includes('Owner Name')) {
          await initializeSheet(sheetId);
        }
      } catch (error) {
        // Sheet might be empty, initialize it
        await initializeSheet(sheetId);
      }
    } else {
      // No sheet available, prepare to save to file
      log('💾 Will save leads to: reonomy-leads.json');
    }

    // Step 2: Login to Reonomy
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    // Fill credentials
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    // Submit login
    await page.click('button[type="submit"]');
    log('⏳ Logging in...');
    await sleep(8000);
    // Still being on a login/auth URL after the wait is treated as a failed login
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Step 3: Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);
    log('✅ On search page');

    // Step 4: Enter search query
    log(`\n📍 Step 3: Searching for: ${SEARCH_LOCATION}`);
    const searchInput = await page
      .waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
        timeout: 10000
      })
      // Try alternative selector
      .catch(() => page.waitForSelector('input[type="text"]', { timeout: 5000 }))
      // Neither selector appeared: resolve to null so the warning branch below
      // runs instead of the timeout aborting the whole scrape.
      .catch(() => null);
    if (searchInput) {
      await searchInput.click({ clickCount: 3 }); // Select all
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      // Press Enter to search
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      // Wait for results to load
      await sleep(5000);
    } else {
      log('⚠️ Could not find search input, trying alternative method...');
    }

    // Step 5: Extract leads from the page
    log('\n📍 Step 4: Finding owner links (contact info is on owner pages)...');
    // Only owner links are needed; property links are not visited.
    const { ownerLinks } = await extractLinksFromPage(page);
    log(`👤 Found ${ownerLinks.length} owner links`);
    const leads = [];
    const scrapeDate = new Date().toISOString().split('T')[0];
    // Skip property pages - no contact info there
    log('\n📍 Step 5: Skipping property pages (no contact info)...');

    // Step 6: Visit owner pages to extract contact info
    log('\n📍 Step 6: Extracting contact info from owner pages...');
    const ownersToScrape = ownerLinks.slice(0, MAX_OWNERS);
    for (let i = 0; i < ownersToScrape.length; i++) {
      log(`\n[${i + 1}/${ownersToScrape.length}]`);
      const ownerUrl = ownersToScrape[i];
      const contactInfo = await extractOwnerContactInfo(page, ownerUrl);
      // Parse owner ID from URL; used as the name when the page yields none
      const ownerMatch = ownerUrl.match(/person\/([^/]+)/);
      const ownerId = ownerMatch ? ownerMatch[1] : '';
      // Key order matters: appendToSheet writes values in this order, which
      // mirrors the sheet's header columns.
      const lead = {
        scrapeDate,
        ownerName: contactInfo.ownerName || ownerId,
        propertyAddress: '',
        city: '',
        state: '',
        zip: '',
        propertyType: '',
        squareFootage: '',
        ownerLocation: contactInfo.ownerLocation || '',
        propertyCount: contactInfo.propertyCount || '',
        propertyUrl: '',
        ownerUrl: ownerUrl,
        email: contactInfo.email || '',
        phone: contactInfo.phone || ''
      };
      leads.push(lead);
      // Rate limiting between page visits
      if (i < ownersToScrape.length - 1) {
        await sleep(PAGE_DELAY_MS);
      }
    }
    log(`\n✅ Found ${leads.length} total leads`);

    if (leads.length === 0) {
      log('\n⚠ No leads extracted. The page structure may have changed.');
      log(' Please check the screenshot and logs for details.');
      // Save screenshot for debugging
      await page.screenshot({ path: '/tmp/reonomy-no-leads.png', fullPage: true });
      log('📸 Screenshot saved: /tmp/reonomy-no-leads.png');
    } else {
      // Step 7: Save leads
      log('\n📍 Step 7: Saving leads...');
      for (const lead of leads) {
        await appendToSheet(sheetId, lead);
        await sleep(500); // Rate limiting
      }
      // If no sheet, save to JSON
      if (!sheetId && jsonLeads.length > 0) {
        saveToJsonFile(jsonLeads);
      }
    }

    log('\n✅ Scraping complete!');
    if (sheetId) {
      log(`📊 Google Sheet: https://docs.google.com/spreadsheets/d/${sheetId}`);
    } else {
      log('💾 Leads saved to: reonomy-leads.json');
    }
    log(`📝 Log file: ${LOG_FILE}`);
    return { sheetId, leadCount: leads.length, jsonFile: sheetId ? null : 'reonomy-leads.json' };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // Save error screenshot (only possible once a page exists)
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-error.png');
      } catch (e) {
        // Ignore screenshot errors: the original failure is what gets rethrown
      }
    }
    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
/**
 * Collect property and owner links from the page currently loaded.
 *
 * Every anchor's href is scanned for a "property/<id>" or
 * "person/<id>" / "owner/<id>" segment; matches are rebuilt as canonical
 * app.reonomy.com URLs (owners always under "person/") and de-duplicated,
 * keeping first-seen order.
 *
 * @param {object} page - Puppeteer page to evaluate in.
 * @returns {Promise<{propertyLinks: string[], ownerLinks: string[]}>}
 *   Evaluation errors are logged and yield whatever was gathered so far.
 */
async function extractLinksFromPage(page) {
  const propertyLinks = [];
  const ownerLinks = [];
  try {
    const found = await page.evaluate(() => {
      // Sets give first-seen-order de-duplication for free.
      const uniqueProperties = new Set();
      const uniqueOwners = new Set();
      for (const anchor of Array.from(document.querySelectorAll('a'))) {
        const href = anchor.href || '';
        if (href.includes('/property/')) {
          const propertyMatch = href.match(/property\/([a-zA-Z0-9_-]+)/);
          if (propertyMatch) {
            uniqueProperties.add(`https://app.reonomy.com/#!/property/${propertyMatch[1]}`);
          }
        }
        if (href.includes('/person/') || href.includes('/owner/')) {
          const ownerMatch = href.match(/(?:person|owner)\/([a-zA-Z0-9_-]+)/);
          if (ownerMatch) {
            uniqueOwners.add(`https://app.reonomy.com/#!/person/${ownerMatch[1]}`);
          }
        }
      }
      return {
        propertyUrls: [...uniqueProperties],
        ownerUrls: [...uniqueOwners]
      };
    });
    propertyLinks.push(...found.propertyUrls);
    ownerLinks.push(...found.ownerUrls);
  } catch (error) {
    log(`⚠️ Error extracting links: ${error.message}`);
  }
  return { propertyLinks, ownerLinks };
}
/**
 * Extract leads from search results page (legacy, kept for compatibility).
 *
 * Takes the text of the first group of "card"-like elements found (trying a
 * list of selectors in order); if none exist, falls back to address-shaped
 * strings in the page text. At most 50 entries are parsed, and only those
 * that yield a property address are kept.
 *
 * @param {object} page - Puppeteer page to evaluate in.
 * @returns {Promise<Array<object>>} Parsed leads; errors are logged and the
 *   leads gathered so far are returned.
 */
async function extractLeadsFromPage(page) {
  const leads = [];
  try {
    const rawEntries = await page.evaluate(() => {
      // Candidate selectors for property cards, tried in order.
      const cardSelectors = [
        '[data-test*="property"]',
        '[data-testid*="property"]',
        '.property-card',
        '.listing-card',
        '.search-result',
        '.result-item'
      ];
      let texts = [];
      for (const selector of cardSelectors) {
        const cards = document.querySelectorAll(selector);
        if (cards.length > 0) {
          // Only the first selector that matches anything is used.
          texts = Array.from(cards, (card) => card.innerText);
          break;
        }
      }
      if (texts.length === 0) {
        // No structured cards: scan the whole page for address-like strings.
        const addressPattern = /\d+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s*\d{5}/g;
        const addresses = document.body.innerText.match(addressPattern) || [];
        texts = [...addresses];
      }
      return texts.slice(0, 50); // Limit results
    });
    const scrapeDate = new Date().toISOString().split('T')[0];
    for (const entry of rawEntries) {
      const candidate = parsePropertyData(entry, scrapeDate);
      if (candidate && candidate.propertyAddress) {
        leads.push(candidate);
      }
    }
  } catch (error) {
    log(`⚠️ Error extracting from page: ${error.message}`);
  }
  return leads;
}
/**
 * Extract leads from dashboard (legacy, kept for compatibility).
 *
 * Reads up to 20 property links and then up to 20 person links from the
 * current page, parses each link's text, and keeps property entries that have
 * an address and owner entries that have a name. Each kept lead records the
 * link's URL.
 *
 * @param {object} page - Puppeteer page to evaluate in.
 * @returns {Promise<Array<object>>} Property leads followed by owner leads;
 *   errors are logged and the leads gathered so far are returned.
 */
async function extractLeadsFromDashboard(page) {
  const leads = [];
  const scrapeDate = new Date().toISOString().split('T')[0];

  // Text + URL of the first 20 anchors matching `selector` on the page.
  const collectAnchors = (selector) => page.evaluate((anchorSelector) => {
    const anchors = Array.from(document.querySelectorAll(anchorSelector));
    return anchors
      .map((anchor) => ({
        text: anchor.innerText || anchor.textContent,
        url: anchor.href
      }))
      .slice(0, 20);
  }, selector);

  try {
    // Recently viewed properties
    const propertyAnchors = await collectAnchors('a[href*="/property/"]');
    for (const { text, url } of propertyAnchors) {
      const propertyLead = parsePropertyData(text, scrapeDate);
      if (propertyLead && propertyLead.propertyAddress) {
        propertyLead.propertyUrl = url;
        leads.push(propertyLead);
      }
    }
    // Recently viewed owners
    const ownerAnchors = await collectAnchors('a[href*="/person/"]');
    for (const { text, url } of ownerAnchors) {
      const ownerLead = parseOwnerData(text, scrapeDate);
      if (ownerLead && ownerLead.ownerName) {
        ownerLead.ownerUrl = url;
        leads.push(ownerLead);
      }
    }
  } catch (error) {
    log(`⚠️ Error extracting from dashboard: ${error.message}`);
  }
  return leads;
}
/**
 * Build a lead record from the raw text of a property listing.
 *
 * The first non-blank line is taken as the address; the first line mentioning
 * "SF", "Industrial" or "Office" as the property type. All owner/contact
 * fields are left empty.
 *
 * @param {string} text - Multi-line listing text.
 * @param {string} scrapeDate - Date stamp stored on the lead.
 * @returns {object} Lead record with the standard column order.
 */
function parsePropertyData(text, scrapeDate) {
  const lines = [];
  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    if (line) {
      lines.push(line);
    }
  }
  const typeKeywords = ['SF', 'Industrial', 'Office'];
  const typeLine = lines.find((line) => typeKeywords.some((keyword) => line.includes(keyword)));
  // Property order is significant: rows are written to the sheet in this order.
  return {
    scrapeDate,
    ownerName: '',
    propertyAddress: lines[0] || '',
    city: '',
    state: '',
    zip: '',
    propertyType: typeLine || '',
    squareFootage: extractSquareFootage(text),
    ownerLocation: '',
    propertyCount: '',
    propertyUrl: '',
    ownerUrl: '',
    email: '',
    phone: ''
  };
}
/**
 * Build a lead record from the raw text of an owner listing.
 *
 * The first non-blank line is taken as the owner name and the first line
 * containing a comma as the owner location. All property/contact fields are
 * left empty.
 *
 * @param {string} text - Multi-line listing text.
 * @param {string} scrapeDate - Date stamp stored on the lead.
 * @returns {object} Lead record with the standard column order.
 */
function parseOwnerData(text, scrapeDate) {
  const lines = [];
  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    if (line) {
      lines.push(line);
    }
  }
  const [firstLine] = lines;
  const locationLine = lines.find((line) => line.includes(','));
  // Property order is significant: rows are written to the sheet in this order.
  return {
    scrapeDate,
    ownerName: firstLine || '',
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    propertyType: '',
    squareFootage: '',
    ownerLocation: locationLine || '',
    propertyCount: extractPropertyCount(text),
    propertyUrl: '',
    ownerUrl: '',
    email: '',
    phone: ''
  };
}
/**
 * Extract square footage from text.
 *
 * Recognizes a number (optionally with thousands separators and/or a decimal
 * part) followed by an optional "k" multiplier and "SF", in any letter case.
 * The "k" is captured by the regex itself, so "10K SF" keeps its multiplier
 * instead of being reported as "10 SF", and "12,500 SF" is not truncated to
 * "500 SF".
 *
 * @param {string} text - Text to search.
 * @returns {string} "<number> SF" or "<number>k SF" for the first match, or
 *   '' when there is none.
 */
function extractSquareFootage(text) {
  const match = text.match(/(\d+(?:,\d{3})*\.?\d*)\s*(k)?\s*SF/i);
  if (!match) {
    return '';
  }
  const [, amount, thousands] = match;
  return thousands ? `${amount}k SF` : `${amount} SF`;
}
/**
 * Pull the number that precedes "property"/"properties" out of a text.
 *
 * @param {string} text - Text to search (matching is case-insensitive).
 * @returns {string} The digits of the first match, or '' when none is found.
 */
function extractPropertyCount(text) {
  const found = text.match(/(\d+)\s*propert(?:y|ies)/i);
  if (found === null) {
    return '';
  }
  return found[1];
}
// Run scraper: exit 0 after a successful run, exit 1 after logging any failure.
(async () => {
  try {
    const result = await scrapeLeads();
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    if (result.sheetId) {
      console.log(`\n📊 View your leads at: https://docs.google.com/spreadsheets/d/${result.sheetId}`);
    }
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
})();