443 lines
13 KiB
JavaScript
443 lines
13 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
/**
|
|
* Simple Reonomy Lead Scraper - v2
|
|
*
|
|
* Focus: Capture ANY available data without getting stuck on empty email/phone fields
|
|
*/
|
|
|
|
const puppeteer = require('puppeteer');
|
|
const { execSync } = require('child_process');
|
|
const fs = require('fs');
|
|
|
|
// Configuration
// Credentials are read from the environment only. Real account secrets must
// never be committed as fallback values: besides leaking them, a non-empty
// fallback makes the validation below impossible to trigger.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'New York, NY';
const MAX_LEADS = 2; // Just scrape 2 owners as user requested

// Validate credentials: abort early with usage help when either is missing.
if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
  console.error('❌ Error: REONOMY_EMAIL and REONOMY_PASSWORD environment variables are required.');
  console.error(' Set them like:');
  console.error(` REONOMY_EMAIL="your@email.com"`);
  console.error(` REONOMY_PASSWORD="yourpassword"`);
  console.error(' Or run: REONOMY_EMAIL="your@email.com" REONOMY_PASSWORD="yourpassword" node reonomy-scraper.js');
  process.exit(1);
}
|
|
|
|
// Log file
// Defaults to the original workspace path. Set REONOMY_LOG_FILE to redirect
// the log when running on a machine where that directory does not exist.
const LOG_FILE = process.env.REONOMY_LOG_FILE || '/Users/jakeshore/.clawdbot/workspace/reonomy-simple.log';

// Flipped after the first failed write so the warning is printed only once.
let logFileWarningShown = false;

/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * File logging is best-effort: if the log file cannot be written (missing
 * directory, permissions, ...) a single warning is printed to stderr and the
 * scraper keeps running instead of crashing on its first log call.
 *
 * @param {*} message - Value to log; it is interpolated into a string.
 * @returns {void}
 */
function log(message) {
  const timestamp = new Date().toISOString();
  const logMessage = `[${timestamp}] ${message}\n`;
  console.log(message);

  try {
    fs.appendFileSync(LOG_FILE, logMessage);
  } catch (error) {
    if (!logFileWarningShown) {
      logFileWarningShown = true;
      console.error(`⚠️ Could not write to log file ${LOG_FILE}: ${error.message}`);
    }
  }
}
|
|
|
|
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay before the returned promise settles.
 * @returns {Promise<void>} Promise that resolves (with no value) after the delay.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
/**
 * Run the `gog` CLI through the shell and return what it printed.
 *
 * The argument string is appended verbatim after `gog `, so callers are
 * responsible for quoting their own arguments.
 *
 * @param {string} command - Arguments to pass to `gog`.
 * @returns {string|null} Trimmed stdout, or null when the command fails or
 *   exceeds the 30 second timeout (the failure is logged, not thrown).
 */
function gogCommand(command) {
  const execOptions = { encoding: 'utf-8', timeout: 30000 };

  try {
    const stdout = execSync(`gog ${command}`, execOptions);
    return stdout.trim();
  } catch (error) {
    log(`⚠️ gog command failed: ${error.message}`);
  }

  return null;
}
|
|
|
|
/**
 * Get or create Google Sheet
 *
 * Uses the sheet named by the REONOMY_SHEET_ID environment variable when it is
 * set; otherwise asks `gog` to create a sheet called "Reonomy Leads" and
 * extracts the new sheet's ID from the command output.
 *
 * @returns {Promise<string>} ID of the sheet to write leads to.
 * @throws {Error} When the gog command fails or no sheet ID can be found in
 *   its output.
 */
async function getOrCreateSheet() {
  log('📊 Checking Google Sheets...');

  const SHEET_ID = process.env.REONOMY_SHEET_ID;

  if (SHEET_ID) {
    log(`✅ Using existing sheet: ${SHEET_ID}`);
    return SHEET_ID;
  }

  // Create a new sheet
  log('📝 Creating new Google Sheet...');
  const output = gogCommand(`sheets create "Reonomy Leads" --json`);

  // gogCommand signals failure with null; without this guard the fallback
  // below would crash with a TypeError on `output.match`.
  if (typeof output !== 'string') {
    throw new Error('Could not create Google Sheet: gog command failed');
  }

  try {
    const result = JSON.parse(output);
    const newSheetId = result && (result.spreadsheetId || result.id);
    if (newSheetId) {
      log(`✅ Created new sheet: ${newSheetId}`);
      return newSheetId;
    }
    // Valid JSON without an ID: fall through to the text fallback instead of
    // returning undefined as the sheet ID.
    log('⚠️ gog JSON output did not contain a sheet ID');
  } catch (error) {
    log(`⚠️ Could not parse gog JSON output: ${error.message}`);
  }

  // Try to extract ID from text output
  const match = output.match(/([0-9A-Za-z_-]{20,})/);
  if (match) {
    log(`✅ Extracted sheet ID from output: ${match[0]}`);
    return match[0];
  }

  throw new Error('Could not parse sheet ID from gog output');
}
|
|
|
|
/**
 * Initialize sheet with headers
 *
 * Writes the 14 column titles to Sheet1!A1 via `gog sheets update`. Failures
 * are logged and never thrown, so a header problem does not stop the scrape.
 *
 * @param {string} sheetId - ID of the target Google Sheet.
 * @returns {Promise<void>}
 */
async function initializeSheet(sheetId) {
  log('📋 Initializing sheet headers...');

  const headers = [
    'Scrape Date', 'Owner Name', 'Property Address', 'City', 'State', 'ZIP',
    'Property Type', 'Square Footage', 'Owner Location', 'Property Count',
    'Property URL', 'Owner URL', 'Email', 'Phone'
  ];

  const headerString = headers.map(h => `"${h}"`).join(' ');

  try {
    const output = gogCommand(`sheets update ${sheetId} "Sheet1!A1" ${headerString}`);

    // gogCommand reports failure by returning null rather than throwing, so
    // the result has to be checked before claiming success.
    if (output === null) {
      log('⚠️ Could not set headers: gog command failed');
      return;
    }

    log('✅ Sheet headers initialized');
  } catch (error) {
    log(`⚠️ Could not set headers: ${error.message}`);
  }
}
|
|
|
|
/**
 * Lead fields in the same order as the sheet's header row
 * (Scrape Date ... Phone), so each value lands under its own column.
 */
const SHEET_COLUMNS = [
  'scrapeDate', 'ownerName', 'propertyAddress', 'city', 'state', 'zip',
  'propertyType', 'squareFootage', 'ownerLocation', 'propertyCount',
  'propertyUrl', 'ownerUrl', 'email', 'phone'
];

/**
 * Quote a value as one literal POSIX shell argument.
 *
 * Single quotes disable every expansion; an embedded single quote is written
 * as '\'' (close quote, escaped quote, reopen). Scraped page text is untrusted,
 * so it must never be placed inside double quotes where `$` and backticks are
 * still interpreted by the shell.
 *
 * @param {*} value - Value to quote; converted with String().
 * @returns {string} Shell-safe single-quoted argument.
 */
function shellQuote(value) {
  const escaped = String(value).replace(/'/g, "'\\''");
  return `'${escaped}'`;
}

/**
 * Append row to Google Sheet
 *
 * Values are emitted in header order, one shell argument per column. Missing
 * values become an explicit empty argument so later columns do not shift left.
 *
 * @param {string} sheetId - ID of the target Google Sheet.
 * @param {Object} rowData - Lead record keyed by the SHEET_COLUMNS names.
 * @returns {Promise<boolean>} true when the row was appended, false otherwise.
 */
async function appendToSheet(sheetId, rowData) {
  const values = SHEET_COLUMNS.map((column) => {
    const value = rowData[column];
    return shellQuote(value === null || value === undefined ? '' : value);
  }).join(' ');

  try {
    const output = gogCommand(`sheets append ${sheetId} "Sheet1!A:N" ${values}`);

    // gogCommand returns null on failure instead of throwing.
    if (output === null) {
      log('❌ Error appending to sheet: gog command failed');
      return false;
    }

    log(`✅ Added: ${rowData.ownerName}`);
    return true;
  } catch (error) {
    log(`❌ Error appending to sheet: ${error.message}`);
    return false;
  }
}
|
|
|
|
// Selector lists are tried in order; the first one yielding a usable value wins.
const EMAIL_SELECTORS = [
  'a[href^="mailto:"]',
  '[data-test*="email"]',
  '.email-address',
  '.owner-email'
];

const NAME_SELECTORS = [
  '[data-person-id="people-contact-phone-1"]',
  '[data-person-id="people-contact-phone-2"]',
  '[data-person-id="people-contact-phone-3"]',
  '.owner-name',
  'h1', '.h2', 'h3'
];

const PHONE_SELECTORS = [
  'a[href^="tel:"]',
  '[data-test*="phone"]',
  '.phone-number',
  '.owner-phone'
];

const EMAIL_PATTERN = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/;

// US-style phone numbers. The second pattern tolerates repeated separators
// after the area code but only at the start of the text.
const PHONE_PATTERNS = [
  /\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/,
  /^\(?\d{3}\)?[-.\s]*\d{3}[-.\s]?\d{4}/
];

/**
 * Return the first non-empty value produced by `read` for the given selectors.
 *
 * Waits once (up to `timeout` ms) for any of the selectors to appear, then
 * queries each selector without waiting. A timeout is an expected outcome
 * ("this page has no such field") and must not abort the extraction.
 *
 * @param {Object} page - Puppeteer page.
 * @param {string[]} selectors - CSS selectors in priority order.
 * @param {function(Object): Promise<string>} read - Maps an element handle to
 *   a value, or '' when the element holds nothing usable.
 * @param {number} [timeout=5000] - Maximum wait for the first match, in ms.
 * @returns {Promise<string>} First usable value, or '' when none was found.
 */
async function findFirstValue(page, selectors, read, timeout = 5000) {
  try {
    await page.waitForSelector(selectors.join(', '), { timeout });
  } catch (error) {
    // Nothing rendered in time; the direct queries below simply return null.
  }

  for (const selector of selectors) {
    const element = await page.$(selector);
    if (!element) continue;

    const value = await read(element);
    if (value) return value;
  }

  return '';
}

/** Read the href attribute and text content of an element handle. */
function readHrefAndText(element) {
  // The callback runs inside the browser, so it may only use its own argument.
  return element.evaluate((node) => ({
    href: node.getAttribute('href'),
    text: node.textContent
  }));
}

/** Extract an email address from a mailto: link or from the element text. */
async function readEmail(element) {
  const { href, text } = await readHrefAndText(element);

  if (href && href.startsWith('mailto:')) {
    // Drop any "?subject=..." suffix.
    return href.slice('mailto:'.length).split('?')[0].trim();
  }

  const match = (text || '').match(EMAIL_PATTERN);
  return match ? match[0] : '';
}

/** Read an owner name: trimmed text content longer than two characters. */
async function readOwnerName(element) {
  const text = await element.evaluate((node) => node.textContent);
  const name = (text || '').trim();
  return name.length > 2 ? name : '';
}

/** Extract a phone number from the element text, falling back to its tel: link. */
async function readPhone(element) {
  const { href, text } = await readHrefAndText(element);

  for (const candidate of [text, href]) {
    const cleaned = (candidate || '').replace(/^tel:/i, '').trim();
    if (!cleaned) continue;

    for (const pattern of PHONE_PATTERNS) {
      const match = cleaned.match(pattern);
      if (match) {
        // Normalise dash/dot separators to spaces.
        return match[0].replace(/[-.]/g, ' ').trim();
      }
    }
  }

  return '';
}

/** Scan the page's visible text for an address, property type and square footage. */
function readPropertyDetails(page) {
  // Runs in the browser context: patterns are declared inside the callback and
  // results are returned, because Node-side variables are not reachable here.
  return page.evaluate(() => {
    const text = document.body.innerText;
    const firstMatch = (pattern) => {
      const match = text.match(pattern);
      return match ? match[0] : '';
    };

    return {
      propertyAddress: firstMatch(/\d+\s+[A-Z][a-z]+,\s*[A-Z]{2}\s*\d{5}/),
      propertyType: firstMatch(/(General Industrial|Office|Retail|Multifamily|Warehouse|Mixed Use|Apartment|Hotel|Motel|Hospital|School|Health Care|Other)/i),
      squareFootage: firstMatch(/(\d+\.?\d*k\s*SF|k\s*\s*sq\s*ft)/i)
    };
  });
}

/**
 * Extract ANY data from page (simple, robust approach)
 *
 * Reads whatever is available on the page that `page` currently shows. Each
 * field is extracted independently, so a missing or failing field never
 * prevents the others from being captured.
 *
 * @param {Object} page - Puppeteer page already showing the owner/property.
 * @param {string} url - URL recorded as both property URL and owner URL.
 * @returns {Promise<Object>} Lead record; keys follow the sheet's column order
 *   and every field that could not be found is ''.
 */
async function extractAnyAvailableData(page, url) {
  // Key order mirrors the sheet header row.
  const data = {
    scrapeDate: new Date().toISOString().split('T')[0],
    ownerName: '',
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    propertyType: '',
    squareFootage: '',
    ownerLocation: '',
    propertyCount: '',
    propertyUrl: url,
    ownerUrl: url,
    email: '',
    phone: ''
  };

  // Run one extraction step; log and continue when it fails.
  const attempt = async (label, step) => {
    try {
      await step();
    } catch (error) {
      log(`⚠️ Error extracting ${label}: ${error.message}`);
    }
  };

  // Method 1: Try to find ANY email address
  await attempt('email', async () => {
    data.email = await findFirstValue(page, EMAIL_SELECTORS, readEmail);
    if (data.email) log(`📧 Email found: ${data.email}`);
  });

  // Method 2: Try to find owner name
  await attempt('owner name', async () => {
    data.ownerName = await findFirstValue(page, NAME_SELECTORS, readOwnerName);
    if (data.ownerName) log(`👤 Owner name: ${data.ownerName}`);
  });

  // Method 3: Try to find phone
  await attempt('phone', async () => {
    data.phone = await findFirstValue(page, PHONE_SELECTORS, readPhone);
    if (data.phone) log(`📞 Phone found: ${data.phone}`);
  });

  // Method 4: Try to extract property details
  await attempt('property details', async () => {
    const details = await readPropertyDetails(page);
    data.propertyAddress = details.propertyAddress;
    data.propertyType = details.propertyType;
    data.squareFootage = details.squareFootage;
  });

  return data;
}
|
|
|
|
/**
 * Main scraper function
 *
 * Logs into Reonomy, searches SEARCH_LOCATION, visits up to MAX_LEADS owner
 * pages, extracts whatever data each page exposes and appends the leads to the
 * Google Sheet.
 *
 * The browser is always closed before this function settles. Errors are
 * logged (with a best-effort screenshot) and re-thrown so the caller decides
 * the process exit code.
 *
 * @returns {Promise<{sheetId: string, leadCount: number}>} Target sheet ID and
 *   number of leads collected.
 * @throws {Error} When login, navigation, search or sheet setup fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Lead Scraper (Simple Mode)...\n');

  const browser = await puppeteer.launch({
    headless: process.env.HEADLESS === 'true' ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });

  const leads = [];
  let page;

  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Get or create sheet
    const sheetId = await getOrCreateSheet();
    await initializeSheet(sheetId);

    // Step 2: Login
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    await sleep(2000);

    // Fill credentials
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });

    // Submit login
    await page.click('button[type="submit"]');
    log('⏳ Logging in...');

    // Wait for redirect
    await sleep(8000);

    // Check if logged in
    const currentUrl = page.url();
    if (currentUrl.includes('login') || currentUrl.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Step 3: Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto(`https://app.reonomy.com/#!/search`, {
      waitUntil: 'networkidle2',
      timeout: 30000
    });

    log('✅ On search page');

    // Step 4: Search
    log(`\n📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);

    const searchInput = await page.waitForSelector('input[placeholder*="address"], input[placeholder*="location"], input[placeholder*="Search"]', {
      timeout: 10000
    });

    if (searchInput) {
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await searchInput.press('Enter');
      log('⏳ Searching...');

      // Wait for results
      await sleep(5000);
    }

    // Step 5: Find owner links
    log('\n📍 Step 4: Finding owner links...');
    const ownerLinks = await page.evaluate((maxLeads) => {
      const seen = new Set();
      const links = [];

      document.querySelectorAll('a[href*="/person/"]').forEach((link) => {
        const href = link.getAttribute('href');
        // The same owner is often linked several times; keep each URL once.
        if (!href || seen.has(href)) return;
        seen.add(href);
        links.push({
          ownerUrl: href,
          ownerId: href.split('/').pop()
        });
      });

      return links.slice(0, maxLeads);
    }, MAX_LEADS);

    log(`👤 Found ${ownerLinks.length} owner links`);

    // Step 6: Extract data from owner pages
    log('\n📍 Step 5: Extracting data from owner pages (email, phone)...');

    // hrefs may be relative to the results page; resolve them against it once.
    const resultsUrl = page.url();
    const ownersToVisit = ownerLinks.slice(0, MAX_LEADS);

    for (const [index, { ownerUrl }] of ownersToVisit.entries()) {
      const targetUrl = new URL(ownerUrl, resultsUrl).href;
      log(`\n[${index + 1}/${ownerLinks.length}] Visiting owner: ${targetUrl}`);

      try {
        // Actually open the owner page; otherwise every extraction would read
        // the search results page again.
        await page.goto(targetUrl, { waitUntil: 'networkidle2', timeout: 30000 });

        const data = await extractAnyAvailableData(page, targetUrl);

        // Ensure we have at least some data
        if (data.ownerName || data.email || data.phone || data.propertyAddress) {
          leads.push(data);
          log(` ✅ Collected: ${data.ownerName || data.email || 'Owner info'} - ${data.phone || 'Contact info'}`);
        } else {
          log(` ⚠️ No contact info found for owner`);
        }
      } catch (error) {
        // One broken owner page should not discard the leads already collected.
        log(` ⚠️ Skipping owner after error: ${error.message}`);
      }
    }

    log(`\n✅ Found ${leads.length} total leads`);

    // Step 7: Save leads
    log('\n📍 Step 6: Saving leads to Google Sheet...');

    for (const lead of leads) {
      const success = await appendToSheet(sheetId, lead);
      if (!success) {
        log(` ❌ Failed to save lead: ${lead.ownerName}`);
      }

      await sleep(500);
    }

    log(`\n✅ Scraping complete!`);
    log(`📊 Google Sheet: https://docs.google.com/spreadsheets/d/${sheetId}`);
    log(`📝 Log file: ${LOG_FILE}`);

    return { sheetId, leadCount: leads.length };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Save error screenshot (best-effort: it must not mask the real error)
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-simple-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-simple-error.png');
      } catch (screenshotError) {
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }

    // Propagate the failure so the caller can exit with a non-zero status.
    throw error;
  } finally {
    // Runs on success and on failure, so the browser never outlives the scrape.
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
|
|
|
|
// Run scraper
// Exit explicitly once the scrape settles: 0 on success, 1 on any failure.
scrapeLeads()
  .then((result) => {
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n📊 View your leads at: https://docs.google.com/spreadsheets/d/${result.sheetId}`);
    process.exit(0);
  })
  .catch((error) => {
    // Use log() so the failure is also recorded in the log file.
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  });
|