clawdbot-workspace/reonomy-simple-scraper-v2.js

443 lines
13 KiB
JavaScript

#!/usr/bin/env node
/**
* Simple Reonomy Lead Scraper - v2
*
* Focus: Capture ANY available data without getting stuck on empty email/phone fields
*/
const puppeteer = require('puppeteer');
const { execSync } = require('child_process');
const fs = require('fs');
// Configuration
// Credentials are read from the environment only. No fallback values are
// embedded here, so secrets never live in source control and a missing
// variable is reported by the credential validation that follows instead of
// being silently masked by a default.
// NOTE(review): any credentials that were previously committed in this file
// should be considered exposed and rotated.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
// Free-text location typed into the Reonomy search box.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'New York, NY';
const MAX_LEADS = 2; // Just scrape 2 owners as user requested
// Validate credentials
// Abort before any browser work starts unless both credentials are set:
// print usage guidance on stderr and exit with a non-zero status.
if (!(REONOMY_EMAIL && REONOMY_PASSWORD)) {
  const usageLines = [
    '❌ Error: REONOMY_EMAIL and REONOMY_PASSWORD environment variables are required.',
    ' Set them like:',
    ` REONOMY_EMAIL="your@email.com"`,
    ` REONOMY_PASSWORD="yourpassword"`,
    ' Or run: REONOMY_EMAIL="your@email.com" REONOMY_PASSWORD="yourpassword" node reonomy-scraper.js'
  ];
  for (const line of usageLines) {
    console.error(line);
  }
  process.exit(1);
}
// Log file
// REONOMY_LOG_FILE overrides the destination; the original path stays as the
// default so existing setups keep writing to the same place.
const LOG_FILE = process.env.REONOMY_LOG_FILE || '/Users/jakeshore/.clawdbot/workspace/reonomy-simple.log';
// Ensures the "cannot write log file" warning is printed once, not per call.
let logFileWarningShown = false;

/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * File logging is best-effort: when the log file cannot be written (for
 * example its directory does not exist on this machine) a single warning is
 * printed to stderr and the scraper keeps running. Without this, a failing
 * write would throw out of every log() call, including the ones made while
 * reporting an earlier error.
 *
 * @param {string} message - Text to log.
 */
function log(message) {
  const timestamp = new Date().toISOString();
  const logMessage = `[${timestamp}] ${message}\n`;
  console.log(message);
  try {
    fs.appendFileSync(LOG_FILE, logMessage);
  } catch (error) {
    if (!logFileWarningShown) {
      logFileWarningShown = true;
      console.error(`⚠️ Could not write to log file ${LOG_FILE}: ${error.message}`);
    }
  }
}
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Run the `gog` CLI with the given argument string.
 *
 * The argument string is appended verbatim to `gog ` and handed to a shell,
 * so callers are responsible for quoting anything they interpolate.
 *
 * @param {string} command - Arguments to pass after `gog`.
 * @returns {string|null} Trimmed stdout on success, or null when the command
 *   fails or exceeds the 30 second timeout (the failure is logged).
 */
function gogCommand(command) {
  const commandLine = `gog ${command}`;
  const execOptions = { encoding: 'utf-8', timeout: 30000 };
  try {
    const stdout = execSync(commandLine, execOptions);
    return stdout.trim();
  } catch (failure) {
    log(`⚠️ gog command failed: ${failure.message}`);
    return null;
  }
}
/**
 * Get or create Google Sheet
 *
 * Uses the sheet named by the REONOMY_SHEET_ID environment variable when it is
 * set; otherwise asks the `gog` CLI to create a sheet called "Reonomy Leads"
 * and extracts the new sheet's ID from the CLI output.
 *
 * @returns {Promise<string>} The spreadsheet ID to write leads to.
 * @throws {Error} When gog produces no output, or no sheet ID can be found in
 *   its output.
 */
async function getOrCreateSheet() {
  log('📊 Checking Google Sheets...');
  const SHEET_ID = process.env.REONOMY_SHEET_ID;
  if (SHEET_ID) {
    log(`✅ Using existing sheet: ${SHEET_ID}`);
    return SHEET_ID;
  }

  // Create a new sheet
  log('📝 Creating new Google Sheet...');
  const output = gogCommand(`sheets create "Reonomy Leads" --json`);

  // gogCommand reports failure by returning null; fail with a clear message
  // instead of letting a later property access throw an unrelated TypeError.
  if (output === null || output === undefined || output === '') {
    throw new Error('Could not create Google Sheet: gog returned no output');
  }

  let parsed;
  try {
    parsed = JSON.parse(output);
  } catch (error) {
    log(`⚠️ Could not parse gog output as JSON: ${error.message}`);
    // Try to extract ID from text output
    const match = output.match(/[0-9A-Za-z_-]{20,}/);
    if (match) {
      log(`✅ Extracted sheet ID from output: ${match[0]}`);
      return match[0];
    }
    throw new Error('Could not parse sheet ID from gog output');
  }

  // Valid JSON that carries no ID must not be reported as a created sheet.
  const newSheetId = parsed && typeof parsed === 'object'
    ? (parsed.spreadsheetId || parsed.id)
    : undefined;
  if (!newSheetId) {
    throw new Error('Could not parse sheet ID from gog output');
  }
  log(`✅ Created new sheet: ${newSheetId}`);
  return newSheetId;
}
/**
 * Initialize sheet with headers
 *
 * Writes the 14 column headers to row 1 of Sheet1 through the `gog` CLI.
 * Failures are logged and otherwise ignored so that scraping can continue.
 *
 * @param {string} sheetId - Spreadsheet ID to update.
 * @returns {Promise<void>}
 */
async function initializeSheet(sheetId) {
  log('📋 Initializing sheet headers...');
  const headers = [
    'Scrape Date', 'Owner Name', 'Property Address', 'City', 'State', 'ZIP',
    'Property Type', 'Square Footage', 'Owner Location', 'Property Count',
    'Property URL', 'Owner URL', 'Email', 'Phone'
  ];
  const headerString = headers.map(h => `"${h}"`).join(' ');
  try {
    const result = gogCommand(`sheets update ${sheetId} "Sheet1!A1" ${headerString}`);
    // gogCommand signals failure by returning null rather than throwing, so
    // the return value has to be checked before claiming success.
    if (result === null) {
      log('⚠️ Could not set headers: gog command failed');
      return;
    }
    log('✅ Sheet headers initialized');
  } catch (error) {
    log(`⚠️ Could not set headers: ${error.message}`);
  }
}
/**
 * Lead fields in the order of the sheet columns A..N. This mirrors the header
 * row written by initializeSheet, so values line up with their headers no
 * matter how the lead object's keys happen to be ordered.
 */
const SHEET_COLUMN_KEYS = [
  'scrapeDate', 'ownerName', 'propertyAddress', 'city', 'state', 'zip',
  'propertyType', 'squareFootage', 'ownerLocation', 'propertyCount',
  'propertyUrl', 'ownerUrl', 'email', 'phone'
];

/**
 * Quote a value as exactly one POSIX shell argument.
 *
 * Single quotes disable every shell expansion; an embedded single quote is
 * written as '\'' (close quote, escaped quote, reopen). Null and undefined
 * become an empty argument so the following columns do not shift left.
 *
 * @param {*} value - Cell value.
 * @returns {string} Shell-safe, single-quoted argument.
 */
function quoteShellArg(value) {
  const text = value === null || value === undefined ? '' : String(value);
  // NUL bytes cannot appear in a command-line argument; drop them.
  const cleaned = text.split('\0').join('');
  return `'${cleaned.split("'").join("'\\''")}'`;
}

/**
 * Append row to Google Sheet
 *
 * Scraped values are untrusted page content, so each one is shell-quoted
 * before being interpolated into the `gog` command line.
 *
 * @param {string} sheetId - Spreadsheet ID to append to.
 * @param {Object} rowData - Lead record; the fields listed in
 *   SHEET_COLUMN_KEYS are written, missing ones as empty cells.
 * @returns {Promise<boolean>} true when the row was appended, false otherwise.
 */
async function appendToSheet(sheetId, rowData) {
  const values = SHEET_COLUMN_KEYS
    .map(key => quoteShellArg(rowData[key]))
    .join(' ');
  try {
    const result = gogCommand(`sheets append ${sheetId} "Sheet1!A:N" ${values}`);
    // gogCommand returns null on failure instead of throwing.
    if (result === null) {
      log('❌ Error appending to sheet: gog command failed');
      return false;
    }
    log(`✅ Added: ${rowData.ownerName}`);
    return true;
  } catch (error) {
    log(`❌ Error appending to sheet: ${error.message}`);
    return false;
  }
}
/**
 * Derive an e-mail address from a candidate element.
 * Prefers a `mailto:` href (dropping any `?subject=...` part) and falls back
 * to the first e-mail-looking token in the element text.
 *
 * @param {{href: (string|null), text: (string|null)}} candidate
 * @returns {string} The address, or '' when none is found.
 */
function parseEmailCandidate(candidate) {
  const href = candidate.href || '';
  if (href.startsWith('mailto:')) {
    const address = href.slice('mailto:'.length).split('?')[0].trim();
    if (address) return address;
  }
  const match = String(candidate.text || '').match(/[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/i);
  return match ? match[0] : '';
}

/**
 * Derive an owner name from a candidate element: its trimmed text, accepted
 * only when longer than two characters.
 *
 * @param {{href: (string|null), text: (string|null)}} candidate
 * @returns {string} The name, or '' when the text is too short.
 */
function parseOwnerNameCandidate(candidate) {
  const name = String(candidate.text || '').trim();
  return name.length > 2 ? name : '';
}

/**
 * Find the first US-style 10-digit phone number in a string and normalise it
 * to "AAA EEE NNNN", prefixed with "+1 " when a country code was present.
 *
 * @param {string} text - Free text or a `tel:` href.
 * @returns {string} The normalised number, or '' when none is found.
 */
function normalizePhoneText(text) {
  const match = String(text || '').match(
    /(\+?1[-.\s]?)?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})(?!\d)/
  );
  if (!match) return '';
  const national = `${match[2]} ${match[3]} ${match[4]}`;
  return match[1] ? `+1 ${national}` : national;
}

/**
 * Derive a phone number from a candidate element: visible text first, then
 * the href (for `tel:` links whose text is empty or non-numeric).
 *
 * @param {{href: (string|null), text: (string|null)}} candidate
 * @returns {string} The normalised number, or ''.
 */
function parsePhoneCandidate(candidate) {
  return normalizePhoneText(candidate.text) || normalizePhoneText(candidate.href);
}

/**
 * Pull address, property type and square footage out of the page's visible
 * text. Only fields that were actually found are present on the result.
 *
 * @param {string} pageText - Visible text of the page.
 * @returns {Object} Subset of { propertyAddress, city, state, zip,
 *   propertyType, squareFootage }.
 */
function parsePropertyDetails(pageText) {
  const text = String(pageText || '');
  const details = {};

  // "<number> <street>, [<city>,] <ST> <zip>"; the city part is optional so
  // the short "123 Main, NY 10001" form still matches.
  const address = text.match(
    /\d+\s+[A-Za-z0-9 .'#-]+?,\s*(?:([A-Za-z .'-]+?),\s*)?([A-Z]{2})\s*(\d{5})(?:-\d{4})?/
  );
  if (address) {
    details.propertyAddress = address[0];
    if (address[1]) details.city = address[1].trim();
    details.state = address[2];
    details.zip = address[3];
  }

  // Word boundaries keep "Other" from matching inside words like "Another".
  const type = text.match(
    /\b(General Industrial|Office|Retail|Multifamily|Warehouse|Mixed Use|Apartment|Hotel|Motel|Hospital|School|Health Care|Other)\b/i
  );
  if (type) {
    details.propertyType = type[0];
  }

  // A number is required, so a bare "k sq ft" is never reported as a size.
  const squareFootage = text.match(/\d[\d,]*(?:\.\d+)?\s*k?\s*(?:SF|sq\.?\s*ft)\b/i);
  if (squareFootage) {
    details.squareFootage = squareFootage[0];
  }
  return details;
}

/**
 * Try a list of selectors in priority order and return the first value the
 * parser accepts.
 *
 * The page gets one bounded wait (5 s) for any of the selectors to appear;
 * a timeout simply means "not on this page" and yields ''. Other errors are
 * rethrown to the caller.
 *
 * @param {Object} page - Puppeteer page.
 * @param {string[]} selectors - CSS selectors, most specific first.
 * @param {function({href: (string|null), text: (string|null)}): string} parse
 * @returns {Promise<string>} The parsed value, or ''.
 */
async function readFirstMatchingElement(page, selectors, parse) {
  try {
    await page.waitForSelector(selectors.join(', '), { timeout: 5000 });
  } catch (error) {
    if (error && error.name === 'TimeoutError') return '';
    throw error;
  }
  for (const selector of selectors) {
    const handle = await page.$(selector);
    if (!handle) continue;
    // The callback runs inside the browser, so it may only use its own
    // argument - nothing from this Node-side scope is visible there.
    const candidate = await handle.evaluate(node => ({
      href: node.getAttribute('href'),
      text: node.textContent
    }));
    const value = parse(candidate);
    if (value) return value;
  }
  return '';
}

/**
 * Extract ANY data from page (simple, robust approach)
 *
 * Reads whatever owner/contact/property details are present on the page that
 * is currently loaded in `page`. This function does not navigate; the caller
 * must already have opened the page. Every step is independent: a missing
 * field or a failing step never prevents the remaining steps from running.
 *
 * @param {Object} page - Puppeteer page showing the owner record.
 * @param {string} url - URL of that page, stored with the lead.
 * @returns {Promise<Object>} Lead record with one key per sheet column (in
 *   header order); fields that were not found are ''.
 */
async function extractAnyAvailableData(page, url) {
  const data = {
    scrapeDate: new Date().toISOString().split('T')[0],
    ownerName: '',
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    propertyType: '',
    squareFootage: '',
    ownerLocation: '',
    propertyCount: '',
    // NOTE(review): only one URL is passed in, so both URL columns carry it -
    // confirm whether a separate property URL should be captured.
    propertyUrl: url,
    ownerUrl: url,
    email: '',
    phone: ''
  };

  const probes = [
    {
      // Method 1: Try to find ANY email address
      field: 'email',
      label: '📧 Email found',
      selectors: ['a[href^="mailto:"]', '[data-test*="email"]', '.email-address', '.owner-email'],
      parse: parseEmailCandidate
    },
    {
      // Method 2: Try to find owner name
      field: 'ownerName',
      label: '👤 Owner name',
      selectors: [
        '[data-person-id="people-contact-phone-1"]',
        '[data-person-id="people-contact-phone-2"]',
        '[data-person-id="people-contact-phone-3"]',
        '.owner-name',
        'h1', '.h2', 'h3'
      ],
      parse: parseOwnerNameCandidate
    },
    {
      // Method 3: Try to find phone
      field: 'phone',
      label: '📞 Phone found',
      selectors: ['a[href^="tel:"]', '[data-test*="phone"]', '.phone-number', '.owner-phone'],
      parse: parsePhoneCandidate
    }
  ];

  for (const probe of probes) {
    try {
      const value = await readFirstMatchingElement(page, probe.selectors, probe.parse);
      if (value) {
        data[probe.field] = value;
        log(`${probe.label}: ${value}`);
      }
    } catch (error) {
      log(`⚠️ Error extracting ${probe.field}: ${error.message}`);
    }
  }

  // Method 4: Try to extract property details
  try {
    // Only the text crosses the browser boundary; parsing happens here.
    const pageText = await page.evaluate(() => document.body.innerText);
    Object.assign(data, parsePropertyDetails(pageText));
  } catch (error) {
    log(`⚠️ Error extracting data: ${error.message}`);
  }
  return data;
}
/**
 * Main scraper function
 *
 * Logs into Reonomy, searches SEARCH_LOCATION, visits up to MAX_LEADS owner
 * pages, extracts whatever contact data each one shows and appends the
 * resulting leads to the Google Sheet.
 *
 * The browser is always closed before this function settles. On failure the
 * error is logged, an error screenshot is attempted, and the error is
 * rethrown so the caller decides the process exit status.
 *
 * @returns {Promise<{sheetId: string, leadCount: number}>} Target sheet and
 *   number of leads collected.
 * @throws {Error} Any failure from sheet setup, login, search or navigation.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Lead Scraper (Simple Mode)...\n');
  const browser = await puppeteer.launch({
    headless: process.env.HEADLESS === 'true' ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  let page = null;
  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Get or create sheet
    const sheetId = await getOrCreateSheet();
    await initializeSheet(sheetId);

    // Step 2: Login
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    // Fill credentials
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    // Submit login
    await page.click('button[type="submit"]');
    log('⏳ Logging in...');
    // Wait for redirect
    await sleep(8000);
    // Check if logged in
    const currentUrl = page.url();
    if (currentUrl.includes('login') || currentUrl.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Step 3: Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto(`https://app.reonomy.com/#!/search`, {
      waitUntil: 'networkidle2',
      timeout: 30000
    });
    log('✅ On search page');

    // Step 4: Search
    log(`\n📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);
    const searchInput = await page.waitForSelector(
      'input[placeholder*="address"], input[placeholder*="location"], input[placeholder*="Search"]',
      { timeout: 10000 }
    );
    // Triple-click selects any existing text so typing replaces it.
    await searchInput.click({ clickCount: 3 });
    await searchInput.type(SEARCH_LOCATION, { delay: 100 });
    await searchInput.press('Enter');
    log('⏳ Searching...');
    // Wait for results
    await sleep(5000);

    // Step 5: Find owner links
    log('\n📍 Step 4: Finding owner links...');
    const ownerLinks = await page.evaluate((maxLeads) => {
      // Runs in the browser. The same owner is often linked more than once,
      // so de-duplicate by href before applying the limit.
      const seen = new Set();
      const links = [];
      for (const anchor of document.querySelectorAll('a[href*="/person/"]')) {
        if (links.length >= maxLeads) break;
        const href = anchor.getAttribute('href');
        if (!href || seen.has(href)) continue;
        seen.add(href);
        links.push({ ownerUrl: href, ownerId: href.split('/').pop() });
      }
      return links;
    }, MAX_LEADS);
    log(`👤 Found ${ownerLinks.length} owner links`);

    // Step 6: Extract data from owner pages
    log('\n📍 Step 5: Extracting data from owner pages (email, phone)...');
    const leads = [];
    // hrefs may be relative; resolve them against the results page they came from.
    const resultsUrl = page.url();
    for (const [index, link] of ownerLinks.entries()) {
      log(`\n[${index + 1}/${ownerLinks.length}] Visiting owner: ${link.ownerUrl}`);
      let data;
      try {
        const ownerUrl = new URL(link.ownerUrl, resultsUrl).href;
        // Actually open the owner page; extraction reads the current page.
        await page.goto(ownerUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        // Give the single-page app time to render the owner view.
        await sleep(3000);
        data = await extractAnyAvailableData(page, ownerUrl);
      } catch (error) {
        // One unreachable owner page must not abort the whole run.
        log(` ⚠️ Skipping owner: ${error.message}`);
        continue;
      }
      // Ensure we have at least some data
      if (data.ownerName || data.email || data.phone || data.propertyAddress) {
        leads.push(data);
        log(` ✅ Collected: ${data.ownerName || data.email || 'Owner info'} - ${data.phone || 'Contact info'}`);
      } else {
        log(` ⚠️ No contact info found for owner`);
      }
    }
    log(`\n✅ Found ${leads.length} total leads`);

    // Step 7: Save leads
    log('\n📍 Step 6: Saving leads to Google Sheet...');
    for (const lead of leads) {
      const success = await appendToSheet(sheetId, lead);
      if (!success) {
        log(` ❌ Failed to save lead: ${lead.ownerName}`);
      }
      await sleep(500);
    }
    log(`\n✅ Scraping complete!`);
    log(`📊 Google Sheet: https://docs.google.com/spreadsheets/d/${sheetId}`);
    log(`📝 Log file: ${LOG_FILE}`);
    return { sheetId, leadCount: leads.length };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // Save error screenshot (best-effort: a screenshot failure must not
    // replace the error that is being reported).
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-simple-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-simple-error.png');
      } catch (screenshotError) {
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    // Propagate so the caller reports failure with a non-zero exit status.
    throw error;
  } finally {
    // Runs on success and failure alike so Chromium is never left behind.
    try {
      await browser.close();
      log('\n🔚 Browser closed');
    } catch (closeError) {
      log(`⚠️ Could not close browser: ${closeError.message}`);
    }
  }
}
// Run scraper
// Entry point: report the outcome and exit explicitly (0 on success, 1 on
// failure) so that any handles still held open cannot keep the process alive.
scrapeLeads().then(result => {
  log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
  console.log(`\n📊 View your leads at: https://docs.google.com/spreadsheets/d/${result.sheetId}`);
  process.exit(0);
}).catch(error => {
  console.error(`\n💥 Scraper failed: ${error.message}`);
  process.exit(1);
});