clawdbot-workspace/reonomy-scraper-v9-owner-tab.js

389 lines
12 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v9 - OWNER TAB EXTRACTION
*
* Key insight: Page has 3 tabs - Owner, Building & Lot, Occupants
* Owner tab is default view with contact info
* No "View Contact" button needed - data is visible by default
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration
/**
 * Read a required setting from the environment.
 *
 * @param {string} name - Name of the environment variable.
 * @returns {string} The non-empty value of the variable.
 * @throws {Error} When the variable is unset or empty.
 */
function requireEnv(name) {
  const value = process.env[name];
  if (!value) {
    throw new Error(`Missing required environment variable: ${name}`);
  }
  return value;
}
// SECURITY: credentials are read from the environment only. Account secrets
// must never be committed as source-code fallbacks; any credential that was
// previously hard-coded here should be treated as compromised and rotated.
const REONOMY_EMAIL = requireEnv('REONOMY_EMAIL');
const REONOMY_PASSWORD = requireEnv('REONOMY_PASSWORD');
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// Headless only when HEADLESS is exactly the string 'true'.
const HEADLESS = process.env.HEADLESS === 'true';
// Upper bound on the number of property pages visited per run.
const MAX_PROPERTIES = 20;
// Output files
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v9-owner-tab.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v9.log');
/**
 * Echo a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * @param {*} message - Text to report; it is interpolated into the log line.
 */
function log(message) {
  const isoNow = new Date().toISOString();
  // The file entry carries the ISO timestamp; the console output does not.
  const entry = `[${isoNow}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, entry);
}
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<undefined>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Extract ALL data from Owner tab
 *
 * Reads the property id from the page URL, the address from the first
 * matching heading, square footage / property type / e-mail addresses /
 * owner names from the visible body text, and e-mail addresses / phone
 * numbers from `mailto:` and `tel:` links.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; a Puppeteer `Page`
 *   in this script.
 * @returns {Promise<object>} Record with the string fields `propertyId`,
 *   `propertyAddress`, `city`, `state`, `zip`, `squareFootage`,
 *   `propertyType`, `pageTitle`, `bodyTextSample` and the string arrays
 *   `emails`, `phones`, `ownerNames`. Fields that cannot be found keep their
 *   empty defaults.
 */
async function extractOwnerTabData(page) {
  // The callback is handed to page.evaluate and runs in the page, so it must
  // be self-contained: every helper it needs is declared inside it.
  return page.evaluate(() => {
    const info = {
      propertyId: '',
      propertyAddress: '',
      city: '',
      state: '',
      zip: '',
      squareFootage: '',
      propertyType: '',
      emails: [],
      phones: [],
      ownerNames: [],
      pageTitle: document.title,
      bodyTextSample: '',
    };

    // Append a value only if it is not already present (keeps first-seen order).
    const addUnique = (list, value) => {
      if (!list.includes(value)) {
        list.push(value);
      }
    };

    // Extract property ID from URL
    const propIdMatch = window.location.href.match(/property\/([a-f0-9-]+)/);
    if (propIdMatch) {
      info.propertyId = propIdMatch[1];
    }

    // Extract property address from h1, h2, h3
    for (const sel of ['h1', 'h2', 'h3']) {
      const heading = document.querySelector(sel);
      if (!heading) {
        continue;
      }
      const text = heading.textContent.trim();
      // Groups: 1 = street, 2 = city, 3 = state, 4 = zip.
      const addressMatch = text.match(/^(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
      if (addressMatch) {
        info.propertyAddress = addressMatch[0];
        info.city = addressMatch[2].trim();
        info.state = addressMatch[3].trim();
        info.zip = addressMatch[4].trim();
        break;
      }
    }

    // Extract property details (SF, type)
    const bodyText = document.body.innerText;

    // Square footage
    const sfMatch = bodyText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
    if (sfMatch) {
      info.squareFootage = sfMatch[0];
    }

    // Property type: first listed label found in the text wins, so the more
    // specific 'General Industrial' must be tested before 'Industrial'.
    const typePatterns = [
      'Warehouse',
      'Office Building',
      'Retail Stores',
      'General Industrial',
      'Industrial',
      'Medical Building',
      'School',
      'Religious',
      'Supermarket',
      'Financial Building',
    ];
    const foundType = typePatterns.find((type) => bodyText.includes(type));
    if (foundType) {
      info.propertyType = foundType;
    }

    // Extract emails from mailto: links (drop any "?subject=..." query part)
    document.querySelectorAll('a[href^="mailto:"]').forEach((a) => {
      const email = a.href.replace('mailto:', '').split('?')[0];
      if (email && email.length > 5) {
        addUnique(info.emails, email);
      }
    });

    // Also try email patterns in text
    const emailMatches = bodyText.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) || [];
    emailMatches.forEach((email) => addUnique(info.emails, email));

    // Extract phones from tel: links
    document.querySelectorAll('a[href^="tel:"]').forEach((a) => {
      const phone = a.href.replace('tel:', '');
      if (phone && phone.length >= 10) {
        addUnique(info.phones, phone);
      }
    });

    // Extract owner names from Owner tab section.
    // A name is one or more capitalized words on a single line, optionally
    // followed by a company suffix; it is captured as group 1. Words are
    // joined with [ \t]+ so a name never swallows the following line.
    const ownerPatterns = [
      /Owner:\s*([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))?)/g,
      /[Oo]wns\s+\d+\s+[Pp]ropert(?:y|ies)\s*([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))?)/g,
    ];
    for (const pattern of ownerPatterns) {
      // matchAll exposes the capture group for every occurrence, so only the
      // name itself (not the "Owner:" / "Owns N properties" prefix) is stored.
      for (const match of bodyText.matchAll(pattern)) {
        const owner = match[1];
        if (owner.length > 3) {
          addUnique(info.ownerNames, owner);
        }
      }
    }

    // Save sample for debugging
    info.bodyTextSample = bodyText.substring(0, 500);
    return info;
  });
}
/**
 * Extract property IDs from search results
 *
 * Collects every anchor whose href contains "/property/" and returns one
 * entry per distinct property id, in first-seen order. Several anchors
 * pointing at the same property therefore yield a single entry, so the
 * caller does not visit the same property twice.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; a Puppeteer `Page`
 *   in this script.
 * @returns {Promise<Array<{id: string, url: string}>>} Distinct property ids
 *   with the href they were first found in.
 */
async function extractPropertyIds(page) {
  // The callback runs inside the page, so it only uses names it declares itself.
  return page.evaluate(() => {
    const byId = new Map();
    for (const link of document.querySelectorAll('a[href*="/property/"]')) {
      const url = link.href;
      const match = url.match(/property\/([a-f0-9-]+)/);
      if (match && !byId.has(match[1])) {
        byId.set(match[1], { id: match[1], url });
      }
    }
    return [...byId.values()];
  });
}
/**
 * Main scraper
 *
 * Launches a browser, logs in, runs a location search, collects property ids
 * from the results, visits up to MAX_PROPERTIES of them, extracts Owner-tab
 * data from each, and writes all leads to OUTPUT_FILE as JSON.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   collected and the path of the output file.
 * @throws {Error} Any failure during the run is logged, a screenshot is
 *   attempted, and the original error is rethrown. The browser is always
 *   closed.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v9 (OWNER TAB EXTRACTION)...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080'],
  });
  const page = await browser.newPage();
  await page.setViewport({ width: 1920, height: 1080 });
  const leads = [];
  try {
    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });
    await sleep(2000);
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await sleep(10000);

    // Check if logged in: still being on a login/auth URL is treated as failure.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    await sleep(3000);

    // Perform search
    log(`📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);
    // Prefer a recognisable search box; fall back to any text input.
    const searchInput = await page
      .waitForSelector('input[placeholder*="address"], input[placeholder*="Search"]', {
        timeout: 10000,
      })
      .catch(() => page.waitForSelector('input[type="text"]', { timeout: 5000 }));
    if (searchInput) {
      // Triple-click selects any existing text so typing replaces it.
      await searchInput.click({ clickCount: 3 });
      await searchInput.type(SEARCH_LOCATION, { delay: 100 });
      await sleep(1000);
      await page.keyboard.press('Enter');
      log('⏳ Searching...');
      await sleep(5000);
    }

    // Extract search ID from URL
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs
    log('\n📍 Step 4: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found.');
      throw new Error('No properties found on search page.');
    }

    // Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 5: Processing ${propertiesToScrape.length} properties...`);
    for (let i = 0; i < propertiesToScrape.length; i++) {
      const prop = propertiesToScrape[i];
      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      // Click on property button (navigate to it)
      log(` 🔗 Clicking property...`);
      // page.evaluate (not evaluateHandle) is required here: it returns the
      // callback's value, whereas evaluateHandle returns a JSHandle whose
      // properties cannot be read directly, which made the click result
      // always look like a failure.
      let clicked = false;
      try {
        clicked = await page.evaluate((propertyId) => {
          const buttons = Array.from(document.querySelectorAll('button'));
          const target = buttons.find((b) => {
            const link = b.querySelector('a[href*="/property/"]');
            return link && link.href.includes(propertyId);
          });
          if (!target) {
            return false;
          }
          target.scrollIntoView({ behavior: 'smooth', block: 'center' });
          target.click();
          return true;
        }, prop.id);
      } catch {
        // Any evaluation failure is treated as "not clicked"; the direct
        // navigation below is the fallback.
        clicked = false;
      }
      if (!clicked) {
        log(` ⚠️ Could not click property, trying to navigate directly...`);
        await page.goto(prop.url, {
          waitUntil: 'networkidle2',
          timeout: 30000,
        });
      }

      // Wait for property page to load with Owner tab
      log(` ⏳ Waiting for Owner tab to load...`);
      await sleep(8000);

      // Extract data from Owner tab
      log(` 📊 Extracting data from Owner tab...`);
      const propertyData = await extractOwnerTabData(page);
      log(` 📧 Emails: ${propertyData.emails.length} found`);
      log(` 📞 Phones: ${propertyData.phones.length} found`);
      log(` 👤 Owners: ${propertyData.ownerNames.length} found`);
      log(` 🏢 Address: ${propertyData.propertyAddress || 'N/A'}`);
      const lead = {
        scrapeDate: new Date().toISOString().split('T')[0],
        propertyId: propertyData.propertyId,
        propertyUrl: propertyData.pageTitle?.includes('property')
          ? `https://app.reonomy.com/#!/property/${propertyData.propertyId}`
          : page.url(),
        address: propertyData.propertyAddress || '',
        city: propertyData.city || '',
        state: propertyData.state || '',
        zip: propertyData.zip || '',
        squareFootage: propertyData.squareFootage || '',
        propertyType: propertyData.propertyType || '',
        ownerNames: propertyData.ownerNames.join('; ') || '',
        emails: propertyData.emails,
        phones: propertyData.phones,
        searchLocation: SEARCH_LOCATION,
        searchId: searchId,
      };
      leads.push(lead);

      // Go back to search results for next property
      log(` 🔙 Going back to search results...`);
      await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
        waitUntil: 'networkidle2',
        timeout: 30000,
      });
      await sleep(3000);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      const outputData = {
        scrapeDate: new Date().toISOString(),
        location: SEARCH_LOCATION,
        searchId: searchId,
        leadCount: leads.length,
        leads: leads,
      };
      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // The screenshot is best-effort: its failure must not mask the original
    // error, but it is reported instead of being silently discarded.
    try {
      await page.screenshot({ path: '/tmp/reonomy-v9-error.png', fullPage: true });
      log('📸 Error screenshot saved: /tmp/reonomy-v9-error.png');
    } catch (screenshotError) {
      log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
    }
    throw error;
  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Run
// Script entry point: execute the scraper once, report the outcome, and exit
// with status 0 on success or 1 on any failure.
(async () => {
  try {
    const { leadCount, outputFile } = await scrapeLeads();
    log(`\n🎉 Success! ${leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${outputFile}`);
    process.exit(0);
  } catch (failure) {
    log(`\n💥 Scraper failed: ${failure.message}`);
    process.exit(1);
  }
})();