clawdbot-workspace/reonomy-scraper-v11-simple.js

404 lines
12 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v11 Simple - PLAYWRIGHT VERSION (NO FILTERS)
*
* This is a simpler version to verify Playwright works.
* Filters removed for testing purposes.
*/
const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
// Configuration
// The first three settings read an environment variable and fall back to the literal on the
// right when the variable is unset or empty (`||` treats '' as missing).
// NOTE(review): the REONOMY_EMAIL / REONOMY_PASSWORD fallbacks look like real account
// credentials committed to source — presumably left in for local testing; confirm, and
// consider requiring the environment variables instead.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// True only when HEADLESS is exactly the string 'true'; unset or any other value yields false.
const HEADLESS = process.env.HEADLESS === 'true';
// Upper bound on how many of the property IDs found on the search page are visited per run.
const MAX_PROPERTIES = 20;
// Output files (both resolved relative to this script's directory)
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v11-simple.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v11-simple.log');
/**
 * Report a message on stdout and append a timestamped copy to LOG_FILE.
 *
 * The console receives the bare message; the log file receives
 * `[ISO-8601 timestamp] message` followed by a newline.
 *
 * @param {string} message - Text to report.
 */
function log(message) {
  // Build the file entry up front so the timestamp reflects when log() was called.
  const entry = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, entry);
}
/**
 * Extract ALL data from Owner tab using Playwright.
 *
 * Evaluates one function in the page that reads:
 *  - the property ID from the current URL,
 *  - the address parts from the first h1/h2/h3 whose text looks like
 *    "123 Main St, City, ST 12345",
 *  - square footage and property type from the rendered body text,
 *  - emails/phones from mailto:/tel: links and from text patterns,
 *  - owner names following "Owner:" or "Owns N properties".
 *
 * @param {object} page - Playwright page currently showing a property.
 * @returns {Promise<{propertyId: string, propertyAddress: string, city: string,
 *   state: string, zip: string, squareFootage: string, propertyType: string,
 *   emails: string[], phones: string[], ownerNames: string[]}>}
 *   Fields that could not be found stay '' or [].
 */
async function extractOwnerTabData(page) {
  // The callback is handed to page.evaluate, so it is kept fully self-contained:
  // it must not reference any variable from this file.
  return await page.evaluate(() => {
    const info = {
      propertyId: '',
      propertyAddress: '',
      city: '',
      state: '',
      zip: '',
      squareFootage: '',
      propertyType: '',
      emails: [],
      phones: [],
      ownerNames: []
    };

    // Append a value only once, preserving first-seen order.
    const addUnique = (list, value) => {
      if (value && !list.includes(value)) {
        list.push(value);
      }
    };

    // Extract property ID from URL
    const propIdMatch = window.location.href.match(/property\/([a-f0-9-]+)/);
    if (propIdMatch) {
      info.propertyId = propIdMatch[1];
    }

    // Extract property address from h1, h2, h3
    for (const sel of ['h1', 'h2', 'h3']) {
      const heading = document.querySelector(sel);
      if (!heading) {
        continue;
      }
      const text = heading.textContent.trim();
      // Groups: 1 = street, 2 = city, 3 = state, 4 = zip.
      const addressMatch = text.match(/^(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
      if (addressMatch) {
        info.propertyAddress = addressMatch[0];
        info.city = addressMatch[2].trim();
        info.state = addressMatch[3].trim();
        info.zip = addressMatch[4].trim();
        break;
      }
    }

    // Extract property details (SF, type)
    const bodyText = document.body.innerText;

    // Square footage; the leading digit run may contain thousands separators ("12,345 SF").
    const sfMatch = bodyText.match(/(\d[\d,]*\.?\d*\s*k?\s*SF)/i);
    if (sfMatch) {
      info.squareFootage = sfMatch[0];
    }

    // Property type: first label found in the text wins, so the more specific
    // 'General Industrial' must be tested before its substring 'Industrial'.
    const typePatterns = ['Warehouse', 'Office Building', 'Retail Stores', 'General Industrial', 'Industrial', 'Medical Building', 'School', 'Religious', 'Supermarket', 'Financial Building'];
    info.propertyType = typePatterns.find((type) => bodyText.includes(type)) ?? '';

    // Extract emails from mailto: links (drop any "?subject=..." query part)
    document.querySelectorAll('a[href^="mailto:"]').forEach((a) => {
      const email = a.href.replace('mailto:', '').split('?')[0];
      if (email.length > 5) {
        addUnique(info.emails, email);
      }
    });

    // Also try email patterns in text
    const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
    (bodyText.match(emailRegex) || []).forEach((email) => addUnique(info.emails, email));

    // Extract phones from tel: links
    document.querySelectorAll('a[href^="tel:"]').forEach((a) => {
      const phone = a.href.replace('tel:', '');
      if (phone.length >= 10) {
        addUnique(info.phones, phone);
      }
    });

    // Also try phone patterns in text
    const phoneRegex = /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g;
    (bodyText.match(phoneRegex) || []).forEach((phone) => addUnique(info.phones, phone));

    // Extract owner names from Owner tab section. Both patterns are global so
    // matchAll can be used, and only capture group 1 (the name itself, without
    // the "Owner:" / "Owns N properties" prefix) is stored.
    const ownerPatterns = [
      /Owner:\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)/g,
      /Owns\s+\d+\s+properties?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)/gi
    ];
    for (const pattern of ownerPatterns) {
      for (const match of bodyText.matchAll(pattern)) {
        const owner = match[1];
        if (owner.length > 3) {
          addUnique(info.ownerNames, owner);
        }
      }
    }

    return info;
  });
}
/**
 * Extract property IDs from search results.
 *
 * Collects every anchor whose href contains "/property/", pulls the ID out of
 * the href, and returns one entry per distinct ID (the first URL seen for an
 * ID is kept). De-duplicating here keeps the caller from visiting the same
 * property more than once when a result has several links pointing at it.
 *
 * @param {object} page - Playwright page showing search results.
 * @returns {Promise<Array<{id: string, url: string}>>} Distinct properties in document order.
 */
async function extractPropertyIds(page) {
  // Self-contained callback: it is passed to page.evaluate and must not use outer variables.
  return await page.evaluate(() => {
    const seen = new Set();
    const ids = [];
    for (const link of document.querySelectorAll('a[href*="/property/"]')) {
      const href = link.href;
      const match = href.match(/property\/([a-f0-9-]+)/);
      if (!match || seen.has(match[1])) {
        continue;
      }
      seen.add(match[1]);
      ids.push({
        id: match[1],
        url: href
      });
    }
    return ids;
  });
}
/**
 * Wait for contact details using Playwright's waitForFunction.
 *
 * Resolves once the page contains a mailto: link, a tel: link, or text that
 * looks like an email address, then logs how many emails/phones were found.
 * Never throws: any failure is logged and reported as `false`.
 *
 * @param {object} page - Playwright page showing a property.
 * @param {number} [timeoutMs=30000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<boolean>} true when contact details appeared, false on
 *   timeout or on any other error while waiting.
 */
async function waitForContactDetails(page, timeoutMs = 30000) {
  log(` ⏳ Waiting for contact details (up to ${timeoutMs/1000}s)...`);
  try {
    await page.waitForFunction(
      () => {
        if (document.querySelector('a[href^="mailto:"], a[href^="tel:"]')) {
          return true;
        }
        // Also check for email patterns in text
        return /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/.test(document.body.innerText);
      },
      // Playwright's signature is (pageFunction, arg, options): the options
      // object must be the THIRD argument or the timeout is silently ignored.
      undefined,
      { timeout: timeoutMs }
    );
    const data = await extractOwnerTabData(page);
    log(` ✅ Contact details found! (${data.emails.length} emails, ${data.phones.length} phones)`);
    return true;
  } catch (error) {
    if (error?.name === 'TimeoutError') {
      // Timeout is expected if no contacts found
      log(' ⚠️ No contact details found after timeout');
    } else {
      // Still best-effort, but do not mislabel other failures as a timeout.
      log(` ⚠️ Error while waiting for contact details: ${error?.message ?? error}`);
    }
    return false;
  }
}
/**
 * Assemble the lead record stored in the output file from one property's extracted data.
 */
function buildLead(propertyData, propertyUrl, searchId) {
  return {
    scrapeDate: new Date().toISOString().split('T')[0],
    propertyId: propertyData.propertyId,
    propertyUrl: propertyUrl,
    address: propertyData.propertyAddress || '',
    city: propertyData.city || '',
    state: propertyData.state || '',
    zip: propertyData.zip || '',
    squareFootage: propertyData.squareFootage || '',
    propertyType: propertyData.propertyType || '',
    ownerNames: propertyData.ownerNames.join('; ') || '',
    emails: propertyData.emails,
    phones: propertyData.phones,
    searchLocation: SEARCH_LOCATION,
    searchId: searchId
  };
}

/**
 * Open one property page, wait for its contact details and return the resulting lead.
 */
async function scrapeProperty(page, prop, searchId) {
  // Navigate directly to property URL
  log(` 🔗 Navigating to property...`);
  await page.goto(prop.url, { waitUntil: 'networkidle', timeout: 30000 });
  // Wait for page to load
  log(` ⏳ Waiting for Owner tab to load...`);
  // Wait for any heading or content to appear; a missing heading is not fatal
  await page.waitForSelector('h1, h2, h3, [role="heading"]', { timeout: 15000 }).catch(() => {
    log(' ⚠️ No heading found, continuing anyway');
  });
  // Smart wait for contact details using Playwright's waitForFunction
  await waitForContactDetails(page, 30000);
  // Extract data from Owner tab
  log(` 📊 Extracting data from Owner tab...`);
  const propertyData = await extractOwnerTabData(page);
  log(` 📧 Emails: ${propertyData.emails.length} found`);
  log(` 📞 Phones: ${propertyData.phones.length} found`);
  log(` 👤 Owners: ${propertyData.ownerNames.length} found`);
  log(` 🏢 Address: ${propertyData.propertyAddress || 'N/A'}`);
  return buildLead(propertyData, page.url(), searchId);
}

/**
 * Main scraper using Playwright.
 *
 * Logs in, searches for SEARCH_LOCATION, visits up to MAX_PROPERTIES property
 * pages and writes the collected leads to OUTPUT_FILE as JSON.
 *
 * A failure on a single property is logged and skipped so that leads already
 * collected are still saved; the run only fails when login/search fails or
 * when every property fails.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When login, search, or ID extraction fails, or when no
 *   property could be scraped. The browser is closed in every case.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v11 Simple (PLAYWRIGHT - NO FILTERS)...\n');
  // Launch browser
  const browser = await chromium.launch({
    headless: HEADLESS,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });
  // Created inside the try block so the browser is closed even if these calls fail.
  let context;
  let page;
  const leads = [];
  try {
    context = await browser.newContext({
      viewport: { width: 1920, height: 1080 }
    });
    page = await context.newPage();

    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    // Wait for email input
    await page.waitForSelector('input[type="email"]', { timeout: 10000 });
    await page.fill('input[type="email"]', REONOMY_EMAIL);
    await page.fill('input[type="password"]', REONOMY_PASSWORD);
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await page.waitForTimeout(10000);
    // Check if logged in
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle',
      timeout: 60000
    });

    // Perform initial search
    log(`📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);
    // Find and fill search input
    const searchInput = page.locator('input[placeholder*="address"], input[placeholder*="Search"], input[type="text"]').first();
    await searchInput.waitFor({ state: 'visible', timeout: 10000 });
    await searchInput.fill(SEARCH_LOCATION);
    await page.keyboard.press('Enter');
    log('⏳ Searching...');
    await page.waitForTimeout(5000);
    // Extract search ID from URL
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs
    log('\n📍 Step 4: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      log('⚠️ No property IDs found.');
      throw new Error('No properties found on search page.');
    }

    // Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 5: Processing ${propertiesToScrape.length} properties...`);
    let failedCount = 0;
    for (const [index, prop] of propertiesToScrape.entries()) {
      log(`\n[${index + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);
      try {
        leads.push(await scrapeProperty(page, prop, searchId));
      } catch (error) {
        // Skip this property rather than discarding the leads collected so far.
        failedCount += 1;
        log(` ❌ Failed to scrape property ${prop.id}: ${error.message}`);
      }
      // Go back to search results for next property (best-effort: the next
      // iteration navigates by absolute URL anyway)
      try {
        log(` 🔙 Going back to search results...`);
        await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
          waitUntil: 'networkidle',
          timeout: 30000
        });
        await page.waitForTimeout(2000);
      } catch (error) {
        log(` ⚠️ Could not return to search results: ${error.message}`);
      }
    }

    // Keep the failure signal when nothing at all could be scraped.
    if (leads.length === 0 && failedCount > 0) {
      throw new Error(`All ${failedCount} properties failed to scrape.`);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      const outputData = {
        scrapeDate: new Date().toISOString(),
        location: SEARCH_LOCATION,
        searchId: searchId,
        leadCount: leads.length,
        framework: 'Playwright',
        leads: leads
      };
      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v11-simple-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v11-simple-error.png');
      } catch (screenshotError) {
        // The screenshot is only a debugging aid; report and keep the original error.
        log(`⚠️ Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    if (context) {
      try {
        await context.close();
      } catch (closeError) {
        // A failing context close must not prevent the browser from being closed.
        log(`⚠️ Failed to close browser context: ${closeError.message}`);
      }
    }
    await browser.close();
    log('\n🔚 Browser closed');
  }
}
// Run
/**
 * Script entry point: run the scraper once and translate the outcome into the
 * process exit code (0 on success, 1 on any failure).
 */
async function main() {
  try {
    const result = await scrapeLeads();
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
}

main();