clawdbot-workspace/reonomy-scraper-v9-simple.js

298 lines
8.3 KiB
JavaScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Reonomy Scraper v9-SIMPLE - PUPPETEER EDITION
*
* Simplified version without complex regex
* Extracts: Owner names, Property details (Address, City, State, ZIP, SF, Type)
* Removes broken email/phone extraction to avoid issues
*
* Goal: Get working data quickly, add emails/phones later
*/
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
// Configuration. Every value below can be overridden through an environment variable.
//
// SECURITY NOTE(review): the fallback e-mail and password are literal credentials
// committed to source. They are kept here only so existing runs keep working;
// they should be rotated and supplied exclusively via REONOMY_EMAIL / REONOMY_PASSWORD.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_ID = process.env.REONOMY_SEARCH_ID || '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';
// Parse with an explicit radix so the value is always read as decimal, and fall
// back to 20 for anything that is missing, non-numeric, zero or negative
// (a negative limit would make slice(0, MAX_PROPERTIES) drop items from the end).
const requestedMaxProperties = Number.parseInt(process.env.MAX_PROPERTIES, 10);
const MAX_PROPERTIES = requestedMaxProperties > 0 ? requestedMaxProperties : 20;
// Headless unless HEADLESS is set to the exact string 'false'.
const HEADLESS = process.env.HEADLESS !== 'false';
// Output and log files are written next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v9-simple.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v9-simple.log');
/**
 * Prints a message to stdout and appends a timestamped copy to LOG_FILE.
 *
 * @param {*} message - Value to report; the console receives it unchanged and
 *   the log file receives it interpolated into a "[ISO timestamp] message" line.
 */
function log(message) {
  const isoNow = new Date().toISOString();
  const fileLine = `[${isoNow}] ${message}\n`;

  console.log(message);
  // Synchronous append: the line is written before log() returns.
  fs.appendFileSync(LOG_FILE, fileLine);
}
/**
 * Returns a promise that settles after the given delay.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<undefined>} Resolves with no value once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Reads the visible text of the page that is currently loaded and pulls the
 * property and owner fields out of it with simple pattern matching.
 *
 * @param {object} page - Puppeteer page; only `evaluate()` and `url()` are used.
 * @returns {Promise<object>} Record with the keys propertyId, propertyAddress,
 *   city, state, zip, squareFootage, propertyType, ownerNames, emails, phones.
 *   Fields that cannot be found stay empty. `emails` and `phones` are always
 *   empty arrays because this version does not extract them.
 */
async function extractOwnerTabData(page) {
  log('📊 Extracting Owner tab data...');

  // innerText is the page's plain visible text, not a JSON document, so it is
  // used directly (parsing it as JSON would throw on any normal page).
  const rawText = await page.evaluate(() => document.body.innerText);
  const pageText = typeof rawText === 'string' ? rawText : '';

  const ownerData = {
    propertyId: '',
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    squareFootage: '',
    propertyType: '',
    ownerNames: [],
    emails: [],
    phones: []
  };

  // Property ID comes from the URL of the page being viewed.
  const propIdMatch = page.url().match(/property\/([a-f0-9-]+)/);
  if (propIdMatch) {
    ownerData.propertyId = propIdMatch[1];
  }

  // Address pattern: "<number street>, <city>, <ST> <12345>".
  // Capture groups: 1 = street, 2 = city, 3 = state, 4 = ZIP.
  const addressPattern = /(\d+[^,\n]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/;
  const addressMatch = pageText.match(addressPattern);
  if (addressMatch) {
    const [fullAddress, , city, state, zip] = addressMatch;
    ownerData.propertyAddress = fullAddress;
    ownerData.city = city.trim();
    ownerData.state = state.trim();
    ownerData.zip = zip.trim();
    log(` 📍 Address: ${ownerData.propertyAddress}`);
  }

  // Square footage such as "12000 SF" or "25.5k SF".
  const sfMatch = pageText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
  if (sfMatch) {
    ownerData.squareFootage = sfMatch[0];
    log(` 📐 Square Footage: ${sfMatch[0]}`);
  }

  // Property type: the first label found in the text wins. 'General Industrial'
  // must be tested before 'Industrial', otherwise the shorter label (a
  // substring of the longer one) would always match first.
  const typePatterns = [
    'Warehouse', 'Office Building', 'Retail Stores', 'General Industrial',
    'Industrial', 'Medical Building', 'School', 'Religious',
    'Supermarket', 'Financial Building'
  ];
  const matchedType = typePatterns.find((type) => pageText.includes(type));
  if (matchedType) {
    ownerData.propertyType = matchedType;
    log(` 🏢 Property Type: ${matchedType}`);
  }

  // Owner names (simplified): one "Owner: <name>" candidate per line. The name
  // pattern only accepts letters, whitespace and commas, so a name is cut at
  // the first digit or other punctuation.
  for (const line of pageText.split('\n')) {
    const ownerMatch = line.match(/Owner:\s*([A-Z][a-z\s,]+)/i);
    if (!ownerMatch) {
      continue;
    }
    const owner = ownerMatch[1].trim();
    // Ignore very short fragments and names that were already collected.
    if (owner.length > 3 && !ownerData.ownerNames.includes(owner)) {
      ownerData.ownerNames.push(owner);
    }
  }
  log(` 👤 Owners found: ${ownerData.ownerNames.length}`);

  return ownerData;
}
/**
 * Collects the properties linked from the page that is currently loaded.
 *
 * @param {object} page - Puppeteer page; only `evaluate()` is used.
 * @returns {Promise<Array<{id: string, url: string}>>} One entry per distinct
 *   property ID, in document order. When several links point at the same
 *   property, the URL of the first one is kept.
 */
async function extractPropertyIds(page) {
  // The callback runs inside the browser page, so it must be self-contained
  // and cannot reference anything from this file's scope.
  return page.evaluate(() => {
    const seenIds = new Set();
    const properties = [];
    const links = document.querySelectorAll('a[href*="/property/"]');
    for (const link of links) {
      const url = link.href;
      const match = url.match(/property\/([a-f0-9-]+)/);
      // Skip links without a usable ID and repeated links to a property that
      // was already recorded, so each property is scraped only once.
      if (!match || seenIds.has(match[1])) {
        continue;
      }
      seenIds.add(match[1]);
      properties.push({ id: match[1], url });
    }
    return properties;
  });
}
/**
 * Runs the whole scrape: logs in, opens the configured search, collects the
 * property links on the results page, visits each property's ownership page
 * and writes the collected leads to OUTPUT_FILE.
 *
 * A property whose page fails to load or parse is logged and skipped so the
 * leads gathered from the other properties are still saved. The browser is
 * always closed before this function settles, and on a fatal error a
 * screenshot of the current page is attempted before the error is rethrown.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   collected and the path of the output file.
 * @throws {Error} When the post-login URL still looks like a login page, the
 *   search ID cannot be read from the URL, no property links are found, or a
 *   navigation outside the per-property loop fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v9-SIMPLE (Puppeteer edition)...\n');
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });
  // Declared outside the try block so the error handler can screenshot it.
  let page;
  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Login
    log('\n🔐 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await sleep(2000);
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');
    log('⏳ Waiting for login...');
    await sleep(15000);
    // Still being on a login/auth URL after the wait is treated as a failed login.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }
    log('✅ Successfully logged in!');

    // Step 2: Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto(`https://app.reonomy.com/#!/search/${SEARCH_ID}`, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    await sleep(3000);

    // Read the search ID back from the URL the browser ended up on.
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Step 3: Extract property IDs
    log('\n📍 Step 3: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);
    if (propertyIds.length === 0) {
      throw new Error('No properties found on search page.');
    }

    // Step 4: Process each property (at most MAX_PROPERTIES of them)
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);
    const leads = [];
    for (const [index, prop] of propertiesToScrape.entries()) {
      log(`\n[${index + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);
      try {
        // Navigate to property ownership page directly
        log(` 🔗 Navigating to ownership page...`);
        const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;
        await page.goto(ownershipUrl, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });
        // Fixed delay to give the Owner tab time to render.
        log(` ⏳ Waiting for Owner tab to load...`);
        await sleep(8000);

        log(` 📊 Extracting data from Owner tab...`);
        const ownerData = await extractOwnerTabData(page);
        const lead = {
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: ownershipUrl,
          ...ownerData
        };
        log(` 👤 Owners: ${lead.ownerNames.length}`);
        log(` 📍 Address: ${lead.propertyAddress || 'N/A'}`);
        leads.push(lead);

        // Go back to search results for next property
        log(` 🔙 Going back to search results...`);
        await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });
        await sleep(3000);
      } catch (propertyError) {
        // One bad property must not discard the leads already collected.
        log(` ❌ Skipping property ${prop.id}: ${propertyError.message}`);
      }
    }

    // Step 5: Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);
      const outputData = {
        scrapeDate: new Date().toISOString(),
        searchId,
        leadCount: leads.length,
        leads
      };
      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠ No leads scraped.');
    }
    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    // Best-effort screenshot of the failing page; this is the only scope in
    // which the page object exists. A screenshot failure must not hide the
    // original error.
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v9-simple-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v9-simple-error.png');
      } catch (screenshotError) {
        log(`⚠ Could not save error screenshot: ${screenshotError.message}`);
      }
    }
    throw error;
  } finally {
    // Always release the browser process, on success and on failure alike.
    try {
      await browser.close();
    } catch (closeError) {
      log(`⚠ Could not close browser: ${closeError.message}`);
    }
  }
}
// Script entry point: run the scrape, report the outcome and exit with a
// status code that reflects it (0 on success, 1 on failure).
scrapeLeads()
  .then((result) => {
    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${result.outputFile}`);
    process.exit(0);
  })
  .catch((error) => {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);
    // No screenshot is attempted here: the Puppeteer page is local to
    // scrapeLeads() and is not reachable from this scope.
    // Exit explicitly instead of rethrowing; a throw inside this handler would
    // only produce an unhandled promise rejection.
    process.exit(1);
  });