clawdbot-workspace/reonomy-scraper-v14.js
2026-02-18 23:01:51 -05:00

890 lines
33 KiB
JavaScript
Executable File

#!/usr/bin/env node
/**
* Reonomy Scraper v14 - Self-Configuring Search + Extract
*
* Sets up search filters IN THE SAME browser session, then scrapes.
* No more search ID handoff problem.
*
* ENV CONFIG:
* REONOMY_EMAIL - Login email (a built-in default is used when unset)
* REONOMY_PASSWORD - Login password (a built-in default is used when unset)
* REONOMY_STATE - Location filter (e.g., "New Jersey"; default "New Jersey")
* REONOMY_TYPES - Comma-separated property types (e.g., "Industrial"; default "Industrial")
* REONOMY_MIN_SF - Min building area in SF (e.g., "50000")
* REONOMY_SALE_FILTER - "[not_]within_<period>" where <period> is 90d, 1y, 2y, 5y or 10y
*                       (e.g., "not_within_10y", "within_5y")
* REONOMY_OWNER_PHONE - "true" to require phone
* REONOMY_OWNER_EMAIL - "true" to require email
* MAX_PROPERTIES - Max properties to scrape (default 20)
* HEADLESS - "false" to show browser
*/
const { execSync } = require('child_process');
const fs = require('fs');
const os = require('os');
const path = require('path');
// Directory that holds the auth state, results, log and daily-stats files.
// Falls back to os.homedir() so an unset $HOME no longer makes path.join() throw.
const WORKSPACE_DIR = path.join(process.env.HOME || os.homedir(), '.clawdbot/workspace');

// MAX_PROPERTIES parsed as a base-10 integer; anything non-positive or
// non-numeric falls back to the documented default of 20.
const REQUESTED_MAX_PROPERTIES = Number.parseInt(process.env.MAX_PROPERTIES, 10);

/**
 * Runtime configuration, resolved once at start-up from environment variables.
 */
const CONFIG = {
  authStatePath: path.join(WORKSPACE_DIR, 'reonomy-auth.json'),
  outputPath: path.join(WORKSPACE_DIR, 'reonomy-leads-v14.json'),
  logPath: path.join(WORKSPACE_DIR, 'reonomy-scraper-v14.log'),
  dailyLogPath: path.join(WORKSPACE_DIR, 'reonomy-daily-stats.json'),
  maxProperties: REQUESTED_MAX_PROPERTIES > 0 ? REQUESTED_MAX_PROPERTIES : 20,
  maxDailyProperties: 50,
  // SECURITY FIXME(review): the fallback credentials below are committed in
  // source. They are kept only so existing invocations keep working; move them
  // to the environment / a secret store and rotate them.
  email: process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com',
  password: process.env.REONOMY_PASSWORD || '9082166532',
  // Search filters
  searchState: process.env.REONOMY_STATE || 'New Jersey',
  // Empty entries (e.g. a trailing comma in REONOMY_TYPES) are dropped.
  propertyTypes: (process.env.REONOMY_TYPES || 'Industrial')
    .split(',')
    .map((s) => s.trim())
    .filter(Boolean),
  minSF: process.env.REONOMY_MIN_SF || '',
  saleFilter: process.env.REONOMY_SALE_FILTER || '', // e.g., "not_within_10y"
  ownerPhone: process.env.REONOMY_OWNER_PHONE === 'true',
  ownerEmail: process.env.REONOMY_OWNER_EMAIL === 'true',
};
/**
 * Prints a timestamped message to stdout and appends the same line to the
 * run log at CONFIG.logPath.
 * @param {string} msg - Text to record.
 */
function log(msg) {
  const entry = `[${new Date().toISOString()}] ${msg}`;
  console.log(entry);
  fs.appendFileSync(CONFIG.logPath, `${entry}\n`);
}
/**
 * Runs one `agent-browser` CLI command synchronously.
 *
 * @param {string} cmd - Arguments appended after `agent-browser`.
 * @param {{verbose?: boolean, timeout?: number}} [options] - `verbose: false`
 *   suppresses logging of the command and of any error; `timeout` is in
 *   milliseconds (30000 when omitted or falsy).
 * @returns {{success: true, output: string} | {success: false, error: string}}
 *   Trimmed stdout on success; stderr (or the error message when stderr is
 *   empty) on failure. Never throws for a failed command.
 */
function ab(cmd, options = {}) {
  const command = `agent-browser ${cmd}`;
  const shouldLog = options.verbose !== false;
  if (shouldLog) {
    log(` > ${command.substring(0, 120)}`);
  }

  let stdout;
  try {
    stdout = execSync(command, {
      encoding: 'utf8',
      timeout: options.timeout || 30000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
  } catch (err) {
    const detail = err.stderr?.toString() || err.message;
    if (shouldLog) {
      log(` ! Error: ${detail.substring(0, 200)}`);
    }
    return { success: false, error: detail };
  }
  return { success: true, output: stdout.trim() };
}
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Sleeps for a random whole number of milliseconds in [min, max] (inclusive).
 * @param {number} min - Shortest delay in milliseconds.
 * @param {number} max - Longest delay in milliseconds.
 * @returns {Promise<void>} The promise returned by sleep().
 */
function randomDelay(min, max) {
  const span = max - min + 1;
  const waitMs = Math.floor(Math.random() * span) + min;
  return sleep(waitMs);
}
/**
 * Returns a shuffled copy of `arr` (Fisher-Yates, walking from the last index
 * down to 1). The input is not modified.
 * @param {Iterable<*>} arr - Items to shuffle.
 * @returns {Array<*>} New array with the same items in random order.
 */
function shuffle(arr) {
  const result = [];
  for (const item of arr) {
    result.push(item);
  }

  let last = result.length - 1;
  while (last > 0) {
    const pick = Math.floor(Math.random() * (last + 1));
    const held = result[last];
    result[last] = result[pick];
    result[pick] = held;
    last -= 1;
  }
  return result;
}
/**
 * Loads today's scrape counters from CONFIG.dailyLogPath.
 *
 * "Today" is the UTC calendar date (taken from toISOString()). A missing,
 * unreadable, or stale (different date) file yields fresh zeroed counters.
 * Counters that are not non-negative integers are reset to 0 so that a
 * damaged file cannot turn the daily-limit arithmetic into NaN.
 *
 * @returns {{date: string, propertiesScraped: number, leadsFound: number}}
 */
function getDailyStats() {
  const today = new Date().toISOString().split('T')[0];
  const fresh = { date: today, propertiesScraped: 0, leadsFound: 0 };

  let stored;
  try {
    stored = JSON.parse(fs.readFileSync(CONFIG.dailyLogPath, 'utf8'));
  } catch (err) {
    // A missing file is the normal first-run case; anything else is worth a
    // log line, but reading stats stays best-effort.
    if (err.code !== 'ENOENT') {
      log(` ! Ignoring unreadable daily stats (${err.message}); starting from zero`);
    }
    return fresh;
  }

  if (stored === null || typeof stored !== 'object' || stored.date !== today) {
    return fresh;
  }

  const toCount = (value) => (Number.isInteger(value) && value >= 0 ? value : 0);
  return {
    ...stored,
    propertiesScraped: toCount(stored.propertiesScraped),
    leadsFound: toCount(stored.leadsFound),
  };
}
/**
 * Overwrites CONFIG.dailyLogPath with `stats` as 2-space-indented JSON.
 * @param {object} stats - Counters object to persist.
 */
function saveDailyStats(stats) {
  const serialized = JSON.stringify(stats, null, 2);
  fs.writeFileSync(CONFIG.dailyLogPath, serialized);
}
// ── LOGIN ──
/**
 * Logs in to Reonomy through agent-browser and saves the auth state to
 * CONFIG.authStatePath.
 *
 * If the login form is not shown and the current URL is already inside the
 * app (not a login/auth page), the session is treated as logged in.
 *
 * @returns {Promise<boolean>} Always `true` on success.
 * @throws {Error} When the login form or its fields cannot be found, or when
 *   the browser is still on a login/auth URL after submitting.
 */
async function login() {
  // POSIX single-quote escaping: ab() hands the command to a shell, so a
  // double-quoted credential containing ", $ or a backtick would be mangled
  // or executed. Single quotes pass every character through literally.
  const shellQuote = (value) => `'${String(value).replace(/'/g, "'\\''")}'`;

  log(' Navigating to login...');
  ab('open "https://app.reonomy.com/!/login"');
  await sleep(4000);

  const snap = ab('snapshot -i');
  if (!snap.output?.includes('textbox "Email"')) {
    const href = ab('eval "window.location.href"').output;
    const insideApp = href?.includes('app.reonomy.com')
      && !href.includes('login')
      && !href.includes('auth.reonomy.com');
    if (insideApp) {
      log(' Already logged in!');
      return true;
    }
    throw new Error('Login form not found');
  }

  const emailRef = snap.output.match(/textbox "Email" \[ref=(e\d+)\]/)?.[1];
  const passRef = snap.output.match(/textbox "Password" \[ref=(e\d+)\]/)?.[1];
  const loginRef = snap.output.match(/button "Log In" \[ref=(e\d+)\]/)?.[1];
  if (!emailRef || !passRef || !loginRef) throw new Error('Login form elements not found');

  ab(`fill @${emailRef} ${shellQuote(CONFIG.email)}`);
  await sleep(1000);

  // ab() logs the command line it runs, which would write the password into
  // the log file. Run this one silently and log a redacted line instead. The
  // error detail is not logged either, because it can echo the command.
  log(` > agent-browser fill @${passRef} [password redacted]`);
  const passResult = ab(`fill @${passRef} ${shellQuote(CONFIG.password)}`, { verbose: false });
  if (!passResult.success) log(' ! Error: password fill command failed');
  await sleep(1000);

  ab(`click @${loginRef}`);
  await randomDelay(12000, 16000);

  const postUrl = ab('eval "window.location.href"');
  if (postUrl.output?.includes('auth.reonomy.com') || postUrl.output?.includes('login')) {
    throw new Error('Login failed');
  }
  ab(`state save "${CONFIG.authStatePath}"`);
  log(' Login successful!');
  return true;
}
// ── BUILD SEARCH ──
/** Escapes regex metacharacters so `text` is matched literally inside a RegExp. */
function escapeRegExp(text) {
  return String(text).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/** Wraps `value` in POSIX single quotes so the shell passes it through verbatim. */
function quoteShellArg(value) {
  return `'${String(value).replace(/'/g, "'\\''")}'`;
}

/** Returns capture group 1 of `pattern` in a snapshot result, or undefined. */
function snapshotRef(snap, pattern) {
  return snap.output?.match(pattern)?.[1];
}

/** Clicks "Let's Go" onboarding banners until none is left (at most 5). */
async function dismissSearchBanners() {
  for (let i = 0; i < 5; i++) {
    const snap = ab('snapshot -i', { verbose: false });
    const bannerRef = snapshotRef(snap, /button "Let's Go" \[ref=(e\d+)\]/);
    if (!bannerRef) break;
    ab(`click @${bannerRef}`, { verbose: false });
    await sleep(800);
  }
}

/**
 * Clears the focused field and types `text` one key press at a time.
 * `fill` sets .value directly and skips React's synthetic onChange, so the
 * autocomplete never fires; individual `press` commands send real key events.
 */
async function typeViaKeyPresses(text) {
  ab('press Control+a');
  await sleep(200);
  ab('press Backspace');
  await sleep(500);
  for (const char of text) {
    if (char === ' ') ab('press Space');
    else if (/^[A-Za-z0-9]$/.test(char)) ab(`press ${char}`);
    else ab(`press ${quoteShellArg(char)}`); // keep shell metacharacters inert
    await sleep(150);
  }
}

/**
 * Clicks, inside the page, the first <div> whose trimmed text equals `text`
 * and whose class name contains "jss". The eval result is "<label>: <found>".
 * `text` and `label` must not contain quotes (callers pass fixed strings).
 */
function clickJssDivByText(text, label = text) {
  return ab(`eval "
    const divs = Array.from(document.querySelectorAll('div'));
    const btn = divs.find(d => d.textContent.trim() === '${text}' && d.className.includes('jss'));
    if (btn) btn.click();
    '${label}: ' + !!btn;
  "`);
}

/**
 * Parses REONOMY_SALE_FILTER ("[not_]within_<period>") into UI labels.
 * Unrecognised input falls back to "Within" / "Past 10 years".
 */
function parseSaleFilter(filter) {
  const periodLabels = new Map([
    ['90d', 'Past 90 days'],
    ['1y', 'Past year'],
    ['2y', 'Past 2 years'],
    ['5y', 'Past 5 years'],
    ['10y', 'Past 10 years'],
  ]);
  // Splitting on "_within_" left "within_5y" unsplit (no leading underscore),
  // so every "within_*" filter silently became "Past 10 years".
  const match = /^(not_)?within_(.+)$/.exec(String(filter).trim());
  const periodText = match ? periodLabels.get(match[2]) : undefined;
  return {
    withinLabel: match?.[1] ? 'Not Within' : 'Within',
    periodText: periodText || 'Past 10 years',
    recognized: Boolean(periodText),
  };
}

/** Types the state into the search box and picks the matching suggestion. */
async function applyLocationFilter() {
  log(` Setting location: ${CONFIG.searchState}`);
  // Fresh snapshot: refs change after the Advanced Search click.
  let snap = ab('snapshot -i');
  let searchBox = snapshotRef(
    snap,
    /textbox "(?:Search by address, location, or owner|Address, Location, or Owner)" \[ref=(e\d+)\]/,
  );
  if (!searchBox) throw new Error('Search box not found');

  ab(`click @${searchBox}`);
  await sleep(1000);
  await typeViaKeyPresses(CONFIG.searchState);
  await sleep(3000);

  const suggestionPattern = new RegExp(
    `menuitem "${escapeRegExp(CONFIG.searchState)}" \\[ref=(e\\d+)\\]`,
  );
  let stateRef = null;
  // Up to 4 attempts to find the state suggestion, retyping between attempts.
  for (let attempt = 0; attempt < 4 && !stateRef; attempt++) {
    snap = ab('snapshot -i');
    stateRef = snapshotRef(snap, suggestionPattern);
    if (stateRef) break;

    log(` State not found (attempt ${attempt + 1}), retrying...`);
    // Re-find the search box; its ref might have changed.
    searchBox = snapshotRef(snap, /textbox "(?:Search by address|Address)[^"]*" \[ref=(e\d+)\]/)
      || snapshotRef(snap, /textbox "[^"]*" \[ref=(e\d+)\]/);
    if (searchBox) {
      ab(`click @${searchBox}`);
      await sleep(500);
      await typeViaKeyPresses(CONFIG.searchState);
      await sleep(4000);
    } else {
      await sleep(2000);
    }
  }
  if (!stateRef) throw new Error(`State "${CONFIG.searchState}" not found in suggestions`);
  ab(`click @${stateRef}`);
  await sleep(6000);
}

/** Opens the Property Type panel, ticks each configured type, and applies. */
async function applyPropertyTypeFilter() {
  log(` Setting property types: ${CONFIG.propertyTypes.join(', ')}`);
  let snap = ab('snapshot -i');
  const ptRef = snapshotRef(snap, /button "Property Type" \[ref=(e\d+)\]/);
  if (!ptRef) throw new Error('Property Type button not found');
  ab(`click @${ptRef}`);
  await sleep(2000);

  snap = ab('snapshot -i');
  for (const ptype of CONFIG.propertyTypes) {
    const checkboxPattern = new RegExp(`checkbox "${escapeRegExp(ptype)}" \\[ref=(e\\d+)\\]`);
    const cbRef = snapshotRef(snap, checkboxPattern);
    if (cbRef) {
      ab(`click @${cbRef}`);
      await sleep(500);
      continue;
    }

    log(` ! Property type "${ptype}" not found in quick list, checking See All...`);
    const seeAllRef = snapshotRef(snap, /button "See All Property Types" \[ref=(e\d+)\]/);
    if (!seeAllRef) {
      log(` ! Could not find "${ptype}" checkbox`);
      continue;
    }
    ab(`click @${seeAllRef}`);
    await sleep(2000);
    snap = ab('snapshot -i');
    const expandedRef = snapshotRef(snap, checkboxPattern);
    if (expandedRef) {
      ab(`click @${expandedRef}`);
      await sleep(500);
    } else {
      log(` ! Could not find "${ptype}" checkbox`);
    }
  }

  snap = ab('snapshot -i');
  const ptApply = snapshotRef(snap, /button "Apply" \[ref=(e\d+)\]/);
  if (ptApply) {
    ab(`click @${ptApply}`);
    await sleep(5000);
  }
}

/** Sets the minimum Building Area (SF) in the Size panel and verifies the tag. */
async function applySizeFilter() {
  log(` Setting min building area: ${CONFIG.minSF} SF`);
  let snap = ab('snapshot -i');
  const sizeRef = snapshotRef(snap, /button "Size" \[ref=(e\d+)\]/);
  if (sizeRef) {
    ab(`click @${sizeRef}`);
    await sleep(2000);

    // The panel has two "min" textboxes; the SECOND one is Building Area
    // (the first is Total Units).
    snap = ab('snapshot -i');
    const minRefs = [...(snap.output?.matchAll(/textbox "min" \[ref=(e\d+)\]/g) || [])];
    if (minRefs.length >= 2) {
      const areaMinRef = minRefs[1][1];
      ab(`click @${areaMinRef}`); // opens the preset dropdown
      await sleep(1500);
      ab(`type @${areaMinRef} ${quoteShellArg(CONFIG.minSF)}`);
      await sleep(1000);

      // Prefer a preset dropdown option (e.g., "50k sf") when one matches.
      snap = ab('snapshot -i');
      const sfK = Math.round(Number.parseInt(CONFIG.minSF, 10) / 1000);
      const presetPatterns = [`${sfK}k sf`, `${CONFIG.minSF}`, `${sfK},000`];
      let presetClicked = false;
      for (const pat of presetPatterns) {
        const presetRef = snapshotRef(
          snap,
          new RegExp(
            `(?:menuitem|option|button|listitem) "${escapeRegExp(pat)}[^"]*" \\[ref=(e\\d+)\\]`,
            'i',
          ),
        );
        if (presetRef) {
          log(` Clicking preset: ${pat}`);
          ab(`click @${presetRef}`);
          presetClicked = true;
          await sleep(2000);
          break;
        }
      }
      if (!presetClicked) {
        // Commit the typed value with a synthetic Enter key.
        log(' No preset found, pressing Enter to commit');
        ab(`eval "document.querySelectorAll('input[placeholder=\\"min\\"]')[1]?.dispatchEvent(new KeyboardEvent('keydown', {key: 'Enter', keyCode: 13, bubbles: true}))"`);
        await sleep(1000);
      }
    }

    await sleep(1000);
    snap = ab('snapshot -i');
    const sizeApply = snapshotRef(snap, /button "Apply" \[ref=(e\d+)\](?!\s*\[disabled\])/);
    if (sizeApply) {
      ab(`click @${sizeApply}`);
      await sleep(5000);
    } else if (snap.output?.match(/button "\d+.*SF" \[ref=(e\d+)\]/)) {
      // A filter tag such as "50000+ SF" is already showing.
      log(' Size filter appears applied via tag');
    } else {
      log(' Pressing Escape to close size panel');
      ab('eval "document.dispatchEvent(new KeyboardEvent(\'keydown\', {key: \'Escape\', bubbles: true}))"');
      await sleep(2000);
    }
  }

  // Verify the size filter is applied.
  snap = ab('snapshot -i');
  const sfTag = snap.output?.match(/button "\d+.*SF" \[ref=(e\d+)\]/);
  if (sfTag) {
    log(` Size filter confirmed: ${sfTag[0]}`);
  } else {
    log(' WARNING: Size filter may not be applied');
  }
}

/** Applies the "More Filters > Sales" within / not-within period filter. */
async function applySaleFilter() {
  log(` Setting sale filter: ${CONFIG.saleFilter}`);
  await sleep(1000);
  // Close any open panel first by clicking the page body.
  ab('eval "document.body.click()"');
  await sleep(1000);

  let snap = ab('snapshot -i');
  const moreRef = snapshotRef(snap, /button "More [Ff]ilters" \[ref=(e\d+)\]/);
  if (!moreRef) {
    log(' WARNING: More Filters button not found; sale filter not applied');
    return;
  }
  ab(`click @${moreRef}`);
  await sleep(3000);

  snap = ab('snapshot -i');
  const salesRef = snapshotRef(snap, /tab "Sales[^"]*" \[ref=(e\d+)\]/);
  if (!salesRef) {
    log(' WARNING: Sales tab not found; sale filter not applied');
    return;
  }
  ab(`click @${salesRef}`);
  await sleep(2000);

  const { withinLabel, periodText, recognized } = parseSaleFilter(CONFIG.saleFilter);
  if (!recognized) {
    log(` WARNING: Unrecognised sale filter "${CONFIG.saleFilter}", using ${withinLabel} / ${periodText}`);
  }
  clickJssDivByText(withinLabel);
  await sleep(1500);
  clickJssDivByText(periodText);
  await sleep(1500);

  snap = ab('snapshot -i');
  const salesApply = snapshotRef(snap, /button "Apply" \[ref=(e\d+)\]/);
  if (salesApply) {
    ab(`click @${salesApply}`);
    await sleep(5000);
  }
}

/** Applies the "More Filters > Owner" phone / email toggles. */
async function applyOwnerFilters() {
  log(` Setting owner filters: phone=${CONFIG.ownerPhone}, email=${CONFIG.ownerEmail}`);
  let snap = ab('snapshot -i');
  let moreRef = snapshotRef(snap, /button "More [Ff]ilters" \[ref=(e\d+)\]/);
  if (!moreRef) {
    // The panel might still be open from the sales filter: close and reopen.
    const closeRef = snapshotRef(snap, /button "Close" \[ref=(e\d+)\]/);
    if (closeRef) {
      ab(`click @${closeRef}`);
      await sleep(1000);
      snap = ab('snapshot -i');
      moreRef = snapshotRef(snap, /button "More [Ff]ilters" \[ref=(e\d+)\]/);
    }
  }
  if (moreRef) {
    ab(`click @${moreRef}`);
    await sleep(2000);
  }

  snap = ab('snapshot -i');
  const ownerTabRef = snapshotRef(snap, /tab "Owner[^"]*" \[ref=(e\d+)\]/);
  if (!ownerTabRef) return;
  ab(`click @${ownerTabRef}`);
  await sleep(2000);

  if (CONFIG.ownerPhone) {
    clickJssDivByText('Includes Phone Number', 'phone');
    await sleep(1000);
  }
  if (CONFIG.ownerEmail) {
    clickJssDivByText('Includes Email Address', 'email');
    await sleep(1000);
  }

  snap = ab('snapshot -i');
  const ownerApply = snapshotRef(snap, /button "Apply" \[ref=(e\d+)\]/);
  if (ownerApply) {
    ab(`click @${ownerApply}`);
    await sleep(5000);
  }
}

/**
 * Builds the search in the current browser session by applying the filters
 * from CONFIG in order: location, property type, size, sale date, owner.
 *
 * @returns {Promise<string>} The search ID parsed from the resulting URL, or
 *   'unknown' when the URL contains no search ID.
 * @throws {Error} When the search box, the state suggestion, or the Property
 *   Type button cannot be found.
 */
async function buildSearch() {
  log('\n== Building search with filters ==');
  ab('open "https://app.reonomy.com/!/search"');
  await sleep(5000);
  await dismissSearchBanners();

  // Click Advanced Search if visible.
  const landing = ab('snapshot -i');
  const advRef = snapshotRef(landing, /link "Advanced Search" \[ref=(e\d+)\]/);
  if (advRef) {
    ab(`click @${advRef}`);
    await sleep(3000);
  }

  await applyLocationFilter();
  await applyPropertyTypeFilter();
  if (CONFIG.minSF) await applySizeFilter();
  if (CONFIG.saleFilter) await applySaleFilter();
  if (CONFIG.ownerPhone || CONFIG.ownerEmail) await applyOwnerFilters();

  // Read back the search ID and the property count.
  await sleep(2000);
  const url = ab('eval "window.location.href"');
  const searchId = url.output?.match(/search\/([a-f0-9-]+)/)?.[1] ?? 'unknown';
  const results = ab('snapshot');
  const propertyCount = results.output?.match(/heading "([0-9,]+) properties"/)?.[1] ?? '?';
  log(`\n Search ready: ${propertyCount} properties`);
  log(` Search ID: ${searchId}`);
  log(` URL: ${url.output}`);
  if (searchId === 'unknown') {
    log(' WARNING: No search ID in URL; returning to the results page later may fail');
  }
  return searchId;
}
// ── EXTRACT OWNERS FROM TAB ──
/**
 * Extracts owner contacts from an Owner-tab snapshot.
 *
 * The snapshot is scanned line by line. A `link "<name>"` line starts a new
 * owner (navigation/action links are ignored); the `button` lines that follow
 * are read as that owner's phone numbers and e-mail addresses. Owners without
 * any contact are dropped, and owners whose names match case-insensitively
 * are merged with duplicate contacts removed.
 *
 * @param {string} snapshot - Text output of `agent-browser snapshot -i`.
 * @returns {Array<{name: string, phones: Array<{number: string, type: string}>, emails: string[]}>}
 */
function extractOwnersFromTab(snapshot) {
  if (!snapshot) return [];

  // Link texts containing any of these are UI chrome, not owner names.
  const nonOwnerLinkWords = [
    'Call', 'Send', 'Sign', 'Advanced', 'http', "Don't", 'Google', 'Terms', 'Report',
  ];
  const hasContact = (owner) => owner.phones.length > 0 || owner.emails.length > 0;

  const owners = [];
  let currentOwner = null;

  for (const line of snapshot.split('\n')) {
    const linkText = line.match(/link "([^"]+)" \[ref=e\d+\]/)?.[1];
    const looksLikeOwner = linkText !== undefined
      && !nonOwnerLinkWords.some((word) => linkText.includes(word))
      && linkText.length > 2
      && linkText.length < 80
      && /[A-Z]/.test(linkText);
    if (looksLikeOwner) {
      if (currentOwner && hasContact(currentOwner)) owners.push(currentOwner);
      // Strip a trailing job title (and anything after it) from the name.
      const cleanName = linkText
        .replace(/\s+(?:President|CEO|Manager|Member|Director|Officer|Offi|Secretary|Treasurer|VP|Vice President|Partner|Owner|Agent|Trustee|Chairman|Principal|Chief (?:Executive|Financial|Operating|Marketing)|Senior (?:Account|Vice|Manager|Director)|SHAR(?:EHOLDER)?|Shareholder|Authorized (?:Person|Agent|Rep)|Registered Agent|Statutory Agent|General Partner|Limited Partner|Managing Member|Sole Member|Organizer).*$/i, '')
        .replace(/\s+(?:senior|chief|managing|authorized|registered|offi)$/i, '')
        .trim();
      currentOwner = { name: cleanName, phones: [], emails: [] };
    }

    if (!currentOwner) continue; // contacts before the first owner are ignored

    // The optional "1-" country prefix is captured from the number itself.
    // Testing line.includes('1-') also matched numbers such as 201-555-1234
    // and gave them a bogus "1-" prefix.
    const phoneMatch = line.match(/button "(1-)?(\d{3}-\d{3}-\d{4})(?:\s+(\w+))?" \[ref=e\d+\]/);
    if (phoneMatch) {
      currentOwner.phones.push({
        number: `${phoneMatch[1] ?? ''}${phoneMatch[2]}`,
        type: phoneMatch[3] || 'Unknown',
      });
    }

    const longPhoneMatch = line.match(/button "(\d{10,14})" \[ref=e\d+\]/);
    if (longPhoneMatch) {
      currentOwner.phones.push({ number: longPhoneMatch[1], type: 'Unknown' });
    }

    const emailMatch = line.match(/button "([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})" \[ref=e\d+\]/);
    if (emailMatch) {
      currentOwner.emails.push(emailMatch[1]);
    }
  }
  if (currentOwner && hasContact(currentOwner)) owners.push(currentOwner);

  // Merge owners that share a name (case-insensitive); first spelling wins.
  const byName = new Map();
  for (const owner of owners) {
    const key = owner.name.toLowerCase();
    const existing = byName.get(key);
    if (!existing) {
      byName.set(key, { ...owner });
      continue;
    }
    for (const phone of owner.phones) {
      if (!existing.phones.some((known) => known.number === phone.number)) {
        existing.phones.push(phone);
      }
    }
    for (const email of owner.emails) {
      if (!existing.emails.includes(email)) existing.emails.push(email);
    }
  }
  return [...byName.values()];
}
// ── SCRAPE PROPERTIES ──
/**
 * Picks address-like level-6 headings out of a search-results snapshot.
 * @param {string|undefined} snapOutput - Output of `agent-browser snapshot`.
 * @returns {Array<{name: string, ref: string}>} Address (max 80 chars) and element ref.
 */
function parseSearchResultAddresses(snapOutput) {
  const props = [];
  const matches = snapOutput?.matchAll(/heading "([^"]+)" \[ref=(e\d+)\] \[level=6\]/g) || [];
  for (const m of matches) {
    const text = m[1];
    if (text.includes('properties') || text.includes('Recently') ||
      text.includes('Get the most') || text.includes('What would') ||
      text.length < 10) continue;
    if (/\d+.*,\s*[A-Z]{2}\s*\d{5}/i.test(text) ||
      /\d+.*(?:st|ave|blvd|dr|ln|rd|way|ct|highway|pl|cir|route|tpke|pkwy|pike|hwy|terr?|loop|pass|trail|sq|park|grove|run|plz)/i.test(text)) {
      props.push({ name: text.substring(0, 80), ref: m[2] });
    }
  }
  return props;
}

/** Converts "178,880", "50k" or "1.2m" into a plain digit string ("50000"). */
function normalizeSquareFeet(raw) {
  const value = raw.replace(/,/g, '');
  const suffix = value.slice(-1).toLowerCase();
  if (suffix === 'k') return String(Math.round(parseFloat(value.slice(0, -1)) * 1000));
  if (suffix === 'm') return String(Math.round(parseFloat(value.slice(0, -1)) * 1000000));
  return value;
}

/**
 * Parses property details out of the detail-page text.
 *
 * @param {string} normalizedText - Page text with real newlines.
 * @param {string} searchAddress - Address as shown in the search results.
 * @returns {{propertyAddress: string, propertyInfo: object}} The (possibly
 *   better-cased) address plus whichever of city, state, zip, squareFootage,
 *   yearBuilt, propertyType, lotSize, units could be found.
 */
function extractPropertyInfo(normalizedText, searchAddress) {
  // Start from the search-result address. Detail-page headings are NOT used
  // because they contain pagination text like "1 of 2,792 properties".
  let propertyAddress = searchAddress;
  const propertyInfo = {};

  // City/state/zip from e.g. "9835 N Virginia St, Reno, NV 89506".
  const addrParts = propertyAddress.match(/,\s*([^,]+),\s*([A-Z]{2})\s*(\d{5})/i);
  if (addrParts) {
    propertyInfo.city = addrParts[1].trim().replace(/\b\w/g, c => c.toUpperCase());
    propertyInfo.state = addrParts[2].toUpperCase();
    propertyInfo.zip = addrParts[3];
  }

  // Prefer a properly-cased address line from the page when it names the same city.
  const properCaseAddr = normalizedText.match(/\n([\d]+ [^\n]+, [A-Z]{2} \d{5})\n/);
  if (properCaseAddr && propertyInfo.city && properCaseAddr[1].toLowerCase().includes(propertyInfo.city.toLowerCase())) {
    propertyAddress = properCaseAddr[1];
    const newParts = propertyAddress.match(/,\s*([^,]+),\s*([A-Z]{2})\s*(\d{5})/i);
    if (newParts) {
      propertyInfo.city = newParts[1].trim();
    }
  }

  // Square footage, most reliable source first.
  // 1. "Building Area\n178,880 sf" from the Building & Lot tab.
  const baMatch = normalizedText.match(/Building Area\s+([\d,]+(?:\.\d+)?)\s*sf/i);
  if (baMatch) {
    propertyInfo.squareFootage = baMatch[1].replace(/,/g, '');
  }
  // 2. Search-listing format: this property's address followed by "NNk SF".
  if (!propertyInfo.squareFootage) {
    const escapedAddr = propertyAddress.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const addrSfMatch = normalizedText.match(new RegExp(escapedAddr + '\\s+((?:[\\d,]+(?:\\.\\d+)?)[km]?)\\s*(?:SF|sq\\.?\\s*ft)', 'i'));
    if (addrSfMatch) {
      propertyInfo.squareFootage = normalizeSquareFeet(addrSfMatch[1]);
    }
  }
  // 3. Any SF value above 500 that is not the "min" filter badge.
  if (!propertyInfo.squareFootage) {
    const allSfMatches = [...normalizedText.matchAll(/([\d,]+(?:\.\d+)?[km]?)\s*(?:SF|sf|sq\.?\s*ft)/g)];
    for (const m of allSfMatches) {
      const prefix = normalizedText.substring(Math.max(0, m.index - 30), m.index);
      if (prefix.includes('min:') || prefix.includes('min ') || prefix.includes('min\n')) continue;
      const sfVal = normalizeSquareFeet(m[1]);
      if (Number.parseInt(sfVal, 10) > 500) {
        propertyInfo.squareFootage = sfVal;
        break;
      }
    }
  }

  // Year built: "Year Built\n2025" (B&L tab) or "Built in 2025" (search listing).
  const yearMatch = normalizedText.match(/Year Built\s+(\d{4})/i)
    || normalizedText.match(/Built in (\d{4})/i);
  if (yearMatch) propertyInfo.yearBuilt = yearMatch[1];

  // Property type from the B&L tab. Case-sensitive "Type" on purpose: the
  // filter badge reads "Property type: Industrial" (lowercase 't').
  const ptBLMatch = normalizedText.match(/Property Type\s*\n\s*([^\n]+)/);
  if (ptBLMatch && ptBLMatch[1].length < 60 && ptBLMatch[1].length > 2) {
    // "Industrial | General Industrial" -> "General Industrial"
    const parts = ptBLMatch[1].split('|').map(s => s.trim());
    propertyInfo.propertyType = parts[parts.length - 1];
  }
  // Fallback: "NNk SF TypeName" pattern from the search listing.
  if (!propertyInfo.propertyType) {
    const sfTypeMatch = normalizedText.match(/(?:SF|sf|sq\.?\s*ft)\s+((?:General |Light )?(?:Industrial|Warehouse|Manufacturing|Distribution|Flex|Storage|Transportation|Office|Retail|Mixed Use|Hotel|Factory|Special Purpose))/i);
    if (sfTypeMatch) propertyInfo.propertyType = sfTypeMatch[1].trim();
  }

  // Lot size: "Lot Area Acres\n14.22 acres" or "14.22 Acres".
  const lotMatch = normalizedText.match(/Lot Area Acres\s+([\d.]+)\s*acres/i)
    || normalizedText.match(/([\d.]+)\s*Acres/i);
  if (lotMatch) propertyInfo.lotSize = lotMatch[1] + ' Acres';

  // Units: "Total Units\n2" or "N Units".
  const unitsMatch = normalizedText.match(/Total Units\s+(\d+)/i)
    || normalizedText.match(/(\d+)\s*Units?(?!\w)/i);
  if (unitsMatch) propertyInfo.units = unitsMatch[1];

  return { propertyAddress, propertyInfo };
}

/**
 * Visits properties from the current search results one at a time and
 * collects owner contacts.
 *
 * At most min(CONFIG.maxProperties, remaining daily allowance) properties are
 * attempted. Every attempt — including skipped and failed ones — is counted
 * against the daily allowance and persisted immediately.
 *
 * @param {string} searchId - Search ID used to return to the results page.
 * @returns {Promise<object[]>} One lead per property that had at least one
 *   owner with a phone number or e-mail address.
 */
async function scrapeProperties(searchId) {
  log('\n== Scraping properties ==');
  const dailyStats = getDailyStats();
  const remainingToday = CONFIG.maxDailyProperties - dailyStats.propertiesScraped;
  const maxThisRun = Math.min(CONFIG.maxProperties, remainingToday);
  log(` Daily: ${dailyStats.propertiesScraped} done, ${remainingToday} remaining, this run: max ${maxThisRun}`);
  // Written as !(x > 0) so a NaN budget is also treated as exhausted.
  if (!(maxThisRun > 0)) {
    log(' Daily limit reached!');
    return [];
  }

  // Counts one attempt and persists it right away. The skip paths used to
  // bump the counter without saving, so the count was lost if the run ended.
  const recordAttempt = () => {
    dailyStats.propertiesScraped++;
    saveDailyStats(dailyStats);
  };

  const leads = [];
  const processed = new Set(); // addresses already attempted

  // Dismiss onboarding banners.
  for (let i = 0; i < 5; i++) {
    const s = ab('snapshot -i', { verbose: false });
    const lg = s.output?.match(/button "Let's Go" \[ref=(e\d+)\]/);
    if (!lg) break;
    ab(`click @${lg[1]}`, { verbose: false });
    await sleep(800);
  }

  // One property per iteration, always working from the CURRENT page state.
  let scraped = 0;
  let scrollCount = 0;
  const maxScrolls = 10;
  while (scraped < maxThisRun && scrollCount <= maxScrolls) {
    const snap = ab('snapshot');
    const visible = parseSearchResultAddresses(snap.output);
    const next = visible.find(p => !processed.has(p.name));
    if (!next) {
      scrollCount++;
      log(` All visible processed, scrolling... (${scrollCount}/${maxScrolls})`);
      ab('scroll down 600', { verbose: false });
      await sleep(2000);
      continue;
    }

    processed.add(next.name);
    scraped++;
    log(`\n [${scraped}/${maxThisRun}] ${next.name.substring(0, 60)}`);
    try {
      // The ref is fresh from the snapshot taken just above.
      const clickResult = ab(`click @${next.ref}`);
      if (!clickResult.success) {
        log(' Click failed, skipping');
        recordAttempt();
        continue;
      }
      await randomDelay(5000, 8000);

      const propUrl = ab('eval "window.location.href"');
      const propertyId = propUrl.output?.match(/property\/([a-f0-9-]+)/)?.[1] || 'unknown';
      // Still on the search page means the click did not navigate.
      if (!propUrl.output?.includes('/property/')) {
        log(' Did not navigate to property page, skipping');
        recordAttempt();
        continue;
      }

      // Page text is capped at 5000 chars per read to avoid huge outputs.
      const pageTextResult = ab('eval "document.body.innerText.substring(0, 5000)"', { timeout: 15000 });
      const pageText = pageTextResult.output || '';
      let fullPageText = pageText;
      // The tab is named "Building & Lot"; "Building and Lot" is tried as an alternate.
      let blTabResult = ab('find role tab click --name "Building & Lot"', { timeout: 15000 });
      if (!blTabResult.success) {
        blTabResult = ab('find role tab click --name "Building and Lot"', { timeout: 10000 });
      }
      if (blTabResult.success) {
        await sleep(3000);
        const blText = ab('eval "document.body.innerText.substring(0, 5000)"', { timeout: 15000 });
        fullPageText = pageText + '\n' + (blText.output || '');
      }

      // Keep the page text of the first 2 properties for debugging.
      if (scraped <= 2) {
        fs.writeFileSync(`/tmp/reonomy-pagetext-${scraped}.txt`, fullPageText.substring(0, 10000));
        log(` Saved page text to /tmp/reonomy-pagetext-${scraped}.txt (${fullPageText.length} chars)`);
      }

      // agent-browser eval returns innerText with literal "\n" (backslash-n),
      // so turn those into real newlines before parsing.
      const normalizedText = fullPageText.replace(/\\n/g, '\n');
      const { propertyAddress, propertyInfo } = extractPropertyInfo(normalizedText, next.name);

      // Last resort for square footage: the interactive snapshot text.
      if (!propertyInfo.squareFootage) {
        const iSnap = ab('snapshot -i');
        const sfFallback = (iSnap.output || '').match(/(\d[\d,]*(?:\.\d+)?)\s*(?:SF|Sq\.?\s*Ft)/i);
        if (sfFallback) propertyInfo.squareFootage = sfFallback[1].replace(/,/g, '');
      }
      log(` ${propertyAddress} | ${JSON.stringify(propertyInfo)}`);

      log(' Opening Owner tab...');
      ab('find role tab click --name "Owner"');
      await randomDelay(4000, 6000);

      const ownerSnap = ab('snapshot -i');
      const owners = extractOwnersFromTab(ownerSnap.output || '');
      if (owners.length === 0) {
        log(' No contacts found');
      } else {
        const totalPhones = owners.reduce((s, o) => s + o.phones.length, 0);
        const totalEmails = owners.reduce((s, o) => s + o.emails.length, 0);
        log(` ${owners.length} owners, ${totalPhones} phones, ${totalEmails} emails`);
        leads.push({
          scrapeDate: new Date().toISOString(),
          propertyId,
          propertyAddress,
          ...propertyInfo,
          owners: owners.map(o => ({ name: o.name, phones: o.phones, emails: o.emails }))
        });
        dailyStats.leadsFound++;
        log(' Lead captured!');
      }
      recordAttempt();

      // Back to the search results.
      ab(`open "https://app.reonomy.com/!/search/${searchId}"`);
      await randomDelay(5000, 8000);
      // Occasionally take a longer pause.
      if (Math.random() < 0.15) {
        log(' Taking a short break...');
        await randomDelay(8000, 15000);
      }
    } catch (propError) {
      log(` Error: ${propError.message}`);
      ab(`open "https://app.reonomy.com/!/search/${searchId}"`);
      await sleep(6000);
      recordAttempt();
    }
  }
  return leads;
}
// ── MAIN ──
/**
 * Reads previously saved leads from `outputPath`.
 *
 * A missing file yields []. A file that cannot be read, is not valid JSON, or
 * whose `leads` is not an array is moved aside to
 * "<outputPath>.corrupt-<timestamp>" before [] is returned, so the caller's
 * subsequent write cannot silently destroy earlier results.
 */
function loadExistingLeads(outputPath) {
  if (!fs.existsSync(outputPath)) return [];

  let problem;
  try {
    const existing = JSON.parse(fs.readFileSync(outputPath, 'utf8'));
    const stored = existing?.leads;
    if (stored === undefined || stored === null) return [];
    if (Array.isArray(stored)) return stored;
    problem = '"leads" is not an array';
  } catch (err) {
    problem = err.message;
  }

  const backupPath = `${outputPath}.corrupt-${Date.now()}`;
  fs.renameSync(outputPath, backupPath);
  log(` ! Existing leads file unusable (${problem}); moved to ${backupPath}`);
  return [];
}

/**
 * Runs one full scrape: login, build the search, scrape properties, and
 * append the new leads to CONFIG.outputPath.
 *
 * The run log is truncated at the start. On failure a screenshot is taken and
 * the error is rethrown; the browser is closed in every case.
 *
 * @returns {Promise<object[]>} The leads scraped in this run (not the full file).
 */
async function main() {
  // Make sure the workspace exists before the first write, then clear the log.
  fs.mkdirSync(path.dirname(CONFIG.logPath), { recursive: true });
  fs.mkdirSync(path.dirname(CONFIG.outputPath), { recursive: true });
  fs.writeFileSync(CONFIG.logPath, '');

  log('=== Reonomy Scraper v14 ===');
  log(`Filters: state=${CONFIG.searchState}, types=${CONFIG.propertyTypes}, minSF=${CONFIG.minSF}, sale=${CONFIG.saleFilter}`);
  log(`Owner filters: phone=${CONFIG.ownerPhone}, email=${CONFIG.ownerEmail}`);
  log(`Max properties: ${CONFIG.maxProperties}`);
  try {
    log('\n== Step 1: Login ==');
    await login();

    const searchId = await buildSearch();
    const leads = await scrapeProperties(searchId);

    log('\n== Saving results ==');
    const allLeads = [...loadExistingLeads(CONFIG.outputPath), ...leads];
    const output = {
      lastUpdated: new Date().toISOString(),
      searchId,
      filters: {
        state: CONFIG.searchState,
        propertyTypes: CONFIG.propertyTypes,
        minSF: CONFIG.minSF,
        saleFilter: CONFIG.saleFilter,
        ownerPhone: CONFIG.ownerPhone,
        ownerEmail: CONFIG.ownerEmail,
      },
      totalLeads: allLeads.length,
      leads: allLeads
    };
    fs.writeFileSync(CONFIG.outputPath, JSON.stringify(output, null, 2));
    log(`Saved ${leads.length} new leads (${allLeads.length} total)`);
    return leads;
  } catch (error) {
    log(`\nFATAL: ${error.message}`);
    ab('screenshot /tmp/reonomy-v14-error.png');
    throw error;
  } finally {
    log('\nClosing browser...');
    ab('close');
  }
}
// Entry point: run the scraper, log the outcome, and map it onto the process
// exit code (0 on success, 1 on any failure).
(async () => {
  try {
    const leads = await main();
    log(`\nDone! ${leads.length} leads scraped.`);
    process.exit(0);
  } catch (err) {
    log(`\nFailed: ${err.message}`);
    process.exit(1);
  }
})();