#!/usr/bin/env node
|
||
/**
|
||
* Reonomy Scraper v10 - AGENT-BROWSER EDITION
|
||
*
|
||
* Key improvements over v9:
|
||
* - Uses agent-browser instead of Puppeteer (faster, more reliable)
|
||
* - State save/load for auth persistence (skip repeated login)
|
||
* - Extracts from BOTH "Builder and Lot" AND "Owner" tabs
|
||
* - Ref-based navigation for AI-friendly interaction
|
||
* - Semantic locators instead of fragile CSS selectors
|
||
*
|
||
* Usage:
|
||
* SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v10-agent-browser.js
|
||
* Or configure via environment variables
|
||
*/
|
||
|
||
const { spawn } = require('child_process');
const fs = require('fs');
const path = require('path');

// Configuration (all values overridable via environment variables).
// SECURITY NOTE(review): real credentials are hard-coded as fallbacks below;
// they should be removed and supplied strictly via the environment.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
// `let` (not `const`): the login flow may update this with the search id
// captured from the post-login URL.
let SEARCH_ID = process.env.REONOMY_SEARCH_ID || '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';
// BUG FIX: env vars are strings; coerce to a number (always pass a radix)
// so the property cap is a real integer rather than e.g. "20".
const MAX_PROPERTIES = Number.parseInt(process.env.MAX_PROPERTIES, 10) || 20;
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v10-agent-browser.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v10.log');

const STATE_FILE = path.join(__dirname, 'reonomy-auth-state.txt');
/**
 * Echo a message to stdout and append a timestamped copy to the log file.
 *
 * @param {string} message - Text to record.
 */
function log(message) {
  const stamped = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, stamped);
}
/**
 * Resolve after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Spawn one agent-browser CLI invocation and resolve with its trimmed stdout.
 *
 * @param {string[]} args - CLI arguments (subcommand first).
 * @param {string} [description] - Human-readable label for logging.
 * @returns {Promise<string>} Trimmed stdout when the process exits with 0.
 * @throws {Error} When the process exits non-zero (stderr is included) or
 *   fails to start at all.
 */
async function execAgentBrowser(args, description = '') {
  const command = 'agent-browser';

  log(`🔧 ${description}`);
  log(`   Command: ${[command, ...args].join(' ')}`);

  return new Promise((resolve, reject) => {
    // BUG FIX: the command name was previously duplicated into the argv list
    // (spawn(command, [command, ...args])), so every call effectively ran
    // `agent-browser agent-browser <args>`. spawn() takes only the args.
    const child = spawn(command, args);

    let stdout = '';
    let stderr = '';

    child.stdout.on('data', (data) => {
      stdout += data.toString();
    });

    child.stderr.on('data', (data) => {
      stderr += data.toString();
    });

    // Reject on spawn failure (e.g. binary not on PATH) instead of letting
    // the unhandled 'error' event crash the process.
    child.on('error', (err) => {
      log(`   ❌ Failed to start: ${err.message}`);
      reject(err);
    });

    child.on('close', (code) => {
      if (code === 0) {
        log(`   ✅ Success`);
        resolve(stdout.trim());
      } else {
        log(`   ❌ Failed (code ${code})`);
        if (stderr) {
          log(`   Error: ${stderr.trim()}`);
        }
        reject(new Error(`agent-browser failed with code ${code}: ${stderr.trim()}`));
      }
    });
  });
}
/**
 * Run an agent-browser command with `--json` appended and return the parsed
 * payload.
 *
 * @param {string[]} args - CLI arguments (without `--json`).
 * @param {string} [description] - Human-readable label for logging.
 * @returns {Promise<object|null>} Parsed JSON output, or null when stdout is
 *   not valid JSON.
 */
async function execAgentBrowserJson(args, description = '') {
  const raw = await execAgentBrowser([...args, '--json'], description);
  try {
    return JSON.parse(raw);
  } catch (parseError) {
    log(`   ⚠️ JSON parse error: ${parseError.message}`);
    return null;
  }
}
/**
 * Run an agent-browser command and report a heuristic success flag.
 *
 * @param {string[]} args - CLI arguments.
 * @param {string} [description] - Human-readable label for logging.
 * @returns {Promise<boolean>} True when stdout contains '✓', or when it does
 *   not contain the substring 'error'.
 */
async function execAgentBrowserSuccess(args, description = '') {
  const out = await execAgentBrowser(args, description);
  if (out.includes('✓')) {
    return true;
  }
  return !out.includes('error');
}
/**
 * Load a previously saved auth state, if one exists on disk.
 *
 * @returns {Promise<string|null>} Trimmed state file contents, or null when
 *   no state file has been written yet.
 */
async function loadAuthState() {
  if (!fs.existsSync(STATE_FILE)) {
    return null;
  }
  const saved = fs.readFileSync(STATE_FILE, 'utf8');
  log('🔑 Loading saved auth state...');
  log(`   State file: ${STATE_FILE}`);
  return saved.trim();
}
/**
 * Persist the auth state string to STATE_FILE so later runs can skip login.
 *
 * @param {string} state - Opaque auth state to save (currently the
 *   post-login URL captured by the main flow).
 */
async function saveAuthState(state) {
  fs.writeFileSync(STATE_FILE, state);
  log('🔑 Saved auth state to file');
  log(`   State file: ${STATE_FILE}`);
}
/**
 * Capture a screenshot of the current page into /tmp for debugging.
 *
 * @param {string} filename - Base name for the screenshot file.
 * @returns {Promise<string>} The path the screenshot was written to.
 */
async function takeScreenshot(filename) {
  // BUG FIX: the path was the literal string '/tmp/$(unknown)', ignoring the
  // filename argument entirely; interpolate the argument properly.
  const screenshotPath = `/tmp/${filename}`;
  const output = await execAgentBrowser(['screenshot', screenshotPath], 'Taking screenshot');
  if (output.includes('Saved')) {
    log(`   📸 Screenshot saved: ${screenshotPath}`);
  }
  return screenshotPath;
}
/**
 * Extract property details from the "Builder and Lot" tab of the current page.
 *
 * Combines an accessibility snapshot (for the address heading) with the raw
 * page text (for property type and square footage).
 *
 * @returns {Promise<{propertyAddress: string, city: string, state: string,
 *   zip: string, squareFootage: string, propertyType: string}>}
 */
async function extractBuilderLotData() {
  log('📊 Extracting Builder and Lot data...');

  // BUG FIX: execAgentBrowserJson already returns a parsed object (or null),
  // so the previous JSON.parse(snapshotResult) re-parse would throw.
  const snapshot = (await execAgentBrowserJson(['snapshot', '-i'], 'Get interactive elements')) || {};
  // NOTE(review): other call sites read snapshot.data.refs — confirm which
  // shape the agent-browser CLI actually emits.
  const refs = snapshot.refs || {};

  log(`   Found ${Object.keys(refs).length} interactive elements`);

  const propertyData = {
    propertyAddress: '',
    city: '',
    state: '',
    zip: '',
    squareFootage: '',
    propertyType: ''
  };

  // The address heading looks like "123 Main St, Springfield, NJ 07081".
  for (const element of Object.values(refs)) {
    if (element.role !== 'heading') continue;
    const addressMatch = element.name.match(/(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
    if (addressMatch) {
      // BUG FIX: capture groups are 1=street, 2=city, 3=state, 4=zip; the
      // previous code shifted city/state/zip down by one group.
      propertyData.propertyAddress = element.name.trim();
      propertyData.city = addressMatch[2]?.trim() || '';
      propertyData.state = addressMatch[3]?.trim() || '';
      propertyData.zip = addressMatch[4]?.trim() || '';
      log(`   📍 Address: ${element.name}`);
      break;
    }
  }

  // Property type and square footage come from the visible page text.
  const bodyTextResult = await execAgentBrowser(['eval', 'document.body.innerText'], 'Get body text');
  const bodyText = JSON.parse(bodyTextResult).result || '';

  const typePatterns = [
    'Warehouse', 'Office Building', 'Retail Stores', 'Industrial',
    'General Industrial', 'Medical Building', 'School', 'Religious',
    'Supermarket', 'Financial Building', 'Residential', 'Vacant Land',
    'Tax Exempt', 'Mixed Use'
  ];

  // First matching label wins.
  for (const type of typePatterns) {
    if (bodyText.includes(type)) {
      propertyData.propertyType = type;
      log(`   🏢 Property Type: ${type}`);
      break;
    }
  }

  // e.g. "12.5k SF" or "4500 SF".
  const sfMatch = bodyText.match(/(\d+\.?\d*\s*k?\s*SF)/i);
  if (sfMatch) {
    propertyData.squareFootage = sfMatch[0];
    log(`   📐 Square Footage: ${sfMatch[0]}`);
  }

  return propertyData;
}
/**
 * Extract owner contact details from the "Owner" tab of the current page.
 *
 * @returns {Promise<{ownerNames: string[], emails: string[], phones: string[]}>}
 */
async function extractOwnerData() {
  log('👤 Extracting Owner tab data...');

  // NOTE: the original also fetched an accessibility snapshot here but never
  // used it (and re-parsing it would throw); that dead call is removed.

  const ownerData = {
    ownerNames: [],
    emails: [],
    phones: []
  };

  const bodyTextResult = await execAgentBrowser(['eval', 'document.body.innerText'], 'Get body text');
  const bodyText = JSON.parse(bodyTextResult).result || '';

  // Owner names appear as "Owns N properties <Entity Name LLC>".
  // BUG FIX: the original regex literal had an unclosed capture group (a
  // SyntaxError), and String.match with /g discards capture groups; matchAll
  // keeps them so we can take just the entity name.
  const ownerPatterns = [
    /Owns\s+\d+\s+properties?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))/g
  ];

  for (const pattern of ownerPatterns) {
    for (const match of bodyText.matchAll(pattern)) {
      const owner = match[1];
      if (owner && owner.length > 3 && !ownerData.ownerNames.includes(owner)) {
        ownerData.ownerNames.push(owner);
      }
    }
  }

  // Phones via a user-provided CSS selector.
  // NOTE(review): jss* class names are build-specific and brittle; confirm
  // this selector still matches in production.
  const phoneScript = `Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2')).map(p => p.textContent.trim()).filter(text => text && text.length >= 10)`;
  const phoneResult = await execAgentBrowser(['eval', phoneScript], 'Extract phones');
  const phoneData = JSON.parse(phoneResult);

  if (phoneData.result && Array.isArray(phoneData.result)) {
    phoneData.result.forEach(phone => {
      // Strip spaces, dashes, and parentheses before de-duplicating.
      const cleanPhone = phone.replace(/[\s\-\(\)]/g, '');
      if (cleanPhone.length >= 10 && !ownerData.phones.includes(cleanPhone)) {
        ownerData.phones.push(cleanPhone);
      }
    });
    log(`   📞 Phones found: ${ownerData.phones.length}`);
  }

  // Emails via mailto/anchor hrefs.
  // BUG FIX: the original eval string opened with a backtick but "closed"
  // with a double quote, leaving the template literal unterminated (a
  // SyntaxError). Build the script in a properly closed template literal.
  const emailScript = `Array.from(document.querySelectorAll('a[href^="mailto:"], a[href*="@"]')).map(a => {
    const href = a.getAttribute('href');
    if (href && href.includes('mailto:')) {
      return href.replace('mailto:', '');
    } else if (href && href.includes('@')) {
      return href;
    }
    return '';
  }).filter(email => email && email.length > 3 && email.includes('@'))`;
  const emailResult = await execAgentBrowser(['eval', emailScript], 'Extract emails');
  const emailData = JSON.parse(emailResult);

  if (emailData.result && Array.isArray(emailData.result)) {
    const newEmails = emailData.result.filter(email => !ownerData.emails.includes(email));
    newEmails.forEach(email => ownerData.emails.push(email));
    log(`   📧 Emails found: ${ownerData.emails.length} (new: ${newEmails.length})`);
  }

  return ownerData;
}
/**
 * Main scraper: open the saved search, collect property ids from the result
 * links, then scrape "Builder and Lot" + "Owner" data for each property and
 * write the leads to OUTPUT_FILE.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When the search page yields no property links.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v10 (AGENT-BROWSER EDITION)...\n');

  // Step 1: report saved auth state (login itself happens in the entry point).
  const savedState = await loadAuthState();
  if (savedState) {
    log(`✅ Found saved auth state! Skipping login flow.`);
    log(`   Saved state: ${savedState.substring(0, 100)}...`);
  }

  // Step 2: navigate to the saved search by id.
  log('\n📍 Step 1: Navigating to search...');
  const searchUrl = `https://app.reonomy.com/#!/search/${SEARCH_ID}`;
  await execAgentBrowser(['open', searchUrl], 'Open search URL');
  await sleep(3000);

  // Step 3: collect property ids from the search result links.
  log('\n📍 Step 2: Extracting property IDs...');
  // BUG FIX: execAgentBrowserJson already returns a parsed object (or null);
  // the previous JSON.parse(snapshotResult) re-parse would throw.
  const snapshot = (await execAgentBrowserJson(['snapshot', '-c'], 'Get property links from search')) || {};

  const propertyIds = [];
  if (snapshot.data) {
    for (const element of Object.values(snapshot.data.refs || {})) {
      if (element.role === 'link') {
        const match = element.url?.match(/property\/([a-f0-9-]+)/);
        if (match) {
          propertyIds.push({
            id: match[1],
            url: `https://app.reonomy.com/#!/search/${SEARCH_ID}/property/${match[1]}`
          });
        }
      }
    }
  }

  log(`✅ Found ${propertyIds.length} property IDs`);

  if (propertyIds.length === 0) {
    log('⚠️ No property IDs found.');
    throw new Error('No properties found on search page.');
  }

  // Step 4: scrape each property (capped at MAX_PROPERTIES).
  const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
  log(`\n📍 Step 3: Processing ${propertiesToScrape.length} properties...\n`);

  const leads = [];

  for (let i = 0; i < propertiesToScrape.length; i++) {
    const prop = propertiesToScrape[i];
    log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

    // Navigate straight to the ownership page for this property.
    log(`   🔗 Navigating to ownership page...`);
    const ownershipUrl = `https://app.reonomy.com/#!/search/${SEARCH_ID}/property/${prop.id}/ownership`;
    await execAgentBrowser(['open', ownershipUrl], 'Open ownership URL');
    await sleep(8000); // the ownership page is slow to render

    log(`   📊 Extracting Builder and Lot data...`);
    const builderLotData = await extractBuilderLotData();

    log(`   👤 Extracting Owner tab data...`);
    const ownerData = await extractOwnerData();

    const lead = {
      scrapeDate: new Date().toISOString().split('T')[0],
      propertyId: prop.id,
      propertyUrl: ownershipUrl,
      ...builderLotData,
      ...ownerData,
      searchId: SEARCH_ID
    };

    log(`   📧 Emails: ${lead.emails.length}`);
    log(`   📞 Phones: ${lead.phones.length}`);
    log(`   👤 Owners: ${lead.ownerNames.length}`);
    log(`   📍 Address: ${lead.propertyAddress || 'N/A'}`);

    leads.push(lead);

    // Keep screenshots for the first three properties only (debugging aid).
    if (i < 3) {
      await takeScreenshot(`reonomy-v10-property-${i + 1}.png`);
    }
  }

  // Step 5: persist results.
  if (leads.length > 0) {
    log(`\n✅ Total leads scraped: ${leads.length}`);

    const outputData = {
      scrapeDate: new Date().toISOString(),
      searchId: SEARCH_ID,
      leadCount: leads.length,
      leads: leads
    };

    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
    log(`💾 Saved to: ${OUTPUT_FILE}`);

    // Also save the search id for reuse by later runs.
    fs.writeFileSync(path.join(__dirname, 'reonomy-search-id.txt'), SEARCH_ID);
    log(`💾 Search ID saved to: reonomy-search-id.txt`);
  } else {
    log('\n⚠️ No leads scraped.');
  }

  log('\n✅ Scraping complete!');
  return { leadCount: leads.length, outputFile: OUTPUT_FILE };
}
/**
 * Main entry point: perform the login flow when no saved auth state exists,
 * then run the scraper. Exits 0 on success, 1 on failure.
 */
(async () => {
  try {
    const savedState = await loadAuthState();

    if (!savedState) {
      log('\n🔐 Step 0: Logging in to Reonomy...');

      await execAgentBrowser(['open', 'https://app.reonomy.com/#!/login'], 'Open login page');
      await sleep(2000);

      // BUG FIX: execAgentBrowserJson already returns a parsed object (or
      // null); the previous JSON.parse re-parse would throw.
      const snapshot = (await execAgentBrowserJson(['snapshot', '-i'], 'Get login form')) || {};

      // Locate the login form controls by role/placeholder.
      let emailRef = null;
      let passwordRef = null;
      let loginButtonRef = null;

      if (snapshot.data && snapshot.data.refs) {
        for (const [ref, element] of Object.entries(snapshot.data.refs)) {
          const placeholder = (element.placeholder || '').toLowerCase();
          if (element.role === 'textbox' && placeholder.includes('email')) {
            emailRef = ref;
          } else if (element.role === 'textbox' && placeholder.includes('password')) {
            passwordRef = ref;
          } else if (element.role === 'button' && element.name && element.name.toLowerCase().includes('log in')) {
            loginButtonRef = ref;
          }
        }
      }

      if (!emailRef || !passwordRef || !loginButtonRef) {
        log('⚠️ Could not find login form elements');
        throw new Error('Login form not found');
      }

      // NOTE(review): interpolating credentials into an eval string breaks if
      // they contain quotes/backslashes; prefer the CLI's fill command.
      log('   📧 Filling email...');
      await execAgentBrowser(['eval', `document.querySelector('input[type="email"]').value = '${REONOMY_EMAIL}'`], 'Fill email');
      await sleep(500);

      log('   🔒 Filling password...');
      await execAgentBrowser(['eval', `document.querySelector('input[type="password"]').value = '${REONOMY_PASSWORD}'`], 'Fill password');
      await sleep(500);

      log('   🔑 Clicking login button...');
      await execAgentBrowser(['click', loginButtonRef], 'Click login button');

      log('   ⏳ Waiting for login to complete (15s)...');
      await sleep(15000);

      // Confirm we landed on a search page after the redirect.
      const urlCheckResult = await execAgentBrowser(['eval', 'window.location.href'], 'Check current URL');
      let urlCheck = null;
      try {
        urlCheck = JSON.parse(urlCheckResult);
      } catch {
        urlCheck = null;
      }

      // BUG FIX: in the original, the "Could not get current URL" else-branch
      // was attached to the savedState check, and the enclosing `try` had no
      // catch/finally at all (a SyntaxError). Guard clauses replace both.
      if (!urlCheck || !urlCheck.result) {
        log('⚠️ Could not get current URL');
        throw new Error('Could not confirm login state');
      }

      if (!urlCheck.result.includes('#!/search/')) {
        log('⚠️ Could not confirm login - URL does not match expected pattern');
        throw new Error('Login may have failed');
      }

      log('✅ Login successful!');

      const searchIdMatch = urlCheck.result.match(/#!\/search\/([a-f0-9-]+)/);
      if (searchIdMatch) {
        const currentSearchId = searchIdMatch[1];

        log(`🔑 Saving auth state...`);
        await saveAuthState(urlCheck.result);

        // An explicit env override wins over the captured id.
        const newSearchId = process.env.REONOMY_SEARCH_ID || currentSearchId;
        process.env.REONOMY_SEARCH_ID = newSearchId;
        SEARCH_ID = newSearchId;

        log(`📝 Search ID updated: ${newSearchId}`);

        // Persist the search id for reuse by later runs.
        fs.writeFileSync(path.join(__dirname, 'reonomy-search-id.txt'), newSearchId);
      }
    }

    await scrapeLeads();
    process.exit(0);
  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Best-effort screenshot of the error state; awaited so it is not a
    // floating promise, and its own failure must not mask the real error.
    try {
      await takeScreenshot('reonomy-v10-error.png');
    } catch {
      // ignore screenshot failures while already handling an error
    }

    process.exit(1);
  }
})();