clawdbot-workspace/reonomy-scraper-v2.js

#!/usr/bin/env node

/**
 * Reonomy Lead Scraper v2
 *
 * Improved scraper with better data extraction from dashboard
 * and search results.
 */

const puppeteer = require('puppeteer');
const { execSync } = require('child_process');
const fs = require('fs');
const path = require('path');

// Configuration from environment variables
// REONOMY_EMAIL / REONOMY_PASSWORD: login credentials typed into the Reonomy
// login form by scrapeLeads (both required, validated below).
const REONOMY_EMAIL = process.env.REONOMY_EMAIL;
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD;
// REONOMY_SHEET_ID: when set, getOrCreateSheet reuses this spreadsheet
// instead of creating a new one.
const SHEET_ID = process.env.REONOMY_SHEET_ID;
// REONOMY_SHEET_TITLE: title passed to `gog sheets create` for a new sheet.
const SHEET_TITLE = process.env.REONOMY_SHEET_TITLE || 'Reonomy Leads';
// REONOMY_LOCATION: in this file it is only recorded as metadata in the JSON
// export written by saveToJsonFile.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'New York, NY';
// True only when HEADLESS is exactly the string 'true'; any other value
// (including unset) launches a visible browser window.
const HEADLESS = process.env.HEADLESS === 'true';

// Validate credentials
// Runs at module load: without both credentials the script prints usage help
// to stderr and exits with status 1 before any browser is started.
if (!REONOMY_EMAIL || !REONOMY_PASSWORD) {
  console.error('❌ Error: REONOMY_EMAIL and REONOMY_PASSWORD environment variables are required.');
  console.error('   Set them like: REONOMY_EMAIL="..." REONOMY_PASSWORD="..." node reonomy-scraper-v2.js');
  process.exit(1);
}

// Log file
// Lives next to this script; log() appends to it on every call.
const LOG_FILE = path.join(__dirname, 'reonomy-scraper.log');

/**
 * Report a message: print it to stdout as-is and append a copy, prefixed with
 * an ISO-8601 timestamp, to LOG_FILE.
 *
 * @param {string} message - Text to report.
 */
function log(message) {
  // Build the file entry first so its timestamp is taken before any output.
  const entry = `[${new Date().toISOString()}] ${message}\n`;

  console.log(message);
  fs.appendFileSync(LOG_FILE, entry);
}

/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the delay has elapsed.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}

/**
 * Run the `gog` CLI synchronously and return its trimmed stdout.
 *
 * The command line is `gog <command>`, or `gog --account "<GOG_ACCOUNT>"
 * <command>` when the GOG_ACCOUNT environment variable is set. It is executed
 * with a 30 second timeout.
 *
 * A failed invocation is still treated as a success when it produced stdout
 * and its stderr does not contain "error" or "Error"; in that case the
 * trimmed stdout is returned.
 *
 * @param {string} command - Arguments appended verbatim after `gog`.
 * @returns {string} Trimmed stdout of the command.
 * @throws {Error} `gog command failed: …` carrying stderr, stdout or
 *   "Unknown error" when the invocation fails without usable output.
 */
function gogCommand(command) {
  try {
    const account = process.env.GOG_ACCOUNT;
    const fullCommand = account
      ? `gog --account "${account}" ${command}`
      : `gog ${command}`;

    const stdout = execSync(fullCommand, {
      encoding: 'utf-8',
      timeout: 30000,
      stdio: ['pipe', 'pipe', 'pipe']
    });

    return (stdout || '').trim();
  } catch (error) {
    // Anything that reports a zero status is passed through untouched.
    if (error.status === 0) {
      throw error;
    }

    const errText = error.stderr ? error.stderr.toString() : '';
    const outText = error.stdout ? error.stdout.toString() : '';
    const stderrMentionsError = errText.includes('error') || errText.includes('Error');

    // Non-zero exit but usable output and no error text on stderr: accept it.
    const usableOutput = outText.trim();
    if (usableOutput && !stderrMentionsError) {
      return usableOutput;
    }

    const detail = stderrMentionsError
      ? errText
      : (errText || outText || 'Unknown error');
    throw new Error(`gog command failed: ${detail}`);
  }
}

/**
 * Get or create Google Sheet
 *
 * Returns SHEET_ID when it is configured. Otherwise asks the `gog` CLI to
 * create a spreadsheet titled SHEET_TITLE and extracts the new sheet's ID
 * from the CLI output: first from the JSON fields `spreadsheetId` / `id`,
 * then, if those are missing or the output is not JSON, from the first run of
 * 20+ ID-like characters in the raw output.
 *
 * Creation problems are never thrown to the caller; they are logged and the
 * function resolves to null so the caller can fall back to the JSON file.
 *
 * @returns {Promise<string|null>} The sheet ID, or null when no sheet could
 *   be created or its ID could not be determined.
 */
async function getOrCreateSheet() {
  log('📊 Checking Google Sheets...');

  if (SHEET_ID) {
    log(`✅ Using existing sheet: ${SHEET_ID}`);
    return SHEET_ID;
  }

  try {
    log('📝 Creating new Google Sheet...');
    const output = gogCommand(`sheets create "${SHEET_TITLE}" --json`);

    let newSheetId;
    try {
      const result = JSON.parse(output);
      // `result` may legitimately be null or a primitive for odd CLI output.
      newSheetId = result && (result.spreadsheetId || result.id);
    } catch (parseError) {
      // Output is not JSON; the raw-text scan below handles it.
    }

    if (!newSheetId) {
      // Covers plain-text output and JSON that lacks a top-level ID field.
      const match = output.match(/([0-9A-Za-z_-]{20,})/);
      newSheetId = match ? match[1] : null;
    }

    if (!newSheetId) {
      // Never report success with an undefined ID.
      throw new Error('Could not parse sheet ID from gog output');
    }

    log(`✅ Created new sheet: ${newSheetId}`);
    return newSheetId;
  } catch (error) {
    log(`⚠️  Could not create Google Sheet: ${error.message}`);
    log('💾 Leads will be saved to JSON file instead');
    return null;
  }
}

/**
 * Write the lead column titles into the sheet, starting at cell Sheet1!A1.
 *
 * The titles are passed to `gog sheets update` as separate double-quoted
 * arguments. A failure is logged as a warning and not rethrown.
 *
 * @param {string} sheetId - ID of the spreadsheet to initialize.
 * @returns {Promise<void>}
 */
async function initializeSheet(sheetId) {
  log('📋 Initializing sheet headers...');

  // One title per column, A through N.
  const columnTitles = [
    'Scrape Date', 'Owner Name', 'Property Address',
    'City', 'State', 'ZIP',
    'Property Type', 'Square Footage',
    'Owner Location', 'Property Count',
    'Property URL', 'Owner URL',
    'Email', 'Phone'
  ];

  const quotedTitles = [];
  for (const title of columnTitles) {
    quotedTitles.push(`"${title}"`);
  }

  try {
    const command = `sheets update ${sheetId} "Sheet1!A1" ${quotedTitles.join(' ')}`;
    gogCommand(command);
    log('✅ Sheet headers initialized');
  } catch (err) {
    log(`⚠️  Could not set headers: ${err.message}`);
  }
}

/**
 * Quote a single value so the shell passes it to `gog` as exactly one
 * argument, unchanged.
 *
 * null/undefined become an empty (but still present) argument so the
 * following cells keep their column position. On POSIX shells the value is
 * wrapped in single quotes, inside which nothing is interpreted; an embedded
 * single quote is written as '\''. On Windows the previous double-quote
 * style is kept because cmd.exe does not treat single quotes as quoting.
 */
function quoteShellArg(value) {
  const text = value === null || value === undefined ? '' : String(value);

  if (process.platform === 'win32') {
    return `"${text.replace(/"/g, '""')}"`;
  }
  return `'${text.replace(/'/g, "'\\''")}'`;
}

/**
 * Append row to Google Sheet or save to JSON file
 *
 * With a sheet ID, the values of `rowData` (in the object's own key order)
 * are appended as one row to Sheet1!A:N through the `gog` CLI. The values
 * come from scraped page content, so each one is shell-quoted before it is
 * placed on the command line. A CLI failure is logged and not rethrown.
 *
 * Without a sheet ID, the row is pushed onto the in-memory `jsonLeads` list.
 *
 * @param {string|null|undefined} sheetId - Target spreadsheet, or a falsy
 *   value to collect the row in memory instead.
 * @param {Object} rowData - Lead record; `ownerName` and `propertyAddress`
 *   are used for the log line.
 * @returns {Promise<void>}
 */
async function appendToSheet(sheetId, rowData) {
  if (sheetId) {
    const values = Object.values(rowData).map(quoteShellArg).join(' ');

    try {
      gogCommand(`sheets append ${sheetId} "Sheet1!A:N" ${values}`);
      log(`✅ Added: ${rowData.ownerName || 'N/A'} - ${rowData.propertyAddress}`);
    } catch (error) {
      log(`❌ Error appending to sheet: ${error.message}`);
    }
  } else {
    jsonLeads.push(rowData);
    log(`✅ Collected: ${rowData.ownerName || 'N/A'} - ${rowData.propertyAddress}`);
  }
}

/**
 * Write the collected leads to `reonomy-leads.json` next to this script.
 *
 * The file holds an envelope with the time of writing, the number of leads,
 * the configured SEARCH_LOCATION and the leads themselves, pretty-printed
 * with two-space indentation. Serialization or write errors are logged and
 * reported through the return value instead of being thrown.
 *
 * @param {Array<Object>} leads - Lead records to persist.
 * @returns {string|null} Path of the written file, or null on failure.
 */
function saveToJsonFile(leads) {
  const outputPath = path.join(__dirname, 'reonomy-leads.json');
  const envelope = {
    scrapeDate: new Date().toISOString(),
    leadCount: leads.length,
    location: SEARCH_LOCATION,
    leads
  };

  let savedPath = null;
  try {
    const serialized = JSON.stringify(envelope, null, 2);
    fs.writeFileSync(outputPath, serialized);
    log(`💾 Saved ${leads.length} leads to ${outputPath}`);
    savedPath = outputPath;
  } catch (error) {
    log(`❌ Error saving to JSON: ${error.message}`);
  }
  return savedPath;
}

// Leads collected in memory when no Google Sheet is available: appendToSheet
// pushes rows here, and scrapeLeads hands the list to saveToJsonFile.
let jsonLeads = [];

/**
 * Extract property addresses and details from dashboard
 *
 * Scans every link whose href contains "/property/" and keeps those whose
 * text starts with "<street>, <city>, <ST> <ZIP>". Any text after the address
 * is split into a square-footage token (e.g. "12.5k SF", "12,500 SF") and a
 * property type (whatever remains).
 *
 * @param {Object} page - Puppeteer page currently showing the dashboard; only
 *   `page.evaluate` is used.
 * @returns {Promise<Array<Object>>} One lead record per recognized property
 *   link, with the owner-related fields left empty.
 */
async function extractPropertiesFromDashboard(page) {
  log('🔍 Extracting property data from dashboard...');

  // The callback runs inside the browser page, so it must be self-contained:
  // it cannot reference variables declared in this file.
  const properties = await page.evaluate(() => {
    // Street (starts with a number), city (letters plus . ' - for names such
    // as "St. Louis"), two-letter state, five-digit ZIP. An optional "+4"
    // ZIP suffix is consumed so it does not leak into the trailing text.
    const addressPattern = /^(\d+.+),\s*([A-Za-z\s.'-]+),\s*([A-Z]{2})\s*(\d{5})(?:-\d{4})?/;
    const results = [];

    // Find all property links
    const propertyLinks = Array.from(document.querySelectorAll('a[href*="/property/"]'));

    propertyLinks.forEach((link) => {
      const text = (link.innerText || link.textContent || '').trim();
      const addressMatch = text.match(addressPattern);

      if (!addressMatch) {
        return;
      }

      results.push({
        fullText: text,
        address: addressMatch[1].trim(),
        city: addressMatch[2].trim(),
        state: addressMatch[3].trim(),
        zip: addressMatch[4].trim(),
        url: link.href,
        remainingText: text.substring(addressMatch[0].length).trim()
      });
    });

    return results;
  });

  const scrapeDate = new Date().toISOString().split('T')[0];

  // A number with optional thousands separators and decimals, an optional
  // "k" multiplier, then "SF". Grouped digits are matched as a whole so that
  // "12,500 SF" is not truncated to "500 SF".
  const squareFootagePattern = /(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d*)?\s*k?\s*SF/i;

  const leads = properties.map((prop) => {
    const sqFtMatch = prop.remainingText.match(squareFootagePattern);
    const sqFt = sqFtMatch ? sqFtMatch[0] : '';
    // Whatever is left once the square footage is removed is the type label.
    const propertyType = prop.remainingText.replace(sqFt, '').trim();

    return {
      scrapeDate,
      ownerName: '',
      propertyAddress: prop.address,
      city: prop.city,
      state: prop.state,
      zip: prop.zip,
      propertyType,
      squareFootage: sqFt,
      ownerLocation: '',
      propertyCount: '',
      propertyUrl: prop.url,
      ownerUrl: '',
      email: '',
      phone: ''
    };
  });

  log(`✅ Extracted ${leads.length} properties`);
  return leads;
}

/**
 * Extract owner data from dashboard
 *
 * Scans every link whose href contains "/person/". The first non-empty line
 * of the link text is the owner name; the remaining lines are searched for a
 * location (first line containing a comma) and a property count ("N
 * propert…"). The name line is deliberately excluded from both searches so
 * that names such as "ACME 88 Properties, LLC" are not mistaken for a
 * location or a count. Links with fewer than two non-empty lines are skipped.
 *
 * @param {Object} page - Puppeteer page currently showing the dashboard; only
 *   `page.evaluate` is used.
 * @returns {Promise<Array<Object>>} One lead record per recognized owner
 *   link, with the property-related fields left empty.
 */
async function extractOwnersFromDashboard(page) {
  log('🔍 Extracting owner data from dashboard...');

  // The callback runs inside the browser page, so it must be self-contained:
  // it cannot reference variables declared in this file.
  const owners = await page.evaluate(() => {
    const results = [];

    const ownerLinks = Array.from(document.querySelectorAll('a[href*="/person/"]'));

    ownerLinks.forEach((link) => {
      const text = (link.innerText || link.textContent || '').trim();

      // Pattern: Owner name\nOwns X properties Location
      const lines = text.split('\n').map(l => l.trim()).filter(l => l);

      if (lines.length < 2) {
        return;
      }

      const ownerName = lines[0];
      // Only the lines after the name describe the owner.
      const detailLines = lines.slice(1);
      const location = detailLines.find(l => l.includes(',')) || '';
      const propertyCountMatch = detailLines.join('\n').match(/(\d+)\s*propert/i);
      const propertyCount = propertyCountMatch ? propertyCountMatch[1] : '';

      results.push({
        ownerName,
        location,
        propertyCount,
        url: link.href,
        fullText: text
      });
    });

    return results;
  });

  const scrapeDate = new Date().toISOString().split('T')[0];
  const leads = [];

  for (const owner of owners) {
    // Parse location more carefully - extract city and state
    // Format is: "Owns X properties City, State" or just "City, State"
    let city = '';
    let state = '';

    if (owner.location.includes(',')) {
      const parts = owner.location.split(',').map(p => p.trim());
      const lastPart = parts[parts.length - 1];

      // If the last part is a state (2 uppercase letters), use it
      if (parts.length >= 2 && /^[A-Z]{2}$/.test(lastPart)) {
        state = lastPart;
        // The city is the trailing run of capitalized words in the
        // second-to-last part, which drops an "Owns X properties" prefix.
        const cityWithPrefix = parts[parts.length - 2];
        const cityMatch = cityWithPrefix.match(/(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)$/);
        city = cityMatch ? cityMatch[1] : '';
      } else if (parts.length === 2) {
        city = parts[0];
        state = parts[1];
      }
    }

    leads.push({
      scrapeDate,
      ownerName: owner.ownerName,
      propertyAddress: '',
      city,
      state,
      zip: '',
      propertyType: '',
      squareFootage: '',
      ownerLocation: owner.location,
      propertyCount: owner.propertyCount,
      propertyUrl: '',
      ownerUrl: owner.url,
      email: '',
      phone: ''
    });
  }

  log(`✅ Extracted ${leads.length} owners`);
  return leads;
}

/**
 * Main scraper function
 *
 * Steps:
 *   1. Launch a browser and resolve the output target (Google Sheet, or the
 *      JSON file when no sheet is available).
 *   2. Log into Reonomy with REONOMY_EMAIL / REONOMY_PASSWORD.
 *   3. Open the dashboard and extract property and owner leads from it.
 *   4. Append every lead to the sheet, or collect them and write the JSON
 *      file.
 *
 * The browser is always closed, including when page setup itself fails. On
 * any error a screenshot is attempted (best effort) and the error is
 * rethrown.
 *
 * @returns {Promise<{sheetId: (string|null|undefined), leadCount: number}>}
 *   The sheet that received the leads (falsy when the JSON file was used) and
 *   the number of leads extracted.
 * @throws {Error} When login fails or any browser/navigation step fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Lead Scraper v2...\n');

  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });

  // Declared outside the try so the catch block can tell whether a page
  // exists; created inside it so a setup failure still reaches `finally`
  // and the browser process is not leaked.
  let page;
  let sheetId;

  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Setup Google Sheet
    sheetId = await getOrCreateSheet();

    if (sheetId) {
      try {
        const existingData = gogCommand(`sheets get ${sheetId} "Sheet1!A1:N1" --plain`);
        if (!existingData.includes('Owner Name')) {
          await initializeSheet(sheetId);
        }
      } catch (error) {
        // Reading the header row failed; try to write the headers anyway.
        await initializeSheet(sheetId);
      }
    } else {
      log('💾 Will save leads to: reonomy-leads.json');
    }

    // Login to Reonomy
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    await sleep(2000);

    // The login form is rendered by script after navigation; wait for it
    // rather than assuming the fixed pause above was long enough.
    await page.waitForSelector('input[type="email"]', { timeout: 30000 });

    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });

    await page.click('button[type="submit"]');
    log('⏳ Logging in...');

    await sleep(8000);

    // Still being on a login/auth URL after the wait means the login failed.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Navigate to home/dashboard to extract recent data
    log('\n📍 Step 2: Navigating to dashboard...');
    await page.goto('https://app.reonomy.com/#!/home', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    await sleep(3000);
    log('✅ On dashboard');

    // Extract leads
    log('\n📍 Step 3: Extracting lead data...');
    const allLeads = [];

    // Extract properties
    const properties = await extractPropertiesFromDashboard(page);
    allLeads.push(...properties);

    // Extract owners
    const owners = await extractOwnersFromDashboard(page);
    allLeads.push(...owners);

    log(`\n✅ Total leads extracted: ${allLeads.length}`);

    // Path returned by saveToJsonFile; stays null when nothing was written.
    let savedFile = null;

    if (allLeads.length === 0) {
      log('\n⚠️  No leads found. Taking screenshot for debugging...');
      await page.screenshot({ path: '/tmp/reonomy-no-leads.png', fullPage: true });
      log('📸 Screenshot saved: /tmp/reonomy-no-leads.png');
    } else {
      // Save leads
      log('\n📍 Step 4: Saving leads...');

      for (const lead of allLeads) {
        await appendToSheet(sheetId, lead);
        // Short pause between rows to pace the sheet updates.
        await sleep(500);
      }

      if (!sheetId && jsonLeads.length > 0) {
        savedFile = saveToJsonFile(jsonLeads);
      }
    }

    log('\n✅ Scraping complete!');
    if (sheetId) {
      log(`📊 Google Sheet: https://docs.google.com/spreadsheets/d/${sheetId}`);
    } else if (savedFile) {
      log('💾 Leads saved to: reonomy-leads.json');
    } else {
      // Do not claim a file was written when there was nothing to save or
      // the write failed.
      log('💾 No leads were written to reonomy-leads.json');
    }
    log(`📝 Log file: ${LOG_FILE}`);

    return { sheetId, leadCount: allLeads.length };

  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Best-effort debugging aid; only possible once a page exists.
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-error.png');
      } catch (screenshotError) {
        log(`⚠️  Could not capture error screenshot: ${screenshotError.message}`);
      }
    }

    throw error;

  } finally {
    await browser.close();
    log('\n🔚 Browser closed');
  }
}

// Run scraper
// Entry point: run one scrape, report the outcome, and exit with status 0 on
// success or 1 on failure.
(async () => {
  try {
    const result = await scrapeLeads();

    log(`\n🎉 Success! ${result.leadCount} leads scraped.`);
    if (result.sheetId) {
      console.log(`\n📊 View your leads at: https://docs.google.com/spreadsheets/d/${result.sheetId}`);
    }
    process.exit(0);
  } catch (error) {
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
})();