clawdbot-workspace/reonomy-scraper-v12-fresh.js

#!/usr/bin/env node
/**
 * Reonomy Scraper v12 - FRESH START - CLEAN SLATE
 *
 * Proven foundation from v9 (Puppeteer)
 * Fixed email/phone extraction (no complex regex)
 * Extracts from BOTH Builder and Lot AND Owner tabs
 * Uses direct ownership URLs (from research)
 *
 * Key improvements over v9:
 * - Moved email/phone extraction BEFORE return statement (now executes!)
 * - Simplified regex patterns (avoids syntax errors)
 * - Added Builder and Lot tab extraction
 * - Uses your CSS selector for phones: p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2
 * - Uses direct ownership URL navigation (no property card clicking)
 *
 * Usage:
 *   REONOMY_SEARCH_ID="504a2d13-d88f-4213-9ac6-a7c8bc7c20c6" node reonomy-scraper-v12-fresh.js
 *   Optional overrides (environment variables): REONOMY_EMAIL, REONOMY_PASSWORD, MAX_PROPERTIES, HEADLESS=false
 */

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

// Configuration
//
// Every setting can be overridden through an environment variable; the
// literals below are only fallbacks.

/**
 * Parse a positive integer setting.
 *
 * @param {string|undefined} value - Raw value, typically from process.env.
 * @param {number} fallback - Returned when value is missing, non-numeric or <= 0.
 * @returns {number} The parsed integer, or the fallback.
 */
function parsePositiveInt(value, fallback) {
  const parsed = Number.parseInt(value, 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// SECURITY(review): account credentials are committed here as fallbacks. They are
// kept so existing invocations keep working, but they should be rotated and
// supplied only through REONOMY_EMAIL / REONOMY_PASSWORD.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
// The shorter SEARCH_ID name is accepted as an alias for REONOMY_SEARCH_ID.
const SEARCH_ID = process.env.REONOMY_SEARCH_ID || process.env.SEARCH_ID || '504a2d13-d88f-4213-9ac6-a7c8bc7c20c6';
// Environment values are strings; parse so the limit is always a usable number.
const MAX_PROPERTIES = parsePositiveInt(process.env.MAX_PROPERTIES, 20);
// Headless unless HEADLESS is explicitly the string "false".
const HEADLESS = process.env.HEADLESS !== 'false';

// Both artifacts (leads JSON and run log) are written next to this script.
const [OUTPUT_FILE, LOG_FILE] = [
  'reonomy-leads-v12-fresh.json',
  'reonomy-scraper-v12.log'
].map((fileName) => path.join(__dirname, fileName));

/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * @param {string} message - Text to report; the console copy carries no timestamp.
 */
function log(message) {
  const stampedLine = `[${new Date().toISOString()}] ${message}\n`;

  console.log(message);
  // Synchronous append so entries land in the file in call order.
  fs.appendFileSync(LOG_FILE, stampedLine);
}

/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}

/**
 * Extract data from Builder and Lot tab.
 *
 * Scans the visible text of the current page for the first square-footage
 * figure and for the first known property-type label it contains.
 *
 * @param {object} page - Puppeteer page that is already showing the property.
 * @returns {Promise<{squareFootage: string, propertyType: string}>}
 *   Each field is an empty string when nothing matched.
 */
async function extractBuilderLotData(page) {
  log('📊 Extracting Builder and Lot data...');

  const data = await page.evaluate(() => {
    // Runs inside the browser: nothing from the Node scope is available here.
    const bodyText = document.body.innerText;

    // Thousands separators are allowed so "12,500 SF" is not cut down to "500 SF".
    const sfMatch = bodyText.match(/\d[\d,]*\.?\d*\s*k?\s*SF/i);

    // Labels are tried in list order (not page order). A label that contains
    // another one must come first, otherwise 'Industrial' would always win
    // over 'General Industrial'.
    const typePatterns = [
      'Warehouse', 'Office Building', 'Retail Stores', 'General Industrial',
      'Industrial', 'Medical Building', 'School', 'Religious',
      'Supermarket', 'Financial Building', 'Residential', 'Vacant Land',
      'Tax Exempt', 'Mixed Use'
    ];
    const matchedType = typePatterns.find((type) => bodyText.includes(type));

    return {
      squareFootage: sfMatch ? sfMatch[0] : '',
      propertyType: matchedType || ''
    };
  });

  log(`  📐 Square Footage: ${data.squareFootage}`);
  log(`  🏢 Property Type: ${data.propertyType}`);

  return data;
}

/**
 * Extract data from Owner tab (CRITICAL - emails + phones).
 *
 * Collects, without duplicates:
 *  - emails from `mailto:` links and from email-shaped text on the page,
 *  - phone numbers from paragraphs matching the phone CSS selector,
 *  - owner names from lines shaped like "Owns <n> properties <Name>".
 *
 * @param {object} page - Puppeteer page that is already showing the ownership view.
 * @returns {Promise<{emails: string[], phones: string[], ownerNames: string[]}>}
 */
async function extractOwnerTabData(page) {
  log('👤 Extracting Owner tab data...');

  const data = await page.evaluate(() => {
    // Runs inside the browser: nothing from the Node scope is available here.
    const result = {
      emails: [],
      phones: [],
      ownerNames: []
    };

    const addUnique = (list, value) => {
      if (value && !list.includes(value)) {
        list.push(value);
      }
    };

    // Emails from mailto: links. Drop the scheme and any "?subject=..." query
    // so only the address itself is stored.
    const mailtoLinks = Array.from(document.querySelectorAll('a[href^="mailto:"]'));
    mailtoLinks.forEach((a) => {
      const email = a.href.replace('mailto:', '').split('?')[0].trim();
      if (email.length > 5) {
        addUnique(result.emails, email);
      }
    });

    // Emails that only appear as plain text.
    const bodyText = document.body.innerText;
    const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
    (bodyText.match(emailPattern) || []).forEach((email) => {
      addUnique(result.emails, email);
    });

    // Phones via the CSS selector taken from manual inspection.
    // NOTE(review): the jssNNNN class names look auto-generated and may differ
    // between site builds — confirm the selector still matches.
    const phoneElements = Array.from(document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2'));
    phoneElements.forEach((p) => {
      // Strip whitespace, dashes and parentheses.
      const cleanPhone = p.textContent.trim().replace(/[\s\-\(\)]/g, '');
      // Require 10+ actual digits so unrelated text under the same selector
      // is not stored as a phone number.
      const digitCount = cleanPhone.replace(/\D/g, '').length;
      if (digitCount >= 10) {
        addUnique(result.phones, cleanPhone);
      }
    });

    // Owner names: capture group 1 is the property count, group 2 is the name.
    for (const line of bodyText.split('\n')) {
      const ownerMatch = line.match(/Owns\s+(\d+)\s+properties?\s*([A-Z][a-z]+)/i);
      if (!ownerMatch) {
        continue;
      }
      const owner = ownerMatch[2].trim();
      if (owner.length > 3) {
        addUnique(result.ownerNames, owner);
      }
    }

    return result;
  });

  log(`  📧 Emails: ${data.emails.length} found`);
  log(`  📞 Phones: ${data.phones.length} found`);
  log(`  👤 Owners: ${data.ownerNames.length} found`);

  return data;
}

/**
 * Main scraper.
 *
 * Logs into Reonomy, opens the configured search, collects the property IDs
 * linked from the results page, then visits each property's ownership URL and
 * merges the Builder/Lot and Owner extraction results into one lead per
 * property. Leads are written to OUTPUT_FILE as JSON.
 *
 * A property that fails to load or extract is logged and skipped so leads
 * already collected are not lost. The browser is always closed; setting the
 * process exit code is left to the caller.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>}
 * @throws {Error} When login fails, the search ID cannot be read from the URL,
 *   no property links are found, or every property fails to scrape.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v12 (FRESH START)...\n');

  // Launch browser
  const browser = await puppeteer.launch({
    headless: HEADLESS ? 'new' : false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080']
  });

  // Declared outside the try so the catch block can attempt an error screenshot.
  let page;

  try {
    // Inside the try so the browser is still closed if page creation fails.
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Login to Reonomy
    log('\n📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    await sleep(2000);

    // Wait for the form instead of assuming it rendered within the fixed delay.
    await page.waitForSelector('input[type="email"]', { timeout: 30000 });
    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');

    log('⏳ Waiting for login...');
    await sleep(15000);

    // Still on a login/auth URL means the credentials were not accepted.
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto(`https://app.reonomy.com/#!/search/${SEARCH_ID}`, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    await sleep(3000);

    // Read the search ID back from the URL the page actually ended up on.
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs
    log('\n📍 Step 3: Extracting property IDs...');
    // The callback is serialized and executed in the browser, so it cannot see
    // Node variables: the search ID has to be passed in as an argument.
    const propertyIds = await page.evaluate((currentSearchId) => {
      const seen = new Set();
      const ids = [];

      document.querySelectorAll('a[href*="/property/"]').forEach((link) => {
        const match = link.href.match(/property\/([a-f0-9-]+)/);
        // Several links can point at the same property; keep each ID once.
        if (match && !seen.has(match[1])) {
          seen.add(match[1]);
          ids.push({
            id: match[1],
            url: `https://app.reonomy.com/#!/search/${currentSearchId}/property/${match[1]}`
          });
        }
      });

      return ids;
    }, searchId);

    log(`✅ Found ${propertyIds.length} property IDs`);

    if (propertyIds.length === 0) {
      throw new Error('No properties found on search page.');
    }

    // Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);
    log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);

    const leads = [];
    let failedCount = 0;

    for (let i = 0; i < propertiesToScrape.length; i++) {
      const prop = propertiesToScrape[i];

      log(`\n[${i + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      try {
        // Navigate directly to ownership page (from research - no clicking property cards)
        const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;
        log(`  🔗 Navigating to ownership page...`);

        await page.goto(ownershipUrl, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });

        // Wait for page to load
        log(`  ⏳ Waiting for Owner tab to load...`);
        await sleep(5000);

        // Extract from Builder and Lot tab
        log(`  📊 Extracting Builder and Lot data...`);
        const builderLotData = await extractBuilderLotData(page);

        // Wait a bit before extracting from Owner tab
        await sleep(1000);

        // Extract from Owner tab (CRITICAL: emails + phones)
        log(`  👤 Extracting Owner tab data...`);
        const ownerData = await extractOwnerTabData(page);

        const lead = {
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: ownershipUrl,
          ...builderLotData,
          ...ownerData
        };

        log(`  📧 Emails: ${lead.emails.length} found`);
        log(`  📞 Phones: ${lead.phones.length} found`);
        log(`  👤 Owners: ${lead.ownerNames.length} found`);
        // Nothing in this function sets `address`; it is shown only if an extractor returns one.
        log(`  📍 Address: ${lead.address || 'N/A'}`);
        log(`  🏢 Property Type: ${lead.propertyType || 'N/A'}`);
        log(`  📐 Square Footage: ${lead.squareFootage || 'N/A'}`);

        leads.push(lead);
      } catch (propertyError) {
        // One bad property must not discard the leads collected so far.
        failedCount += 1;
        log(`  ⚠️  Skipping property ${prop.id}: ${propertyError.message}`);
        continue;
      }

      // Screenshot for debugging (first 3 properties only); best effort.
      if (i < 3) {
        const screenshotPath = `/tmp/reonomy-v12-property-${i + 1}.png`;
        try {
          await page.screenshot({ path: screenshotPath, fullPage: false });
          log(`  📸 Screenshot saved: ${screenshotPath}`);
        } catch (screenshotError) {
          log(`  Could not save screenshot: ${screenshotError.message}`);
        }
      }
    }

    // Nothing usable at all is a failure, not an empty success.
    if (leads.length === 0 && failedCount > 0) {
      throw new Error(`All ${failedCount} properties failed to scrape.`);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);

      const outputData = {
        scrapeDate: new Date().toISOString(),
        searchId: searchId,
        leadCount: leads.length,
        leads: leads
      };

      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠️  No leads scraped.');
    }

    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };

  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // Take screenshot of error state (also covers the case where no page exists yet)
    try {
      await page.screenshot({ path: '/tmp/reonomy-v12-error.png', fullPage: true });
      log('📸 Error screenshot saved: /tmp/reonomy-v12-error.png');
    } catch (e) {
      log('Could not save error screenshot');
    }

    throw error;

  } finally {
    // No process.exit() here: exiting with 0 from finally would hide failures
    // and stop the caller's own success/failure handlers from running.
    await browser.close();
    log('\n🔚 Browser closed');
  }
}

// Run: start the scraper and turn its outcome into the process exit code
// (0 when it resolves, 1 when it rejects).
(async () => {
  try {
    const outcome = await scrapeLeads();
    log(`\n🎉 Success! ${outcome.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${outcome.outputFile}`);
    process.exit(0);
  } catch (failure) {
    log(`\n💥 Scraper failed: ${failure.message}`);
    process.exit(1);
  }
})();