clawdbot-workspace/reonomy-scraper-v11-puppeteer.js

#!/usr/bin/env node
/**
 * Reonomy Scraper v11 - PUPPETEER (PROVEN BASE + EMAILS/PHONES)
 *
 * Based on v9 (Puppeteer) - proven working version
 * Adds email and phone extraction logic to v9
 * Uses direct ownership URLs (no property card clicking)
 *
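 * Usage:
 *   REONOMY_EMAIL="you@example.com" REONOMY_PASSWORD="..." node reonomy-scraper-v11-puppeteer.js
 *
 * Environment variables read by this script:
 *   REONOMY_EMAIL, REONOMY_PASSWORD - login credentials
 *   REONOMY_LOCATION - label stored with each lead (default "Eatontown, NJ")
 *   MAX_PROPERTIES   - maximum number of properties to scrape (default 20)
 *   HEADLESS         - set to "false" to run with a visible browser window
 *
 * Note: SEARCH_ID is NOT read from the environment. The search ID is taken
 * from the page URL after navigating to the search page.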
 */

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

// Configuration
//
// SECURITY(review): the fallback email/password below are credentials
// committed to source. They are kept only so that runs without environment
// variables behave as before. Prefer supplying REONOMY_EMAIL and
// REONOMY_PASSWORD via the environment, then rotate and remove these defaults.
const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';

// Free-text label copied into the output; it is not used to drive the search.
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';

// Upper bound on the number of properties scraped per run. Parsed with an
// explicit radix; anything that is not a positive integer (unset, garbage,
// zero, negative) falls back to the default so that a later
// `slice(0, MAX_PROPERTIES)` can never receive a negative end index.
const DEFAULT_MAX_PROPERTIES = 20;
const parsedMaxProperties = Number.parseInt(process.env.MAX_PROPERTIES ?? '', 10);
const MAX_PROPERTIES =
  Number.isInteger(parsedMaxProperties) && parsedMaxProperties > 0
    ? parsedMaxProperties
    : DEFAULT_MAX_PROPERTIES;

// Headless unless HEADLESS is exactly the string "false".
const HEADLESS = process.env.HEADLESS !== 'false';

// Output artifacts are written next to this script.
const OUTPUT_FILE = path.join(__dirname, 'reonomy-leads-v11-puppeteer.json');
const LOG_FILE = path.join(__dirname, 'reonomy-scraper-v11.log');

/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * @param {string} message - Text to log; the console gets it verbatim, the
 *   log file gets it prefixed with an ISO-8601 timestamp in brackets.
 */
function log(message) {
  const entry = `[${new Date().toISOString()}] ${message}\n`;
  console.log(message);
  fs.appendFileSync(LOG_FILE, entry);
}

/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves (with no value) once the
 *   timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Split an address heading such as "123 Main St, Eatontown, NJ 07724".
 *
 * @param {string} text - Trimmed heading text.
 * @returns {{propertyAddress: string, city: string, state: string, zip: string}|null}
 *   Parsed parts, or null when the text does not look like an address.
 *   `propertyAddress` is the full matched address string.
 */
function parseAddressHeading(text) {
  const match = text.match(/^(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/);
  if (!match) {
    return null;
  }
  // Group 0 is the whole match, group 1 the street, then city, state, zip.
  const [fullAddress, , cityPart, statePart, zipPart] = match;
  return {
    propertyAddress: fullAddress,
    city: cityPart.trim(),
    state: statePart,
    zip: zipPart,
  };
}

/**
 * Find owner names in the page text.
 *
 * @param {string} text - Visible page text.
 * @returns {string[]} Unique owner names in order of first appearance.
 */
function findOwnerNames(text) {
  // A name is one or more Capitalized words, optionally followed by an entity
  // suffix. Words are joined by spaces/tabs only, so a name never runs on into
  // the next line of the page text.
  const namePatterns = [
    /Owner:\s*([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))?)/g,
    /[Oo]wns\s+\d+\s+propert(?:y|ies)\s*([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)*(?:[ \t]+(?:LLC|LLP|Inc|Corp|Co|Ltd|Partners|Housing|Properties|Realty|Estate|Investments|Management))?)/g,
  ];

  const names = new Set();
  for (const pattern of namePatterns) {
    // matchAll exposes the capture group, so only the name is stored and not
    // the "Owner:" label or the property count.
    for (const match of text.matchAll(pattern)) {
      const name = match[1].trim();
      if (name.length > 3) {
        names.add(name);
      }
    }
  }
  return [...names];
}

/**
 * Strip formatting from candidate phone strings and de-duplicate them.
 *
 * @param {string[]} rawTexts - Text content of candidate phone elements.
 * @returns {string[]} Unique cleaned values containing at least 10 digits.
 */
function normalizePhones(rawTexts) {
  const phones = new Set();
  for (const raw of rawTexts) {
    const cleaned = String(raw).replace(/[\s\-()]/g, '');
    // Require real digits so that unrelated text sharing the CSS class is
    // not reported as a phone number.
    const digitCount = cleaned.replace(/\D/g, '').length;
    if (digitCount >= 10) {
      phones.add(cleaned);
    }
  }
  return [...phones];
}

/**
 * Merge e-mail addresses from mailto links and from free text.
 *
 * @param {string[]} mailtoHrefs - `href` values of `mailto:` anchors.
 * @param {string} text - Visible page text.
 * @returns {string[]} Unique addresses, link addresses first.
 */
function collectEmails(mailtoHrefs, text) {
  // Drop the scheme and any "?subject=..." style query part.
  const fromLinks = mailtoHrefs.map((href) =>
    String(href).replace(/^mailto:/i, '').split('?')[0].trim()
  );
  const fromText = text.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) || [];
  return [...new Set([...fromLinks, ...fromText])].filter((email) => email.length > 5);
}

/**
 * Extract ALL data from Owner tab.
 *
 * Reads the currently loaded page and collects the property ID (from the
 * URL), the address (from the first h1/h2/h3 whose text looks like an
 * address), square footage, property type and owner names (from the visible
 * body text), and phone numbers and e-mail addresses (from the DOM).
 *
 * Callbacks passed to `page.evaluate` run inside the browser page, so they
 * only touch browser globals; all post-processing happens on the Node side.
 *
 * @param {object} page - Puppeteer page already showing a property's
 *   ownership view.
 * @returns {Promise<{propertyId: string, propertyAddress: string, city: string,
 *   state: string, zip: string, squareFootage: string, propertyType: string,
 *   ownerNames: string[], emails: string[], phones: string[]}>}
 *   Missing scalar fields are empty strings; missing lists are empty arrays.
 */
async function extractOwnerTabData(page) {
  log('📊 Extracting Owner tab data...');

  // Extract property ID from URL
  const propIdMatch = page.url().match(/property\/([a-f0-9-]+)/);
  const propertyId = propIdMatch ? propIdMatch[1] : '';

  // Address: first heading level whose text parses as an address wins.
  let address = { propertyAddress: '', city: '', state: '', zip: '' };
  for (const selector of ['h1', 'h2', 'h3']) {
    const heading = await page.$(selector);
    if (!heading) {
      continue;
    }
    const headingText = ((await page.evaluate((el) => el.textContent, heading)) || '').trim();
    const parsed = parseAddressHeading(headingText);
    if (parsed) {
      address = parsed;
      log(`   📍 Address: ${headingText}`);
      break;
    }
  }

  // innerText is already a plain string; no JSON decoding is involved.
  const bodyTextContent = (await page.evaluate(() => document.body.innerText)) || '';

  // Square footage
  let squareFootage = '';
  const sfMatch = bodyTextContent.match(/(\d+\.?\d*\s*k?\s*SF)/i);
  if (sfMatch) {
    squareFootage = sfMatch[0];
    log(`   📐 Square Footage: ${sfMatch[0]}`);
  }

  // Property type. 'General Industrial' is listed before 'Industrial' so the
  // more specific label is not shadowed by its substring.
  const typePatterns = [
    'Warehouse',
    'Office Building',
    'Retail Stores',
    'General Industrial',
    'Industrial',
    'Medical Building',
    'School',
    'Religious',
    'Supermarket',
    'Financial Building',
  ];
  const propertyType = typePatterns.find((type) => bodyTextContent.includes(type)) || '';
  if (propertyType) {
    log(`   🏢 Property Type: ${propertyType}`);
  }

  // Owner names
  const ownerNames = findOwnerNames(bodyTextContent);
  log(`   👤 Owners found: ${ownerNames.length}`);

  // Phones.
  // NOTE(review): jss1797/jss1798 look like generated class names and may
  // change between site builds — confirm this selector still matches.
  const rawPhoneTexts = await page.evaluate(() =>
    Array.from(
      document.querySelectorAll('p.MuiTypography-root.jss1797.jss1798.MuiTypography-body2'),
      (p) => p.textContent.trim()
    )
  );
  const phones = normalizePhones(rawPhoneTexts || []);
  log(`   📞 Phones found: ${phones.length}`);

  // Emails: mailto links plus anything that looks like an address in the text.
  const mailtoHrefs = await page.evaluate(() =>
    Array.from(document.querySelectorAll('a[href^="mailto:"]'), (a) => a.href)
  );
  const emails = collectEmails(mailtoHrefs || [], bodyTextContent);
  log(`   📧 Emails found: ${emails.length}`);

  return {
    propertyId,
    propertyAddress: address.propertyAddress,
    city: address.city,
    state: address.state,
    zip: address.zip,
    squareFootage,
    propertyType,
    ownerNames,
    emails,
    phones,
  };
}

/**
 * Extract property IDs from search results.
 *
 * Scans the current page for anchors whose href contains "/property/" and
 * returns one entry per distinct property ID, in order of first appearance.
 * The callback runs inside the browser page, so it only uses browser globals.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @returns {Promise<Array<{id: string, url: string}>>} Each entry holds the
 *   property ID and a URL for that property within the current search. When
 *   no search ID can be read from the page URL, the anchor's own href is used.
 */
async function extractPropertyIds(page) {
  return page.evaluate(() => {
    // Read the search ID by pattern: splitting the URL on "/" and taking a
    // fixed index returns the literal "search" segment for
    // https://app.reonomy.com/#!/search/<id>.
    const searchIdMatch = window.location.href.match(/search\/([a-f0-9-]+)/);
    const searchId = searchIdMatch ? searchIdMatch[1] : '';

    const seenIds = new Set();
    const results = [];

    document.querySelectorAll('a[href*="/property/"]').forEach((link) => {
      const match = link.href.match(/property\/([a-f0-9-]+)/);
      if (!match) {
        return;
      }
      const propertyId = match[1];
      // Several anchors can point at the same property; keep only one entry.
      if (seenIds.has(propertyId)) {
        return;
      }
      seenIds.add(propertyId);
      results.push({
        id: propertyId,
        url: searchId
          ? `https://app.reonomy.com/#!/search/${searchId}/property/${propertyId}`
          : link.href,
      });
    });

    return results;
  });
}

/**
 * Main scraper function.
 *
 * Launches a browser, logs into Reonomy, opens the search page, collects
 * property IDs, visits each property's ownership page (up to MAX_PROPERTIES)
 * and writes the collected leads to OUTPUT_FILE as JSON.
 *
 * The browser is always closed before this function settles. On failure an
 * error screenshot is attempted here, where the page is in scope, and the
 * original error is re-thrown. A failure on a single property is logged and
 * skipped so leads already collected are still saved; if every property
 * fails, an error is thrown.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   scraped and the path of the output file.
 * @throws {Error} When login fails, no search ID or property IDs are found,
 *   every property fails, or a browser operation outside the per-property
 *   loop fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v11 (PUPPETEER + EMAILS/PHONES)...\n');

  const browser = await puppeteer.launch({
    headless: HEADLESS,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // Declared outside the try block so the catch handler can screenshot it.
  let page = null;

  try {
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Step 1: Login to Reonomy
    log('\n🔐 Step 1: Logging into Reonomy...');

    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });

    await sleep(2000);

    // Typing into a form that has not rendered yet throws an unhelpful
    // "no element found" error; wait for the field explicitly.
    await page.waitForSelector('input[type="email"]', { visible: true, timeout: 30000 });

    await page.type('input[type="email"]', REONOMY_EMAIL, { delay: 100 });
    await page.type('input[type="password"]', REONOMY_PASSWORD, { delay: 100 });
    await page.click('button[type="submit"]');

    log('⏳ Waiting for login...');
    await sleep(15000);

    // Still being on a login/auth URL is treated as a failed login.
    const postLoginUrl = page.url();
    if (postLoginUrl.includes('login') || postLoginUrl.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Step 2: Navigate to search
    log('\n📍 Step 2: Navigating to search...');

    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await sleep(3000);

    // Step 3: Extract search ID from URL
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Step 4: Extract property IDs
    log('\n📍 Step 3: Extracting property IDs...');

    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);

    if (propertyIds.length === 0) {
      throw new Error('No properties found on search page.');
    }

    // Step 5: Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);

    log(`\n📍 Step 4: Processing ${propertiesToScrape.length} properties...\n`);

    const leads = [];
    let failedCount = 0;

    for (const [index, prop] of propertiesToScrape.entries()) {
      log(`\n[${index + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      // Navigate directly to the ownership page instead of clicking cards.
      const ownershipUrl = `https://app.reonomy.com/#!/search/${searchId}/property/${prop.id}/ownership`;

      try {
        log(`  🔗 Navigating to ownership page...`);
        await page.goto(ownershipUrl, {
          waitUntil: 'networkidle2',
          timeout: 30000,
        });

        // Wait for Owner tab to load
        log(`  ⏳ Waiting for Owner tab to load...`);
        await sleep(8000);

        // Extract ALL data from Owner tab
        log(`  📊 Extracting data from Owner tab...`);
        const ownerData = await extractOwnerTabData(page);

        log(`  📧 Emails: ${ownerData.emails.length} found`);
        log(`  📞 Phones: ${ownerData.phones.length} found`);
        log(`  👤 Owners: ${ownerData.ownerNames.length} found`);
        log(`  📍 Address: ${ownerData.propertyAddress || 'N/A'}`);

        leads.push({
          scrapeDate: new Date().toISOString().split('T')[0],
          propertyId: prop.id,
          propertyUrl: ownershipUrl,
          address: ownerData.propertyAddress || '',
          city: ownerData.city || '',
          state: ownerData.state || '',
          zip: ownerData.zip || '',
          squareFootage: ownerData.squareFootage || '',
          propertyType: ownerData.propertyType || '',
          ownerNames: ownerData.ownerNames.join('; ') || '',
          emails: ownerData.emails,
          phones: ownerData.phones,
          searchLocation: SEARCH_LOCATION,
          searchId: searchId,
        });

        // Screenshot for debugging (first 3 properties only)
        if (index < 3) {
          const screenshotPath = `/tmp/reonomy-v11-property-${index + 1}.png`;
          await page.screenshot({ path: screenshotPath, fullPage: false });
          log(`   📸 Screenshot saved: ${screenshotPath}`);
        }
      } catch (error) {
        // One bad property must not discard the leads collected so far.
        failedCount += 1;
        log(`  ❌ Error processing property ${prop.id}: ${error.message}`);
      }
    }

    // If nothing succeeded, surface it as a failure rather than a quiet
    // "no leads" run.
    if (leads.length === 0 && failedCount > 0) {
      throw new Error(`All ${failedCount} properties failed to scrape.`);
    }

    // Step 6: Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);

      const outputData = {
        scrapeDate: new Date().toISOString(),
        searchId: searchId,
        searchLocation: SEARCH_LOCATION,
        leadCount: leads.length,
        leads: leads,
      };

      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠️  No leads scraped.');
    }

    log('\n✅ Scraping complete!');
    return { leadCount: leads.length, outputFile: OUTPUT_FILE };
  } catch (error) {
    // Best-effort screenshot of the failing state; never mask the real error.
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v11-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v11-error.png');
      } catch (screenshotError) {
        log('Could not save error screenshot');
      }
    }
    throw error;
  } finally {
    // Always release the browser process, on success and on failure.
    try {
      await browser.close();
      log('\n🔚 Browser closed');
    } catch (closeError) {
      log(`Could not close browser: ${closeError.message}`);
    }
  }
}

/**
 * Main execution.
 *
 * Runs the scraper and maps the outcome to a process exit code: 0 on
 * success, 1 on any error. The Puppeteer `page` and `browser` objects are
 * locals of scrapeLeads and are not in scope here, so this handler only logs
 * the failure; touching them here would throw a ReferenceError and prevent
 * the intended exit code from being set.
 */
(async () => {
  try {
    await scrapeLeads();
    process.exit(0);
  } catch (error) {
    // Tolerate non-Error rejections (strings, undefined) when reporting.
    log(`\n❌ Error: ${error?.message ?? error}`);
    if (error?.stack) {
      log(error.stack);
    }
    process.exit(1);
  }
})();