clawdbot-workspace/reonomy-scraper-v11-simple.js

#!/usr/bin/env node

/**
 * Reonomy Scraper v11 Simple - PLAYWRIGHT VERSION (NO FILTERS)
 *
 * This is a simpler version to verify Playwright works.
 * Filters removed for testing purposes.
 */

const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');

// Configuration
//
// SECURITY NOTE(review): the email/password fallbacks below are credentials
// hard-coded in source. They are left in place so runs that rely on them keep
// working; prefer REONOMY_EMAIL / REONOMY_PASSWORD and rotate the password.

/**
 * Read a positive integer from an environment variable.
 *
 * @param {string} name - Name of the environment variable to read.
 * @param {number} fallback - Value returned when the variable is unset,
 *   not numeric, or not greater than zero.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readPositiveIntEnv(name, fallback) {
  const parsed = Number.parseInt(process.env[name] ?? '', 10);
  // NaN fails the comparison too, so any unusable value yields the fallback.
  return parsed > 0 ? parsed : fallback;
}

const REONOMY_EMAIL = process.env.REONOMY_EMAIL || 'henry@realestateenhanced.com';
const REONOMY_PASSWORD = process.env.REONOMY_PASSWORD || '9082166532';
const SEARCH_LOCATION = process.env.REONOMY_LOCATION || 'Eatontown, NJ';
// Only the exact string 'true' enables headless mode; anything else runs headed.
const HEADLESS = process.env.HEADLESS === 'true';
// Upper bound on properties visited per run; override with REONOMY_MAX_PROPERTIES.
const MAX_PROPERTIES = readPositiveIntEnv('REONOMY_MAX_PROPERTIES', 20);

// Output files: both are placed in the directory that contains this script
const [OUTPUT_FILE, LOG_FILE] = [
  'reonomy-leads-v11-simple.json',
  'reonomy-scraper-v11-simple.log'
].map((fileName) => path.join(__dirname, fileName));

/**
 * Print a message to stdout and append a timestamped copy to LOG_FILE.
 *
 * @param {*} message - Value to log; it is interpolated into the log line.
 */
function log(message) {
  const isoStamp = new Date().toISOString();
  console.log(message);
  // The file entry carries the ISO timestamp; the console output does not.
  fs.appendFileSync(LOG_FILE, `[${isoStamp}] ${message}\n`);
}

/**
 * Extract property, contact and owner details from the page currently loaded
 * in `page`.
 *
 * The whole extraction is a single callback handed to `page.evaluate`, so the
 * callback only relies on `window`, `document` and its own locals; helpers it
 * needs are declared inside it rather than at module level.
 *
 * @param {object} page - Object exposing `evaluate(fn)` (a Playwright page in
 *   this script).
 * @returns {Promise<object>} Object with string fields `propertyId`,
 *   `propertyAddress`, `city`, `state`, `zip`, `squareFootage`,
 *   `propertyType` (empty string when not found) and string arrays `emails`,
 *   `phones`, `ownerNames` (each without duplicates).
 */
async function extractOwnerTabData(page) {
  return await page.evaluate(() => {
    const info = {
      propertyId: '',
      propertyAddress: '',
      city: '',
      state: '',
      zip: '',
      squareFootage: '',
      propertyType: '',
      emails: [],
      phones: [],
      ownerNames: []
    };

    // Append `value` to `list` unless it is empty or already present.
    const addUnique = (list, value) => {
      if (value && !list.includes(value)) {
        list.push(value);
      }
    };

    // Extract property ID from URL
    const propIdMatch = window.location.href.match(/property\/([a-f0-9-]+)/);
    if (propIdMatch) {
      info.propertyId = propIdMatch[1];
    }

    // Extract property address from the first h1/h2/h3 whose text looks like
    // "<street>, <city>, <ST> <zip>". Groups: 1 = street, 2 = city,
    // 3 = state, 4 = zip.
    const addressRegex = /^(\d+[^,]+),\s*([A-Za-z\s,]+),\s*([A-Z]{2})\s*(\d{5})/;
    for (const sel of ['h1', 'h2', 'h3']) {
      const heading = document.querySelector(sel);
      if (!heading) {
        continue;
      }
      const addressMatch = heading.textContent.trim().match(addressRegex);
      if (addressMatch) {
        info.propertyAddress = addressMatch[0];
        info.city = addressMatch[2].trim();
        info.state = addressMatch[3];
        info.zip = addressMatch[4];
        break;
      }
    }

    // Extract property details (SF, type)
    const bodyText = document.body.innerText;

    // Square footage; commas are accepted so "12,500 SF" is kept whole
    const sfMatch = bodyText.match(/\d[\d,]*(?:\.\d+)?\s*k?\s*SF/i);
    if (sfMatch) {
      info.squareFootage = sfMatch[0];
    }

    // Property type: first label found in the page text wins. Matching is by
    // substring, so 'General Industrial' must be tested before 'Industrial'.
    const typePatterns = ['Warehouse', 'Office Building', 'Retail Stores', 'General Industrial', 'Industrial', 'Medical Building', 'School', 'Religious', 'Supermarket', 'Financial Building'];
    const matchedType = typePatterns.find(type => bodyText.includes(type));
    if (matchedType) {
      info.propertyType = matchedType;
    }

    // Extract emails from mailto: links, dropping any "?subject=..." query
    document.querySelectorAll('a[href^="mailto:"]').forEach(a => {
      const email = a.href.replace('mailto:', '').split('?')[0];
      if (email.length > 5) {
        addUnique(info.emails, email);
      }
    });

    // Also try email patterns in text
    const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
    for (const email of bodyText.match(emailRegex) || []) {
      addUnique(info.emails, email);
    }

    // Extract phones from tel: links
    document.querySelectorAll('a[href^="tel:"]').forEach(a => {
      const phone = a.href.replace('tel:', '');
      if (phone.length >= 10) {
        addUnique(info.phones, phone);
      }
    });

    // Also try phone patterns in text
    const phoneRegex = /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g;
    for (const phone of bodyText.match(phoneRegex) || []) {
      addUnique(info.phones, phone);
    }

    // Extract owner names. Both patterns are global so matchAll can be used,
    // and only capture group 1 (the name) is stored, never the label text.
    const ownerPatterns = [
      /Owner:\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)/g,
      /Owns\s+\d+\s+properties?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)/gi
    ];

    for (const pattern of ownerPatterns) {
      for (const match of bodyText.matchAll(pattern)) {
        const owner = match[1].trim();
        if (owner.length > 3) {
          addUnique(info.ownerNames, owner);
        }
      }
    }

    return info;
  });
}

/**
 * Collect the distinct property IDs linked from the current page.
 *
 * Every anchor whose href contains "/property/" is inspected. When several
 * anchors point at the same property ID only the first one is kept, so the
 * caller does not visit (and record) the same property more than once.
 *
 * @param {object} page - Object exposing `evaluate(fn)` (a Playwright page in
 *   this script).
 * @returns {Promise<Array<{id: string, url: string}>>} One entry per distinct
 *   ID, in document order; `url` is the href of the first anchor for that ID.
 */
async function extractPropertyIds(page) {
  return await page.evaluate(() => {
    const seenIds = new Set();
    const ids = [];

    for (const link of document.querySelectorAll('a[href*="/property/"]')) {
      const href = link.href;
      const match = href.match(/property\/([a-f0-9-]+)/);

      // Skip hrefs without a parsable ID and IDs already collected.
      if (!match || seenIds.has(match[1])) {
        continue;
      }

      seenIds.add(match[1]);
      ids.push({
        id: match[1],
        url: href
      });
    }

    return ids;
  });
}

/**
 * Wait until the page shows at least one contact detail: a mailto: link, a
 * tel: link, or text that looks like an email address.
 *
 * This is best-effort: the function never throws. Timeouts and other
 * failures are logged (with different messages) and reported as `false`.
 *
 * @param {object} page - Playwright page to poll.
 * @param {number} [timeoutMs=30000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<boolean>} `true` when contact details appeared and were
 *   counted, `false` otherwise.
 */
async function waitForContactDetails(page, timeoutMs = 30000) {
  log(`  ⏳ Waiting for contact details (up to ${timeoutMs/1000}s)...`);

  try {
    // Playwright's signature is waitForFunction(pageFunction, arg, options):
    // the options object must be the THIRD argument, otherwise it is handed to
    // the page function as `arg` and the timeout is silently ignored.
    await page.waitForFunction(
      () => {
        if (document.querySelector('a[href^="mailto:"], a[href^="tel:"]')) {
          return true;
        }
        // Also check for email patterns in text
        return /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/.test(document.body.innerText);
      },
      undefined,
      { timeout: timeoutMs }
    );

    const data = await extractOwnerTabData(page);
    log(`  ✅ Contact details found! (${data.emails.length} emails, ${data.phones.length} phones)`);
    return true;

  } catch (error) {
    if (error && error.name === 'TimeoutError') {
      // Timeout is expected if no contacts found
      log('  ⚠️  No contact details found after timeout');
    } else {
      // Anything else (closed page, extraction failure, ...) is not a plain
      // "no contacts" result, so surface the real cause in the log.
      const reason = error && error.message ? error.message : String(error);
      log(`  ⚠️  Error while waiting for contact details: ${reason}`);
    }
    return false;
  }
}

/**
 * Visit one property page and build the lead record for it.
 *
 * @param {object} page - Playwright page used for navigation and extraction.
 * @param {{id: string, url: string}} prop - Entry returned by
 *   extractPropertyIds; only `url` is used for navigation.
 * @param {string} searchId - Search ID stored on the lead.
 * @returns {Promise<object>} The lead record for this property.
 */
async function scrapeProperty(page, prop, searchId) {
  // Navigate directly to property URL
  log(`  🔗 Navigating to property...`);
  await page.goto(prop.url, { waitUntil: 'networkidle', timeout: 30000 });

  // Wait for page to load
  log(`  ⏳ Waiting for Owner tab to load...`);

  // A missing heading is tolerated; extraction then leaves the address empty.
  await page.waitForSelector('h1, h2, h3, [role="heading"]', { timeout: 15000 }).catch(() => {
    log('  ⚠️  No heading found, continuing anyway');
  });

  // Best-effort wait; extraction runs whether or not contacts showed up.
  await waitForContactDetails(page, 30000);

  // Extract data from Owner tab
  log(`  📊 Extracting data from Owner tab...`);
  const propertyData = await extractOwnerTabData(page);

  log(`  📧 Emails: ${propertyData.emails.length} found`);
  log(`  📞 Phones: ${propertyData.phones.length} found`);
  log(`  👤 Owners: ${propertyData.ownerNames.length} found`);
  log(`  🏢 Address: ${propertyData.propertyAddress || 'N/A'}`);

  return {
    scrapeDate: new Date().toISOString().split('T')[0],
    propertyId: propertyData.propertyId,
    propertyUrl: page.url(),
    address: propertyData.propertyAddress || '',
    city: propertyData.city || '',
    state: propertyData.state || '',
    zip: propertyData.zip || '',
    squareFootage: propertyData.squareFootage || '',
    propertyType: propertyData.propertyType || '',
    ownerNames: propertyData.ownerNames.join('; ') || '',
    emails: propertyData.emails,
    phones: propertyData.phones,
    searchLocation: SEARCH_LOCATION,
    searchId: searchId
  };
}

/**
 * Main scraper using Playwright.
 *
 * Logs in, searches for SEARCH_LOCATION, visits up to MAX_PROPERTIES property
 * pages and writes the collected leads to OUTPUT_FILE. A failure on a single
 * property is logged and skipped so leads already collected are still saved;
 * failures during login, search or property-ID extraction abort the run.
 *
 * @returns {Promise<{leadCount: number, outputFile: string}>} Number of leads
 *   collected and the path of the output file.
 * @throws {Error} When login fails, the search ID cannot be read from the
 *   URL, no property links are found, or a Playwright call outside the
 *   per-property loop fails.
 */
async function scrapeLeads() {
  log('🚀 Starting Reonomy Scraper v11 Simple (PLAYWRIGHT - NO FILTERS)...\n');

  // Launch browser
  const browser = await chromium.launch({
    headless: HEADLESS,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  // Created inside the try block so the browser is closed even if context or
  // page creation fails.
  let context;
  let page;

  const leads = [];

  try {
    context = await browser.newContext({
      viewport: { width: 1920, height: 1080 }
    });

    page = await context.newPage();

    // Login
    log('📍 Step 1: Logging into Reonomy...');
    await page.goto('https://app.reonomy.com/#!/account', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    // Wait for email input
    await page.waitForSelector('input[type="email"]', { timeout: 10000 });
    await page.fill('input[type="email"]', REONOMY_EMAIL);
    await page.fill('input[type="password"]', REONOMY_PASSWORD);
    await page.click('button[type="submit"]');

    log('⏳ Waiting for login...');
    await page.waitForTimeout(10000);

    // Check if logged in: still being on a login/auth URL is treated as failure
    const url = page.url();
    if (url.includes('login') || url.includes('auth')) {
      throw new Error('Login failed. Please check credentials.');
    }

    log('✅ Successfully logged in!');

    // Navigate to search
    log('\n📍 Step 2: Navigating to search...');
    await page.goto('https://app.reonomy.com/#!/search', {
      waitUntil: 'networkidle',
      timeout: 60000
    });

    // Perform initial search
    log(`📍 Step 3: Searching for: ${SEARCH_LOCATION}...`);

    // Find and fill search input
    const searchInput = page.locator('input[placeholder*="address"], input[placeholder*="Search"], input[type="text"]').first();
    await searchInput.waitFor({ state: 'visible', timeout: 10000 });
    await searchInput.fill(SEARCH_LOCATION);
    await page.keyboard.press('Enter');

    log('⏳ Searching...');
    await page.waitForTimeout(5000);

    // Extract search ID from URL
    const urlMatch = page.url().match(/search\/([a-f0-9-]+)/);
    if (!urlMatch) {
      throw new Error('Could not extract search ID from URL');
    }
    const searchId = urlMatch[1];
    log(`✅ Search ID: ${searchId}`);

    // Extract property IDs
    log('\n📍 Step 4: Extracting property IDs...');
    const propertyIds = await extractPropertyIds(page);
    log(`✅ Found ${propertyIds.length} property IDs`);

    if (propertyIds.length === 0) {
      log('⚠️  No property IDs found.');
      throw new Error('No properties found on search page.');
    }

    // Process each property
    const propertiesToScrape = propertyIds.slice(0, MAX_PROPERTIES);

    log(`\n📍 Step 5: Processing ${propertiesToScrape.length} properties...`);

    let failedCount = 0;

    for (const [index, prop] of propertiesToScrape.entries()) {
      log(`\n[${index + 1}/${propertiesToScrape.length}] Property ID: ${prop.id}`);

      // Isolate failures per property: one bad page must not discard the
      // leads that were already collected.
      try {
        leads.push(await scrapeProperty(page, prop, searchId));
      } catch (propertyError) {
        failedCount += 1;
        log(`  ❌ Failed to scrape property ${prop.id}: ${propertyError.message}`);
        continue;
      }

      // Go back to search results for next property. The next iteration
      // navigates by URL, so a failure here is logged and not fatal.
      try {
        log(`  🔙 Going back to search results...`);
        await page.goto(`https://app.reonomy.com/#!/search/${searchId}`, {
          waitUntil: 'networkidle',
          timeout: 30000
        });

        await page.waitForTimeout(2000);
      } catch (navigationError) {
        log(`  ⚠️  Could not return to search results: ${navigationError.message}`);
      }
    }

    if (failedCount > 0) {
      log(`\n⚠️  ${failedCount} of ${propertiesToScrape.length} properties failed and were skipped.`);
    }

    // Save results
    if (leads.length > 0) {
      log(`\n✅ Total leads scraped: ${leads.length}`);

      const outputData = {
        scrapeDate: new Date().toISOString(),
        location: SEARCH_LOCATION,
        searchId: searchId,
        leadCount: leads.length,
        framework: 'Playwright',
        leads: leads
      };

      fs.writeFileSync(OUTPUT_FILE, JSON.stringify(outputData, null, 2));
      log(`💾 Saved to: ${OUTPUT_FILE}`);
    } else {
      log('\n⚠️  No leads scraped.');
    }

    log('\n✅ Scraping complete!');

    return { leadCount: leads.length, outputFile: OUTPUT_FILE };

  } catch (error) {
    log(`\n❌ Error: ${error.message}`);
    log(error.stack);

    // The screenshot is a debugging aid only; its failure must not mask the
    // original error, but it is logged rather than silently dropped.
    if (page) {
      try {
        await page.screenshot({ path: '/tmp/reonomy-v11-simple-error.png', fullPage: true });
        log('📸 Error screenshot saved: /tmp/reonomy-v11-simple-error.png');
      } catch (screenshotError) {
        log(`⚠️  Could not save error screenshot: ${screenshotError.message}`);
      }
    }

    throw error;

  } finally {
    // Close the context best-effort so a failure here cannot skip
    // browser.close() or replace the error being propagated.
    if (context) {
      await context.close().catch((closeError) => {
        log(`⚠️  Failed to close browser context: ${closeError.message}`);
      });
    }
    await browser.close();
    log('\n🔚 Browser closed');
  }
}

// Run: execute the scraper and map its outcome to the process exit code
(async () => {
  try {
    const summary = await scrapeLeads();
    log(`\n🎉 Success! ${summary.leadCount} leads scraped.`);
    console.log(`\n💾 View your leads at: ${summary.outputFile}`);
    process.exit(0);
  } catch (error) {
    // Covers both a rejected scrapeLeads() and a failure while reporting success.
    log(`\n💥 Scraper failed: ${error.message}`);
    process.exit(1);
  }
})();