/**
 * SkySlope Forms Scraper — via Utah Real Estate SSO
 *
 * Flow: utahrealestate.com login → Forms → MLS Forms → SkySlope → download all library PDFs
 *
 * Run: npm run scrape:forms
 *
 * Credentials read from .env.local:
 *   URE_USERNAME — Utah Real Estate username
 *   URE_PASSWORD — Utah Real Estate password
 *
 * Optional (used only if SkySlope shows its NRDS authorization form):
 *   SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID
 */
import { chromium } from 'playwright';
import type { Browser, BrowserContext, Locator, Page } from 'playwright';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { config } from 'dotenv';

config({ path: path.resolve(process.cwd(), '.env.local') });

const URE_USERNAME = process.env.URE_USERNAME;
const URE_PASSWORD = process.env.URE_PASSWORD;
const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');
/** Playwright storage-state file used to restore the login session between runs. */
const SESSION_FILE = path.resolve(process.cwd(), 'scripts/.ure-session.json');
/** Base used to resolve relative library links found on the libraries page. */
const SKYSLOPE_FORMS_ORIGIN = 'https://forms.skyslope.com';
/** Library names tried by text when no library links can be discovered. */
const FALLBACK_LIBRARY_NAMES = ['Data Forms - URE', 'Utah Association of Realtors', 'Utah CCIM'] as const;
/** Phrases whose presence in the page text is treated as "a 2FA prompt is showing". */
const TWO_FACTOR_MARKERS = ['verification code', 'one-time'] as const;
/** Label of the button rendered after each form name in the forms list. */
const ADD_LABEL = 'Add';

if (!URE_USERNAME || !URE_PASSWORD) {
  console.error('Missing required env vars: URE_USERNAME, URE_PASSWORD');
  process.exit(1);
}

interface Credentials {
  username: string;
  password: string;
}

// Both values are narrowed to `string` by the guard above, so no `!` is needed later.
const CREDENTIALS: Credentials = { username: URE_USERNAME, password: URE_PASSWORD };

/** Type accepted by `browser.newContext({ storageState })`, derived from Playwright's own typings. */
type StorageStateOption = NonNullable<Parameters<Browser['newContext']>[0]>['storageState'];

// ── Small shared helpers ──────────────────────────────────────────────────────

/** Returns a printable message for any thrown value. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Reads the visible text of `<body>`; returns '' if the page cannot be read. */
async function readBodyText(page: Page): Promise<string> {
  return page.locator('body').innerText().catch(() => '');
}

/**
 * Waits up to `timeoutMs` for the locator to become visible.
 *
 * `locator.isVisible({ timeout })` ignores its timeout and returns immediately,
 * so `waitFor` is used to get a real bounded wait.
 */
async function isVisibleWithin(locator: Locator, timeoutMs: number): Promise<boolean> {
  try {
    await locator.waitFor({ state: 'visible', timeout: timeoutMs });
    return true;
  } catch {
    return false;
  }
}

/** True when `filePath` is accessible on disk. */
async function fileExists(filePath: string): Promise<boolean> {
  try {
    await fs.access(filePath);
    return true;
  } catch {
    return false;
  }
}

/** Turns a form name into a file-system-safe base name (at most 100 chars; may be empty). */
function sanitizeFileName(formName: string): string {
  return formName.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim().slice(0, 100);
}

/** True when the page text contains one of the 2FA marker phrases. */
function looksLikeTwoFactor(text: string): boolean {
  return TWO_FACTOR_MARKERS.some(marker => text.includes(marker));
}

/**
 * Extracts unique form names from the list page text, which renders as
 * "Name\nAdd\nName\nAdd...". A name is any line longer than 3 characters that is
 * immediately followed by an "Add" line (after short noise lines are dropped).
 */
function extractFormNames(bodyText: string): string[] {
  const lines = bodyText
    .split('\n')
    .map(line => line.trim())
    // "Add" is exactly 3 characters long, so it must be exempt from the
    // short-line filter or no form would ever be detected.
    .filter(line => line === ADD_LABEL || line.length > 3);

  const names = new Set<string>();
  let previous: string | undefined;
  for (const line of lines) {
    if (line === ADD_LABEL && previous !== undefined && previous !== ADD_LABEL) {
      names.add(previous);
    }
    previous = line;
  }
  return [...names];
}

/** Resolves a (possibly relative) library href to an absolute URL, or null if it cannot be parsed. */
function resolveLibraryUrl(href: string): string | null {
  try {
    return new URL(href, SKYSLOPE_FORMS_ORIGIN).toString();
  } catch {
    return null;
  }
}

/** Loads the saved storage state, or undefined when the file is missing or unreadable. */
async function loadSavedSession(): Promise<StorageStateOption> {
  try {
    const raw = await fs.readFile(SESSION_FILE, 'utf8');
    // The file is written by `context.storageState()` below, so its shape is trusted.
    return (JSON.parse(raw) as StorageStateOption) ?? undefined;
  } catch {
    return undefined;
  }
}

// ── Step 1: Login to Utah Real Estate ─────────────────────────────────────────

/** Opens the login page and submits the credentials unless a restored session already skipped it. */
async function loginToUre(page: Page, credentials: Credentials): Promise<void> {
  console.log('Navigating to utahrealestate.com...');
  await page.goto('https://www.utahrealestate.com/auth/login', { waitUntil: 'domcontentloaded', timeout: 30_000 });
  await page.waitForTimeout(2000);
  console.log(`Login page URL: ${page.url()}`);
  await page.screenshot({ path: 'scripts/debug-ure-login.png' });

  // Only fill the login form if we are actually on a login page ('/auth/login' also contains '/login').
  if (page.url().includes('/login')) {
    const usernameInput = page
      .locator('input[name="username"], input[name="user"], input[id*="user"], input[placeholder*="user" i]')
      .first();
    const passwordInput = page.locator('input[type="password"]').first();
    await usernameInput.waitFor({ timeout: 10_000 });
    await usernameInput.fill(credentials.username);
    await passwordInput.fill(credentials.password);
    console.log('Credentials filled, submitting...');
    await page
      .locator('button[type="submit"], input[type="submit"], button:has-text("Login"), button:has-text("Sign In"), button:has-text("Log In")')
      .first()
      .click();
    await page.waitForLoadState('domcontentloaded');
    await page.waitForTimeout(3000);
  } else {
    console.log('Already logged in (session restored).');
  }

  console.log(`After login URL: ${page.url()}`);
  await page.screenshot({ path: 'scripts/debug-ure-after-login.png' });
}

// ── Step 1b: Handle 2FA if present ────────────────────────────────────────────

/**
 * If a 2FA prompt is showing, waits up to 2 minutes for the user to complete it
 * in the browser window, then saves the session so 2FA can be skipped next run.
 * The session is only saved when the prompt actually went away.
 */
async function completeTwoFactorIfPrompted(page: Page, context: BrowserContext): Promise<void> {
  if (!looksLikeTwoFactor(await readBodyText(page))) return;

  console.log('\n⚡ 2FA detected — please complete it in the browser window.');
  console.log(' (Select Text/Email, enter the code, and click Submit)');
  console.log(' Waiting up to 2 minutes for you to finish...\n');

  // Poll every 2s until the 2FA page is gone (up to 2 minutes).
  const deadline = Date.now() + 120_000;
  let cleared = false;
  while (Date.now() < deadline) {
    await page.waitForTimeout(2000);
    if (!looksLikeTwoFactor(await readBodyText(page))) {
      cleared = true;
      break;
    }
    process.stdout.write('.');
  }
  console.log();

  await page.waitForLoadState('domcontentloaded');
  await page.waitForTimeout(2000);
  console.log(`After 2FA URL: ${page.url()}`);

  if (!cleared) {
    // Saving now would persist a half-authenticated session and log a false promise.
    console.warn('2FA prompt still showing after 2 minutes — session NOT saved; continuing anyway.');
    return;
  }

  // Save session so we skip 2FA next time.
  await context.storageState({ path: SESSION_FILE });
  console.log('Session saved — 2FA will be skipped on next run.');
}

/**
 * Entry point: logs in to Utah Real Estate, follows the SkySlope SSO link and
 * downloads every form PDF into OUTPUT_DIR. The browser is always closed, even
 * when a step throws.
 */
async function main(): Promise<void> {
  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const browser = await chromium.launch({ headless: false });
  try {
    const context = await browser.newContext({
      acceptDownloads: true,
      storageState: await loadSavedSession(),
    });
    const page = await context.newPage();

    await loginToUre(page, CREDENTIALS);
    await completeTwoFactorIfPrompted(page, context);

    // ── Step 2: Navigate directly to SkySlope SSO URL ──────────────────────
    console.log('Navigating to SkySlope via SSO...');
    const [newPage] = await Promise.all([
      context.waitForEvent('page', { timeout: 15_000 }).catch(() => null),
      page.goto('https://www.utahrealestate.com/sso/connect/client/skyslope', { waitUntil: 'domcontentloaded' }),
    ]);

    // The SSO link may open a new tab; otherwise keep working in the current one.
    const activePage = newPage ?? page;
    await activePage.waitForLoadState('domcontentloaded');
    await activePage.waitForTimeout(5000);
    console.log(`SkySlope URL: ${activePage.url()}`);
    await activePage.screenshot({ path: 'scripts/debug-skyslope-landing.png' });

    await downloadAllForms(activePage, context, OUTPUT_DIR);

    if (newPage) await newPage.close();
  } finally {
    await browser.close();
  }
  console.log('\nDone.');
}

/**
 * Discovers the form libraries reachable from `page` and downloads the forms of each.
 *
 * Strategy, in order:
 *   1. visit every distinct library link found on the libraries page;
 *   2. otherwise click the known library names one by one;
 *   3. otherwise treat the current page as the forms list.
 *
 * @param page      Page already authenticated against SkySlope.
 * @param context   Browser context, forwarded to downloadFormsInView.
 * @param outputDir Directory the PDFs are written to.
 */
async function downloadAllForms(
  page: Page,
  context: BrowserContext,
  outputDir: string
): Promise<void> {
  const downloaded: string[] = [];
  const failed: string[] = [];

  // Handle NRDS auth if it appears
  await handleNRDSAuth(page);

  // Wait for forms library to load
  await page.waitForTimeout(4000);
  console.log(`Forms library URL: ${page.url()}`);

  // Navigate to Browse Libraries
  console.log('Clicking Browse Libraries...');
  const browseLink = page
    .locator('a:has-text("Browse Libraries"), a[href*="libraries"], nav a:has-text("Libraries")')
    .first();
  if (await isVisibleWithin(browseLink, 5_000)) {
    await browseLink.click();
    await page.waitForLoadState('domcontentloaded');
    await page.waitForTimeout(3000);
  } else {
    await page.goto('https://forms.skyslope.com/libraries', { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(3000);
  }

  console.log(`Libraries URL: ${page.url()}`);
  await page.screenshot({ path: 'scripts/debug-libraries-page.png' });
  const librariesText = await readBodyText(page);
  console.log('Libraries page text (first 600):', librariesText.slice(0, 600));

  // Find all library cards/links
  const libraryLinks = await page
    .locator('a[href*="/library/"], a[href*="/libraries/"], [class*="library-card"], [class*="libraryCard"]')
    .all();
  console.log(`Found ${libraryLinks.length} library links`);

  // Collect distinct absolute URLs; matched card elements without an href are ignored.
  const libraryUrls = new Set<string>();
  for (const link of libraryLinks) {
    const href = await link.getAttribute('href').catch(() => null);
    if (!href) continue;
    const url = resolveLibraryUrl(href);
    if (url === null || libraryUrls.has(url)) continue;
    const name = await link.textContent().catch(() => null);
    libraryUrls.add(url);
    console.log(` Library: ${name?.trim().slice(0, 50)} → ${href}`);
  }

  if (libraryUrls.size > 0) {
    for (const url of libraryUrls) {
      console.log(`\n── Opening library: ${url} ──`);
      await page.goto(url, { waitUntil: 'domcontentloaded' });
      await page.waitForTimeout(4000);
      await page.screenshot({ path: `scripts/debug-library-${Date.now()}.png` });
      await downloadFormsInView(page, context, outputDir, downloaded, failed);
    }
  } else {
    // Fallback: libraries might be listed as clickable items
    let openedAnyLibrary = false;
    for (const libName of FALLBACK_LIBRARY_NAMES) {
      const libLink = page.locator(`a:has-text("${libName}"), button:has-text("${libName}")`).first();
      if (!(await isVisibleWithin(libLink, 3_000))) continue;

      openedAnyLibrary = true;
      console.log(`\n── Library: ${libName} ──`);
      await libLink.click();
      await page.waitForLoadState('domcontentloaded');
      await page.waitForTimeout(4000);
      await downloadFormsInView(page, context, outputDir, downloaded, failed);
      await page.goBack();
      await page.waitForTimeout(2000);
    }

    if (!openedAnyLibrary) {
      // We're already on the all-forms page — download directly
      console.log('All forms visible on current page — downloading...');
      await downloadFormsInView(page, context, outputDir, downloaded, failed);
    }
  }

  console.log(`\n✓ Downloaded ${downloaded.length} forms`);
  if (failed.length > 0) console.log(`✗ Failed: ${failed.length} — ${failed.join(', ')}`);
}

/**
 * Fills and submits the NRDS authorization form when SkySlope shows it;
 * returns without doing anything when the form is not present.
 */
async function handleNRDSAuth(page: Page): Promise<void> {
  // NOTE(review): these fallbacks look like a real person's name and NRDS ID
  // committed to source — consider requiring the env vars instead.
  const LAST_NAME = process.env.SKYSLOPE_LAST_NAME || 'Copeland';
  const NRDS_ID = process.env.SKYSLOPE_NRDS_ID || '837075029';

  // Check if NRDS auth page appears
  const nrdsProbe = page.locator('input[placeholder*="last" i], input[placeholder*="nrds" i]').first();
  if (!(await isVisibleWithin(nrdsProbe, 5_000))) return;

  console.log('NRDS authorization required — filling...');
  // The first two inputs on the page are taken to be last name and NRDS ID, in that order.
  const [lastNameInput, nrdsIdInput] = await page.locator('input').all();
  if (!lastNameInput || !nrdsIdInput) {
    console.warn('NRDS form detected but fewer than two inputs were found — skipping.');
    return;
  }

  await lastNameInput.fill(LAST_NAME);
  await nrdsIdInput.fill(NRDS_ID);
  await page.locator('button:has-text("Next"), button[type="submit"]').first().click();
  await page.waitForLoadState('domcontentloaded');
  await page.waitForTimeout(3000);
  console.log(`After NRDS URL: ${page.url()}`);
}

/**
 * Downloads every form listed on the current page.
 *
 * Flow per form: click form name → preview opens → click Download button → save file.
 * Forms whose PDF already exists in `outputDir` are skipped.
 *
 * @param page       Page showing a forms list.
 * @param context    Unused here; kept so the call signature stays stable.
 * @param outputDir  Directory the PDFs are written to.
 * @param downloaded Accumulator; sanitized names of saved forms are appended.
 * @param failed     Accumulator; names of forms that could not be saved are appended.
 */
async function downloadFormsInView(
  page: Page,
  context: BrowserContext,
  outputDir: string,
  downloaded: string[],
  failed: string[]
): Promise<void> {
  const names = extractFormNames(await readBodyText(page));
  console.log(` Found ${names.length} forms to download`);

  if (names.length === 0) {
    await page.screenshot({ path: `scripts/debug-no-forms-${Date.now()}.png` });
    return;
  }

  for (const formName of names) {
    const sanitized = sanitizeFileName(formName);
    if (sanitized.length === 0) {
      // Nothing usable survives sanitizing; writing ".pdf" would collide across forms.
      process.stdout.write(` ⚠ unusable file name for: ${formName}\n`);
      failed.push(formName);
      continue;
    }
    const destPath = path.join(outputDir, `${sanitized}.pdf`);

    // Skip already downloaded
    if (await fileExists(destPath)) {
      process.stdout.write(` ⊙ skip: ${sanitized}\n`);
      continue;
    }

    // Click the form name to open preview. getByText avoids building a selector
    // string, which breaks on names that contain double quotes.
    const nameEl = page.getByText(formName, { exact: true }).first();
    if (!(await isVisibleWithin(nameEl, 3_000))) {
      process.stdout.write(` ⚠ not found: ${sanitized}\n`);
      failed.push(sanitized);
      continue;
    }
    await nameEl.click().catch(() => {});

    // Give the preview/modal a moment to start rendering.
    await page.waitForTimeout(2000);

    // Click Download button in the preview (waits up to 5s for it to show).
    const downloadBtn = page
      .locator('button:has-text("Download"), a:has-text("Download"), [aria-label*="download" i], button[title*="download" i]')
      .first();
    if (!(await isVisibleWithin(downloadBtn, 5_000))) {
      process.stdout.write(` ⚠ no Download button found for: ${sanitized}\n`);
      await page.screenshot({ path: `scripts/debug-no-download-btn-${Date.now()}.png` });
      await page.keyboard.press('Escape').catch(() => {});
      await page.waitForTimeout(500);
      failed.push(sanitized);
      continue;
    }

    try {
      const [download] = await Promise.all([
        page.waitForEvent('download', { timeout: 20_000 }),
        downloadBtn.click(),
      ]);
      await download.saveAs(destPath);
      process.stdout.write(` ✓ ${sanitized}.pdf\n`);
      downloaded.push(sanitized);
    } catch (err: unknown) {
      process.stdout.write(` ✗ download failed: ${sanitized} — ${errorMessage(err).slice(0, 60)}\n`);
      failed.push(sanitized);
    }

    // Close preview and return to list
    await page.keyboard.press('Escape').catch(() => {});
    await page.waitForTimeout(800);
  }
}

/**
 * Legacy fallback — kept for safety but downloadFormsInView handles all cases now.
 *
 * Finds form names by scanning row-like DOM elements whose text contains "Add",
 * clicks each one, sniffs the PDF URL from network responses and fetches it
 * inside the page so the session cookies are sent.
 *
 * @param page       Page showing a forms list.
 * @param outputDir  Directory the PDFs are written to.
 * @param downloaded Accumulator; sanitized names of saved forms are appended.
 * @param failed     Accumulator; sanitized names of forms that failed are appended.
 */
async function downloadViaTextRows(
  page: Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
): Promise<void> {
  console.log(' (downloadViaTextRows called — should not reach here normally)');

  const rows = await page.evaluate(() => {
    const candidates = Array.from(document.querySelectorAll('tr, li, [class*="row"], [class*="item"]'));
    return candidates
      .map(el => ({
        text: el.textContent?.replace(/\s+/g, ' ').trim() ?? '',
        hasAdd: el.textContent?.includes('Add') ?? false,
      }))
      .filter(r => r.hasAdd && r.text.length > 10)
      .map(r => r.text.replace(/\s*Add\s*$/, '').trim());
  });

  console.log(` Found ${rows.length} form names via text extraction`);

  for (const formName of rows) {
    if (!formName || formName.length < 3) continue;
    const sanitized = sanitizeFileName(formName);
    if (sanitized.length === 0) continue;
    const destPath = path.join(outputDir, `${sanitized}.pdf`);

    if (await fileExists(destPath)) {
      process.stdout.write(` ⊙ skip: ${sanitized}\n`);
      continue;
    }

    const el = page.getByText(formName, { exact: true }).first();
    if (!(await isVisibleWithin(el, 2_000))) {
      process.stdout.write(` ⚠ not visible: ${sanitized}\n`);
      continue;
    }

    const pdfUrl = await interceptPdfOnClick(page, el);
    if (pdfUrl) {
      try {
        const bytes = await page.evaluate(async (url) => {
          const r = await fetch(url, { credentials: 'include' });
          // Without this check an HTTP error page would be saved as a .pdf.
          if (!r.ok) throw new Error(`HTTP ${r.status}`);
          const ab = await r.arrayBuffer();
          return Array.from(new Uint8Array(ab));
        }, pdfUrl);
        await fs.writeFile(destPath, Buffer.from(bytes));
        process.stdout.write(` ✓ ${sanitized}.pdf\n`);
        downloaded.push(sanitized);
      } catch {
        process.stdout.write(` ✗ ${sanitized}\n`);
        failed.push(sanitized);
      }
    } else {
      process.stdout.write(` ⚠ no PDF: ${sanitized}\n`);
      failed.push(sanitized);
    }

    await page.keyboard.press('Escape').catch(() => {});
    await page.waitForTimeout(500);
  }
}

/**
 * Clicks `row` and returns the URL of the first response that looks like a PDF
 * (content-type contains "pdf" or the URL ends in ".pdf").
 *
 * @returns The PDF URL, or null when no such response arrives within 5 seconds.
 */
async function interceptPdfOnClick(
  page: Page,
  row: Locator
): Promise<string | null> {
  // waitForResponse is started before the click so a fast response cannot be missed.
  const [response] = await Promise.all([
    page
      .waitForResponse(
        res => (res.headers()['content-type'] ?? '').includes('pdf') || res.url().endsWith('.pdf'),
        { timeout: 5_000 }
      )
      .catch(() => null),
    row.click({ timeout: 3_000 }).catch(() => {}),
  ]);
  return response?.url() ?? null;
}

main().catch((err: unknown) => {
  console.error('Fatal:', errorMessage(err));
  process.exit(1);
});