Files
red/teressa-copeland-homes/scripts/scrape-skyslope-forms.ts
Chandler Copeland 9117dc4c02 initial install
2026-04-08 12:54:58 -06:00

432 lines
17 KiB
TypeScript

/**
* SkySlope Forms Scraper — via Utah Real Estate SSO
*
* Flow: utahrealestate.com login → Forms → MLS Forms → SkySlope → download all library PDFs
*
* Run: npm run scrape:forms
*
* Credentials read from .env.local:
* URE_USERNAME — Utah Real Estate username
* URE_PASSWORD — Utah Real Estate password
*/
import { chromium } from 'playwright';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { tmpdir } from 'node:os';
import AdmZip from 'adm-zip';
import { config } from 'dotenv';
// Pull credentials from .env.local in the directory the script is launched from.
config({ path: path.resolve(process.cwd(), '.env.local') });

// Utah Real Estate SSO credentials — both must be present.
const { URE_USERNAME, URE_PASSWORD } = process.env;

// Destination folder for every downloaded PDF.
const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');

// Bail out early when either credential is absent or empty.
if (!(URE_USERNAME && URE_PASSWORD)) {
  console.error('Missing required env vars: URE_USERNAME, URE_PASSWORD');
  process.exit(1);
}
/**
 * Entry point: logs in to utahrealestate.com (restoring a saved session when one
 * exists), waits for the user to complete 2FA if the site asks for it, follows
 * the SkySlope SSO link and hands the resulting page to downloadAllForms().
 *
 * The browser is closed in a `finally` so a failure in any step does not leave
 * a browser process behind.
 *
 * @throws Error when the credentials are missing or any navigation step fails.
 */
async function main() {
  if (!URE_USERNAME || !URE_PASSWORD) {
    throw new Error('Missing required env vars: URE_USERNAME, URE_PASSWORD');
  }

  // Playwright storage state saved after 2FA so later runs can skip it.
  const COOKIE_FILE = path.resolve(process.cwd(), 'scripts/.ure-session.json');
  // The 2FA prompt is recognised purely by text that appears on the page.
  const awaiting2FA = (text: string): boolean =>
    text.includes('verification code') || text.includes('one-time');

  await fs.mkdir(OUTPUT_DIR, { recursive: true });
  const browser = await chromium.launch({ headless: false });
  try {
    // A missing or unparsable session file simply means "start logged out".
    const savedState = await fs.readFile(COOKIE_FILE, 'utf8').then(JSON.parse).catch(() => null);
    const context = await browser.newContext({
      acceptDownloads: true,
      storageState: savedState ?? undefined,
    });
    const page = await context.newPage();

    // ── Step 1: Login to Utah Real Estate ─────────────────────────────────────
    console.log('Navigating to utahrealestate.com...');
    await page.goto('https://www.utahrealestate.com/auth/login', { waitUntil: 'domcontentloaded', timeout: 30_000 });
    await page.waitForTimeout(2000);
    console.log(`Login page URL: ${page.url()}`);
    await page.screenshot({ path: 'scripts/debug-ure-login.png' });

    // Only fill the login form if we are still on a login URL
    // ('/auth/login' is covered by the '/login' check).
    if (page.url().includes('/login')) {
      const usernameInput = page.locator('input[name="username"], input[name="user"], input[id*="user"], input[placeholder*="user" i]').first();
      const passwordInput = page.locator('input[type="password"]').first();
      await usernameInput.waitFor({ timeout: 10_000 });
      await usernameInput.fill(URE_USERNAME);
      await passwordInput.fill(URE_PASSWORD);
      console.log('Credentials filled, submitting...');
      await page.locator('button[type="submit"], input[type="submit"], button:has-text("Login"), button:has-text("Sign In"), button:has-text("Log In")').first().click();
      await page.waitForLoadState('domcontentloaded');
      await page.waitForTimeout(3000);
    } else {
      console.log('Already logged in (session restored).');
    }
    console.log(`After login URL: ${page.url()}`);
    await page.screenshot({ path: 'scripts/debug-ure-after-login.png' });

    // ── Step 1b: Handle 2FA if present ────────────────────────────────────────
    const pageText = await page.locator('body').innerText().catch(() => '');
    if (awaiting2FA(pageText)) {
      console.log('\n⚡ 2FA detected — please complete it in the browser window.');
      console.log(' (Select Text/Email, enter the code, and click Submit)');
      console.log(' Waiting up to 2 minutes for you to finish...\n');

      // Poll every 2s until the 2FA text is gone (up to 2 minutes).
      const deadline = Date.now() + 120_000;
      let completed = false;
      while (Date.now() < deadline) {
        await page.waitForTimeout(2000);
        const text = await page.locator('body').innerText().catch(() => '');
        if (!awaiting2FA(text)) {
          completed = true;
          break;
        }
        process.stdout.write('.');
      }
      console.log();
      await page.waitForLoadState('domcontentloaded');
      await page.waitForTimeout(2000);
      console.log(`After 2FA URL: ${page.url()}`);

      if (completed) {
        // Save session so we skip 2FA next time.
        await context.storageState({ path: COOKIE_FILE });
        console.log('Session saved — 2FA will be skipped on next run.');
      } else {
        // Do not persist (or claim to have persisted) a session that never passed 2FA.
        console.warn('2FA was not completed within 2 minutes — session not saved; continuing anyway.');
      }
    }

    // ── Step 2: Navigate directly to SkySlope SSO URL ────────────────────────
    console.log('Navigating to SkySlope via SSO...');
    const [newPage] = await Promise.all([
      // Resolves to null when no new tab opens within the timeout.
      context.waitForEvent('page', { timeout: 15_000 }).catch(() => null),
      page.goto('https://www.utahrealestate.com/sso/connect/client/skyslope', { waitUntil: 'domcontentloaded' }),
    ]);
    // Prefer the new tab when the SSO link opened one; otherwise stay on the current page.
    const activePage = newPage ?? page;
    await activePage.waitForLoadState('domcontentloaded');
    await activePage.waitForTimeout(5000);
    console.log(`SkySlope URL: ${activePage.url()}`);
    await activePage.screenshot({ path: 'scripts/debug-skyslope-landing.png' });

    await downloadAllForms(activePage, context, OUTPUT_DIR);
    if (newPage) await newPage.close();
  } finally {
    await browser.close();
  }
  console.log('\nDone.');
}
/**
 * Opens the SkySlope libraries page, visits every library it can find and
 * downloads the forms in each one via downloadFormsInView().
 *
 * Library discovery order:
 *  1. anchors/cards matched by selector that carry an `href` (visited by URL);
 *  2. otherwise, a fixed list of known library names clicked by text, followed
 *     by a scan of whatever page is currently showing.
 *
 * @param page      Page already authenticated against SkySlope.
 * @param context   Browser context, forwarded to downloadFormsInView().
 * @param outputDir Directory the PDFs are written to.
 */
async function downloadAllForms(
  page: import('playwright').Page,
  context: import('playwright').BrowserContext,
  outputDir: string
) {
  const downloaded: string[] = [];
  const failed: string[] = [];

  // locator.isVisible() ignores its `timeout` option and returns immediately,
  // so waiting for an element has to go through waitFor().
  const appearsWithin = (target: import('playwright').Locator, timeoutMs: number): Promise<boolean> =>
    target.waitFor({ state: 'visible', timeout: timeoutMs }).then(() => true, () => false);

  // Handle NRDS auth if it appears
  await handleNRDSAuth(page);

  // Wait for forms library to load
  await page.waitForTimeout(4000);
  console.log(`Forms library URL: ${page.url()}`);

  // Navigate to Browse Libraries (click the link if present, else go by URL)
  console.log('Clicking Browse Libraries...');
  const browseLink = page.locator('a:has-text("Browse Libraries"), a[href*="libraries"], nav a:has-text("Libraries")').first();
  if (await appearsWithin(browseLink, 5_000)) {
    await browseLink.click();
    await page.waitForLoadState('domcontentloaded');
    await page.waitForTimeout(3000);
  } else {
    await page.goto('https://forms.skyslope.com/libraries', { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(3000);
  }
  console.log(`Libraries URL: ${page.url()}`);
  await page.screenshot({ path: 'scripts/debug-libraries-page.png' });
  const bodyText2 = await page.locator('body').innerText().catch(() => '');
  console.log('Libraries page text (first 600):', bodyText2.slice(0, 600));

  // Find all library cards/links
  const libraryLinks = await page.locator(
    'a[href*="/library/"], a[href*="/libraries/"], [class*="library-card"], [class*="libraryCard"]'
  ).all();
  console.log(`Found ${libraryLinks.length} library links`);

  // Collect distinct hrefs: the same library can be matched by several selectors,
  // and class-matched cards may have no href at all.
  const libraryHrefs = new Set<string>();
  for (const link of libraryLinks) {
    const href = await link.getAttribute('href').catch(() => null);
    if (!href || libraryHrefs.has(href)) continue;
    const name = await link.textContent().catch(() => '');
    libraryHrefs.add(href);
    console.log(` Library: ${name?.trim().slice(0, 50)} → ${href}`);
  }

  if (libraryHrefs.size > 0) {
    for (const href of libraryHrefs) {
      const url = href.startsWith('http') ? href : `https://forms.skyslope.com${href}`;
      console.log(`\n── Opening library: ${url} ──`);
      await page.goto(url, { waitUntil: 'domcontentloaded' });
      await page.waitForTimeout(4000);
      await page.screenshot({ path: `scripts/debug-library-${Date.now()}.png` });
      await downloadFormsInView(page, context, outputDir, downloaded, failed);
    }
  } else {
    // Fallback: libraries might be listed as clickable items without usable hrefs
    const libraryNames = ['Data Forms - URE', 'Utah Association of Realtors', 'Utah CCIM'];
    for (const libName of libraryNames) {
      const libLink = page.locator(`a:has-text("${libName}"), button:has-text("${libName}")`).first();
      if (await appearsWithin(libLink, 3_000)) {
        console.log(`\n── Library: ${libName} ──`);
        await libLink.click();
        await page.waitForLoadState('domcontentloaded');
        await page.waitForTimeout(4000);
        await downloadFormsInView(page, context, outputDir, downloaded, failed);
        await page.goBack();
        await page.waitForTimeout(2000);
      }
    }
    // Finally scan the page we are on, in case the forms are listed here directly.
    console.log('All forms visible on current page — downloading...');
    await downloadFormsInView(page, context, outputDir, downloaded, failed);
  }

  console.log(`\n✓ Downloaded ${downloaded.length} forms`);
  if (failed.length > 0) console.log(`✗ Failed: ${failed.length} — ${failed.join(', ')}`);
}
/**
 * Fills in the NRDS authorization form when SkySlope shows it; returns without
 * doing anything when the form does not appear within 5 seconds.
 *
 * The last name and NRDS id come from SKYSLOPE_LAST_NAME / SKYSLOPE_NRDS_ID,
 * with built-in fallbacks when those are unset or empty.
 *
 * @param page Page that may be showing the NRDS prompt.
 */
async function handleNRDSAuth(page: import('playwright').Page) {
  // NOTE(review): personal identifiers are hard-coded as fallbacks here — consider
  // requiring the env vars instead. `||` is deliberate so an empty value also falls back.
  const LAST_NAME = process.env.SKYSLOPE_LAST_NAME || 'Copeland';
  const NRDS_ID = process.env.SKYSLOPE_NRDS_ID || '837075029';

  // Check if NRDS auth page appears. locator.isVisible() ignores its timeout and
  // returns immediately, so waitFor() is used to actually give the form time to render.
  const nrdsField = page.locator('input[placeholder*="last" i], input[placeholder*="nrds" i]').first();
  const isNRDS = await nrdsField
    .waitFor({ state: 'visible', timeout: 5_000 })
    .then(() => true, () => false);
  if (!isNRDS) return;

  console.log('NRDS authorization required — filling...');
  // The form is filled positionally: first input = last name, second = NRDS id.
  const [lastNameInput, nrdsIdInput] = await page.locator('input').all();
  if (!lastNameInput || !nrdsIdInput) {
    console.warn('NRDS form detected but fewer than two inputs were found — skipping.');
    return;
  }
  await lastNameInput.fill(LAST_NAME);
  await nrdsIdInput.fill(NRDS_ID);
  await page.locator('button:has-text("Next"), button[type="submit"]').first().click();
  await page.waitForLoadState('domcontentloaded');
  await page.waitForTimeout(3000);
  console.log(`After NRDS URL: ${page.url()}`);
}
/**
 * Pulls form names out of the page's visible text.
 *
 * After trimming and dropping blank lines, a line counts as a form name when
 * the line right after it is exactly "Add", and the line itself is not "Add"
 * and is longer than three characters. Names are returned in page order and
 * may repeat.
 *
 * @param bodyText Text of the page body, one visual row per line.
 * @returns The matching form names.
 */
function extractFormNames(bodyText: string): string[] {
  const rows = bodyText
    .split('\n')
    .map(row => row.trim())
    .filter(row => row !== '');

  return rows.filter(
    (row, idx) => rows[idx + 1] === 'Add' && row !== 'Add' && row.length > 3
  );
}
/**
 * Downloads every form listed on the current page.
 *
 * Flow: scroll repeatedly to load the full list and collect form names, then
 * for each name: click it → wait for the preview → click Download → save.
 * Forms whose target file already exists are skipped.
 *
 * @param page       Page showing a forms list.
 * @param context    Not used by this function; kept so existing call sites stay valid.
 * @param outputDir  Directory the PDFs are written to.
 * @param downloaded Accumulator; the sanitized name of each saved form is appended.
 * @param failed     Accumulator; the sanitized name of each form that could not be saved is appended.
 */
async function downloadFormsInView(
  page: import('playwright').Page,
  context: import('playwright').BrowserContext,
  outputDir: string,
  downloaded: string[],
  failed: string[]
) {
  // Candidate inner scroll container, for lists that scroll inside an element rather than the window.
  const SCROLLABLE_SELECTOR =
    '[class*="scroll"], [class*="list"], main, [role="main"], .overflow-auto, .overflow-y-auto';

  // locator.isVisible() ignores its `timeout` option and returns immediately,
  // so waiting for an element has to go through waitFor().
  const appearsWithin = (target: import('playwright').Locator, timeoutMs: number): Promise<boolean> =>
    target.waitFor({ state: 'visible', timeout: timeoutMs }).then(() => true, () => false);

  // Scroll down repeatedly to trigger infinite scroll, collecting all form names.
  // Stop once five consecutive rounds add no new name.
  const allNames = new Set<string>();
  let prevCount = 0;
  let stallRounds = 0;
  while (stallRounds < 5) {
    await page.evaluate((selector) => {
      window.scrollTo(0, document.body.scrollHeight);
      const scrollable = document.querySelector(selector);
      if (scrollable) scrollable.scrollTop = scrollable.scrollHeight;
    }, SCROLLABLE_SELECTOR);
    await page.waitForTimeout(2000);
    const bodyText = await page.locator('body').innerText().catch(() => '');
    for (const n of extractFormNames(bodyText)) allNames.add(n);
    if (allNames.size === prevCount) {
      stallRounds++;
    } else {
      stallRounds = 0;
      prevCount = allNames.size;
      process.stdout.write(` Loaded ${allNames.size} forms so far...\n`);
    }
  }

  // Scroll back to top before clicking — both the window and the inner container,
  // otherwise the container stays parked at the bottom of the list.
  await page.evaluate((selector) => {
    window.scrollTo(0, 0);
    const scrollable = document.querySelector(selector);
    if (scrollable) scrollable.scrollTop = 0;
  }, SCROLLABLE_SELECTOR);
  await page.waitForTimeout(500);

  const names = [...allNames];
  console.log(` Found ${names.length} forms to download`);
  if (names.length === 0) {
    await page.screenshot({ path: `scripts/debug-no-forms-${Date.now()}.png` });
    return;
  }

  for (const formName of names) {
    // File-system-safe name: anything outside [a-z0-9 ._-] becomes a space, capped at 100 chars.
    const sanitized = formName.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim().slice(0, 100);
    const destPath = path.join(outputDir, `${sanitized}.pdf`);

    // Skip already downloaded
    try { await fs.access(destPath); process.stdout.write(` ⊙ skip: ${sanitized}\n`); continue; } catch { /* proceed */ }

    // Click the form name to open preview. getByText avoids building a selector
    // string, so names containing quotes still match.
    const nameEl = page.getByText(formName, { exact: true }).first();
    if (!await appearsWithin(nameEl, 3_000)) {
      process.stdout.write(` ⚠ not found: ${sanitized}\n`);
      failed.push(sanitized);
      continue;
    }
    await nameEl.click().catch(() => {});

    // Give the preview/modal a moment to open before looking for its Download button.
    await page.waitForTimeout(2000);

    // Click Download button in the preview (waits up to 5s for it to show)
    const downloadBtn = page.locator(
      'button:has-text("Download"), a:has-text("Download"), [aria-label*="download" i], button[title*="download" i]'
    ).first();
    if (!await appearsWithin(downloadBtn, 5_000)) {
      process.stdout.write(` ⚠ no Download button found for: ${sanitized}\n`);
      await page.screenshot({ path: `scripts/debug-no-download-btn-${Date.now()}.png` });
      await page.keyboard.press('Escape').catch(() => {});
      await page.waitForTimeout(500);
      failed.push(sanitized);
      continue;
    }

    try {
      // Start listening for the download before clicking so the event cannot be missed.
      const [download] = await Promise.all([
        page.waitForEvent('download', { timeout: 20_000 }),
        downloadBtn.click(),
      ]);
      const tmpPath = path.join(tmpdir(), `skyslope-${Date.now()}.tmp`);
      await download.saveAs(tmpPath);
      await savePdf(tmpPath, destPath);
      process.stdout.write(` ✓ ${sanitized}.pdf\n`);
      downloaded.push(sanitized);
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      process.stdout.write(` ✗ download failed: ${sanitized} — ${reason.slice(0, 60)}\n`);
      failed.push(sanitized);
    }

    // Close preview and return to list
    await page.keyboard.press('Escape').catch(() => {});
    await page.waitForTimeout(800);
  }
}
/**
 * Legacy fallback downloader: derives form names from row-like DOM elements
 * containing "Add", clicks each one, sniffs the PDF URL from network traffic
 * and fetches it inside the page (so the request is made with `credentials: 'include'`).
 *
 * Nothing in the visible part of this file calls it; downloadFormsInView() is
 * the primary path.
 *
 * @param page       Page showing a forms list.
 * @param outputDir  Directory the PDFs are written to.
 * @param downloaded Accumulator; the sanitized name of each saved form is appended.
 * @param failed     Accumulator; the sanitized name of each form that could not be saved is appended.
 */
async function downloadViaTextRows(
  page: import('playwright').Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
) {
  // Legacy fallback — kept for safety but downloadFormsInView handles all cases now
  console.log(' (downloadViaTextRows called — should not reach here normally)');

  // Collect the text of every row-like element that mentions "Add", minus a trailing "Add".
  const rows = await page.evaluate(() => {
    const candidates = Array.from(document.querySelectorAll('tr, li, [class*="row"], [class*="item"]'));
    return candidates
      .map(el => ({
        text: el.textContent?.replace(/\s+/g, ' ').trim() ?? '',
        hasAdd: el.textContent?.includes('Add') ?? false,
      }))
      .filter(r => r.hasAdd && r.text.length > 10)
      .map(r => r.text.replace(/\s*Add\s*$/, '').trim());
  });
  console.log(` Found ${rows.length} form names via text extraction`);

  for (const formName of rows) {
    if (!formName || formName.length < 3) continue;
    const sanitized = formName.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim().slice(0, 100);
    const destPath = path.join(outputDir, `${sanitized}.pdf`);
    try { await fs.access(destPath); process.stdout.write(` ⊙ skip: ${sanitized}\n`); continue; } catch { /* proceed */ }

    // getByText avoids building a selector string, so names containing quotes still match.
    const el = page.getByText(formName, { exact: true }).first();
    // locator.isVisible() ignores its timeout, so wait explicitly.
    const visible = await el
      .waitFor({ state: 'visible', timeout: 2_000 })
      .then(() => true, () => false);
    if (!visible) {
      process.stdout.write(` ⚠ not visible: ${sanitized}\n`);
      failed.push(sanitized);
      continue;
    }

    const pdfUrl = await interceptPdfOnClick(page, el);
    if (pdfUrl) {
      try {
        const buf = await page.evaluate(async (url) => {
          const r = await fetch(url, { credentials: 'include' });
          // Without this check an error page would be written to disk as a ".pdf".
          if (!r.ok) throw new Error(`HTTP ${r.status}`);
          const ab = await r.arrayBuffer();
          // Plain number array: the result has to be serialised back to Node.
          return Array.from(new Uint8Array(ab));
        }, pdfUrl);
        await fs.writeFile(destPath, Buffer.from(buf));
        process.stdout.write(` ✓ ${sanitized}.pdf\n`);
        downloaded.push(sanitized);
      } catch {
        process.stdout.write(` ✗ ${sanitized}\n`);
        failed.push(sanitized);
      }
    } else {
      process.stdout.write(` ⚠ no PDF: ${sanitized}\n`);
      failed.push(sanitized);
    }

    // Dismiss any preview that the click opened.
    await page.keyboard.press('Escape').catch(() => {});
    await page.waitForTimeout(500);
  }
}
/**
 * Clicks `row` and watches the page's network responses for a PDF.
 *
 * A response counts as a PDF when its content-type contains "pdf" or its URL
 * ends in ".pdf". Responses are observed during the click and for 2 seconds
 * afterwards; the first match wins.
 *
 * @param page Page whose responses are observed.
 * @param row  Element to click.
 * @returns URL of the first PDF response seen, or null when none arrived.
 */
async function interceptPdfOnClick(
  page: import('playwright').Page,
  row: import('playwright').Locator
): Promise<string | null> {
  // Held in an object so the listener's write is visible to the return below.
  const seen: { url: string | null } = { url: null };

  const handler = (response: import('playwright').Response) => {
    if (seen.url !== null) return; // keep only the first match
    const ct = response.headers()['content-type'] ?? '';
    if (ct.includes('pdf') || response.url().endsWith('.pdf')) {
      seen.url = response.url();
    }
  };

  page.on('response', handler);
  try {
    // A failed click is tolerated: the caller treats "no PDF seen" as the failure signal.
    await row.click({ timeout: 3_000 }).catch(() => {});
    await page.waitForTimeout(2000);
  } finally {
    // Always detach, even if the page went away mid-wait.
    page.off('response', handler);
  }
  return seen.url;
}
/**
 * Stores a downloaded file at `destPath` as a PDF and removes the temp file.
 *
 * If the download is a ZIP (starts with the "PK" magic bytes), the first entry
 * whose name ends in ".pdf" is extracted; otherwise the bytes are written as-is.
 * The bytes are written rather than renamed so this also works when the temp
 * directory and the destination live on different filesystems.
 *
 * @param tmpPath  Path of the freshly downloaded file; deleted before returning, on success or failure.
 * @param destPath Final path of the PDF.
 * @throws Error when the ZIP contains no PDF entry, or on any file-system error.
 */
async function savePdf(tmpPath: string, destPath: string): Promise<void> {
  try {
    const buf = await fs.readFile(tmpPath);
    const isPk = buf.length >= 2 && buf[0] === 0x50 && buf[1] === 0x4b; // PK magic bytes = ZIP
    if (isPk) {
      const zip = new AdmZip(buf);
      const entry = zip.getEntries().find(e => e.entryName.toLowerCase().endsWith('.pdf'));
      if (!entry) throw new Error('ZIP contained no PDF entry');
      await fs.writeFile(destPath, entry.getData());
    } else {
      // The content is already in memory; writing it avoids rename()'s EXDEV
      // failure across mount points.
      await fs.writeFile(destPath, buf);
    }
  } finally {
    // Best-effort cleanup so failed conversions do not pile up in the temp dir.
    await fs.unlink(tmpPath).catch(() => {});
  }
}
// Run the scraper; report any failure and exit with a non-zero status.
main().catch((err: unknown) => {
  // Rejections are not guaranteed to be Error instances, so narrow before reading `.message`.
  const message = err instanceof Error ? err.message : String(err);
  console.error('Fatal:', message);
  process.exit(1);
});