wip: skyslope scraper — fix name extraction via body text parsing, preview+download flow ready
BIN
teressa-copeland-homes/scripts/debug-after-click.png
Normal file
|
After Width: | Height: | Size: 170 KiB |
BIN
teressa-copeland-homes/scripts/debug-after-login.png
Normal file
|
After Width: | Height: | Size: 1.2 MiB |
43
teressa-copeland-homes/scripts/debug-dom.ts
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import { chromium } from 'playwright';
|
||||||
|
import { config } from 'dotenv';
|
||||||
|
import path from 'path';
|
||||||
|
config({ path: path.resolve(process.cwd(), '.env.local') });

/**
 * Debug helper: opens SkySlope through the Utah Real Estate SSO URL, navigates
 * to the browse-libraries page and prints the tag, class and leading text of
 * the first "Add" button and up to five of its ancestors.
 *
 * The browser is always closed, even when a step throws; failures are logged
 * and reported through a non-zero exit code.
 */
(async () => {
  // Reuse a previously saved storage state when the file exists and parses;
  // any read/parse failure falls back to a fresh, unauthenticated context.
  const { readFile } = await import('node:fs/promises');
  const sessionFile = path.resolve(process.cwd(), 'scripts/.ure-session.json');
  const savedState = await readFile(sessionFile, 'utf8').then(JSON.parse).catch(() => null);

  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({ storageState: savedState ?? undefined });
    // The page must be created from `context`: browser.newPage() would open it
    // in a separate context, so the saved session would not apply and
    // context.waitForEvent('page') could never observe a tab it opens.
    const page = await context.newPage();

    // Start listening BEFORE navigating so a tab opened during the SSO
    // redirect is not missed. A timeout resolves to null (no new tab).
    const popupPromise = context.waitForEvent('page', { timeout: 10_000 }).catch(() => null);
    await page.goto('https://www.utahrealestate.com/sso/connect/client/skyslope', { waitUntil: 'domcontentloaded' });
    const newPage = await popupPromise;

    const activePage = newPage ?? page;
    await activePage.waitForLoadState('domcontentloaded');
    await activePage.waitForTimeout(3000);

    // Navigate to browse-libraries
    await activePage.goto('https://forms.skyslope.com/browse-libraries', { waitUntil: 'domcontentloaded' });
    await activePage.waitForTimeout(4000);

    // Inspect the first Add button's ancestors (runs inside the browser page)
    const result = await activePage.evaluate(() => {
      const addBtns = Array.from(document.querySelectorAll('button')).filter(b => b.textContent?.trim() === 'Add');
      if (addBtns.length === 0) return 'No Add buttons found';

      // Walk up the tree and record each level's tag, class, and text
      const info: string[] = [];
      let el: Element | null = addBtns[0] ?? null;
      for (let depth = 0; depth < 6 && el; depth++) {
        // getAttribute keeps this a plain string even for SVG elements,
        // whose `className` is an object rather than a string.
        const cls = el.getAttribute('class') ?? '';
        const text = el.textContent?.replace(/\s+/g, ' ').trim().slice(0, 80);
        info.push(`[${depth}] <${el.tagName.toLowerCase()} class="${cls}"> text="${text}"`);
        el = el.parentElement;
      }
      return info.join('\n');
    });

    console.log('Add button ancestry:\n', result);
  } finally {
    await browser.close();
  }
})().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
47
teressa-copeland-homes/scripts/debug-dom2.ts
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
import { chromium } from 'playwright';
|
||||||
|
import { config } from 'dotenv';
|
||||||
|
import path from 'path';
|
||||||
|
config({ path: path.resolve(process.cwd(), '.env.local') });

/**
 * Debug helper: opens SkySlope through the Utah Real Estate SSO URL in a
 * headed browser, dumps button texts and the first few row-like elements on
 * the browse-libraries page, then clicks one known form name and records a
 * screenshot plus the button texts present after the click.
 *
 * The browser is always closed, even when a step throws; failures are logged
 * and reported through a non-zero exit code.
 */
(async () => {
  // Reuse a previously saved storage state when the file exists and parses;
  // any read/parse failure falls back to a fresh context.
  const { readFile } = await import('node:fs/promises');
  const sessionFile = path.resolve(process.cwd(), 'scripts/.ure-session.json');
  const savedState = await readFile(sessionFile, 'utf8').then(JSON.parse).catch(() => null);

  const browser = await chromium.launch({ headless: false });
  try {
    const context = await browser.newContext({ storageState: savedState ?? undefined, acceptDownloads: true });
    const page = await context.newPage();

    // Register the listener BEFORE navigating: a tab opened while goto() is
    // still in flight would otherwise be missed and the wait would always
    // run to its 10s timeout. A timeout resolves to null (no new tab).
    const popupPromise = context.waitForEvent('page', { timeout: 10_000 }).catch(() => null);
    await page.goto('https://www.utahrealestate.com/sso/connect/client/skyslope', { waitUntil: 'domcontentloaded' });
    const newPage = await popupPromise;

    const activePage = newPage ?? page;
    await activePage.waitForLoadState('domcontentloaded');
    await activePage.waitForTimeout(3000);
    await activePage.goto('https://forms.skyslope.com/browse-libraries', { waitUntil: 'domcontentloaded' });
    await activePage.waitForTimeout(5000);

    // Count all buttons and get the text of the first ten (runs in the page)
    const result = await activePage.evaluate(() => {
      const allBtns = Array.from(document.querySelectorAll('button'));
      const btnTexts = allBtns.slice(0, 10).map(b => `"${b.textContent?.trim()}"`);
      // Also get the first row-like elements
      const rows = Array.from(document.querySelectorAll('tr, [class*="row"], [class*="item"]'))
        .slice(0, 3)
        .map(el => {
          const cls = el.className.toString().slice(0, 50);
          const text = el.textContent?.replace(/\s+/g, ' ').trim().slice(0, 60);
          return `<${el.tagName} class="${cls}"> "${text}"`;
        });
      return { totalBtns: allBtns.length, btnTexts, rows };
    });
    console.log(JSON.stringify(result, null, 2));

    // Try clicking the first form name and take a screenshot
    const firstFormName = activePage.locator('text="Acknowledgement of Third Party Approval Addendum to REPC - UAR"').first();
    if (await firstFormName.isVisible({ timeout: 3000 }).catch(() => false)) {
      console.log('Clicking first form name...');
      await firstFormName.click();
      await activePage.waitForTimeout(3000);
      await activePage.screenshot({ path: 'scripts/debug-after-click.png' });
      console.log('Screenshot saved: debug-after-click.png');
      // Get the button texts present now
      const btns = await activePage.locator('button').allTextContents();
      console.log('Buttons after click:', btns.slice(0, 15));
    }
  } finally {
    await browser.close();
  }
})().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
BIN
teressa-copeland-homes/scripts/debug-forms-home.png
Normal file
|
After Width: | Height: | Size: 6.5 KiB |
BIN
teressa-copeland-homes/scripts/debug-forms-library.png
Normal file
|
After Width: | Height: | Size: 118 KiB |
BIN
teressa-copeland-homes/scripts/debug-forms-rendered.png
Normal file
|
After Width: | Height: | Size: 260 KiB |
30
teressa-copeland-homes/scripts/debug-inspect.ts
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import { chromium } from 'playwright';
|
||||||
|
import { config } from 'dotenv';
|
||||||
|
import path from 'path';
|
||||||
|
config({ path: path.resolve(process.cwd(), '.env.local') });

/**
 * Debug helper: loads the SkySlope forms-login page headlessly, prints the
 * type/name/id/placeholder of every <input>, prints login-looking links, and
 * saves a viewport screenshot to scripts/debug-inspect.png.
 *
 * The browser is always closed, even when a step throws; failures are logged
 * and reported through a non-zero exit code.
 */
(async () => {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto('https://skyslope.com/forms-login/');
    await page.waitForLoadState('networkidle');

    const inputs = await page.locator('input').all();
    console.log('Input count:', inputs.length);
    for (const input of inputs) {
      // Read the identifying attributes inside the page.
      const attrs = await input.evaluate((el: HTMLInputElement) => ({
        type: el.type,
        name: el.name,
        id: el.id,
        placeholder: el.placeholder,
      }));
      console.log('Input:', attrs);
    }

    const links = await page.locator('a[href*="login"], a[href*="signin"], a[href*="forms.skyslope"]').all();
    for (const link of links) {
      const text = await link.textContent();
      const href = await link.getAttribute('href');
      console.log('Link:', text?.trim().slice(0, 60), '→', href);
    }

    await page.screenshot({ path: 'scripts/debug-inspect.png', fullPage: false });
  } finally {
    await browser.close();
  }
})().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
28
teressa-copeland-homes/scripts/debug-inspect2.ts
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
import { chromium } from 'playwright';
|
||||||
|
import { config } from 'dotenv';
|
||||||
|
import path from 'path';
|
||||||
|
config({ path: path.resolve(process.cwd(), '.env.local') });

/**
 * Debug helper: opens forms.skyslope.com headlessly, prints the URL reached,
 * saves a screenshot to scripts/debug-forms-home.png, and prints every link or
 * button whose text or href looks like a login / sign-in entry point.
 *
 * The browser is always closed, even when a step throws; failures are logged
 * and reported through a non-zero exit code.
 */
(async () => {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();

    // Try the direct forms app URL first
    await page.goto('https://forms.skyslope.com', { waitUntil: 'domcontentloaded', timeout: 30_000 });
    await page.waitForTimeout(3000);

    console.log('URL after goto forms.skyslope.com:', page.url());
    await page.screenshot({ path: 'scripts/debug-forms-home.png' });

    // Look for any login/sign-in link among all anchors and buttons
    const loginPattern = /login|sign.?in|log.?in|get started|access/i;
    const candidates = await page.locator('a, button').all();
    for (const el of candidates) {
      // Both calls can yield null (no text / no href attribute); normalise to
      // '' so the pattern is never tested against the literal string "null".
      const text = (await el.textContent().catch(() => '')) ?? '';
      const href = (await el.getAttribute('href').catch(() => '')) ?? '';
      // Test the two fields separately so a match cannot straddle the
      // boundary between the end of the text and the start of the href.
      if (loginPattern.test(text) || loginPattern.test(href)) {
        console.log('Found login element:', text.trim().slice(0, 60), href.slice(0, 80));
      }
    }
  } finally {
    await browser.close();
  }
})().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
38
teressa-copeland-homes/scripts/debug-inspect3.ts
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import { chromium } from 'playwright';
|
||||||
|
import { config } from 'dotenv';
|
||||||
|
import path from 'path';
|
||||||
|
config({ path: path.resolve(process.cwd(), '.env.local') });

/**
 * Debug helper: opens forms.skyslope.com/?tab=all in a headed browser, saves a
 * screenshot to scripts/debug-forms-rendered.png, and prints the first 500
 * characters of body text, a count of form-like elements, whether the text
 * mentions NRDS/authorization, and the text of each role="tab" element.
 *
 * The browser is always closed, even when a step throws; failures are logged
 * and reported through a non-zero exit code.
 */
(async () => {
  const browser = await chromium.launch({ headless: false });
  try {
    const page = await browser.newPage();

    await page.goto('https://forms.skyslope.com/?tab=all', { waitUntil: 'domcontentloaded', timeout: 30_000 });
    // Wait for JS to render
    await page.waitForTimeout(5000);

    console.log('URL:', page.url());
    await page.screenshot({ path: 'scripts/debug-forms-rendered.png', fullPage: false });
    console.log('Screenshot saved');

    // Look for form items; an unreadable body is treated as empty text
    const bodyText = await page.locator('body').innerText().catch(() => '');
    console.log('Page text (first 500 chars):', bodyText.slice(0, 500));

    // Look for any list items or cards
    const items = await page.locator('[class*="form"], [class*="Form"], [class*="item"], [class*="card"]').count();
    console.log('Form-like elements:', items);

    // Look for the NRDS/authorization prompt (case-insensitive)
    const lowerBody = bodyText.toLowerCase();
    const hasNRDS = lowerBody.includes('nrds') || lowerBody.includes('authorization');
    console.log('Has NRDS/auth prompt:', hasNRDS);

    // Check if we're on a tab with forms
    const tabs = await page.locator('[role="tab"]').all();
    for (const tab of tabs) {
      console.log('Tab:', await tab.textContent());
    }

    // Brief pause before closing the headed browser
    await page.waitForTimeout(2000);
  } finally {
    await browser.close();
  }
})().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
BIN
teressa-copeland-homes/scripts/debug-libraries-page.png
Normal file
|
After Width: | Height: | Size: 68 KiB |
BIN
teressa-copeland-homes/scripts/debug-login.png
Normal file
|
After Width: | Height: | Size: 1.1 MiB |
BIN
teressa-copeland-homes/scripts/debug-no-forms-1773983047186.png
Normal file
|
After Width: | Height: | Size: 68 KiB |
BIN
teressa-copeland-homes/scripts/debug-no-rows-1773982331600.png
Normal file
|
After Width: | Height: | Size: 118 KiB |
BIN
teressa-copeland-homes/scripts/debug-skyslope-landing.png
Normal file
|
After Width: | Height: | Size: 118 KiB |
BIN
teressa-copeland-homes/scripts/debug-ure-after-login.png
Normal file
|
After Width: | Height: | Size: 139 KiB |
BIN
teressa-copeland-homes/scripts/debug-ure-filled.png
Normal file
|
After Width: | Height: | Size: 737 KiB |
BIN
teressa-copeland-homes/scripts/debug-ure-forms-menu.png
Normal file
|
After Width: | Height: | Size: 147 KiB |
BIN
teressa-copeland-homes/scripts/debug-ure-forms-page.png
Normal file
|
After Width: | Height: | Size: 141 KiB |
BIN
teressa-copeland-homes/scripts/debug-ure-login.png
Normal file
|
After Width: | Height: | Size: 139 KiB |
@@ -1,230 +1,385 @@
|
|||||||
/**
|
/**
|
||||||
* SkySlope Forms Scraper
|
* SkySlope Forms Scraper — via Utah Real Estate SSO
|
||||||
*
|
*
|
||||||
* Downloads all PDFs from your SkySlope form libraries into seeds/forms/.
|
* Flow: utahrealestate.com login → Forms → MLS Forms → SkySlope → download all library PDFs
|
||||||
* Run: DOTENV_CONFIG_PATH=.env.local npx tsx scripts/scrape-skyslope-forms.ts
|
|
||||||
*
|
*
|
||||||
* Credentials are read from env vars:
|
* Run: npm run scrape:forms
|
||||||
* SKYSLOPE_LAST_NAME — last name on NAR records (e.g. Copeland)
|
*
|
||||||
* SKYSLOPE_NRDS_ID — 9-digit NRDS ID (e.g. 837075029)
|
* Credentials read from .env.local:
|
||||||
|
* URE_USERNAME — Utah Real Estate username
|
||||||
|
* URE_PASSWORD — Utah Real Estate password
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { chromium } from 'playwright';
|
import { chromium } from 'playwright';
|
||||||
import * as fs from 'node:fs/promises';
|
import * as fs from 'node:fs/promises';
|
||||||
import * as path from 'node:path';
|
import * as path from 'node:path';
|
||||||
|
import { config } from 'dotenv';
|
||||||
|
|
||||||
const LAST_NAME = process.env.SKYSLOPE_LAST_NAME;
|
config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||||
const NRDS_ID = process.env.SKYSLOPE_NRDS_ID;
|
|
||||||
|
const URE_USERNAME = process.env.URE_USERNAME;
|
||||||
|
const URE_PASSWORD = process.env.URE_PASSWORD;
|
||||||
const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');
|
const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');
|
||||||
const BASE_URL = 'https://forms.skyslope.com';
|
|
||||||
|
|
||||||
if (!LAST_NAME || !NRDS_ID) {
|
if (!URE_USERNAME || !URE_PASSWORD) {
|
||||||
console.error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
|
console.error('Missing required env vars: URE_USERNAME, URE_PASSWORD');
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
await fs.mkdir(OUTPUT_DIR, { recursive: true });
|
await fs.mkdir(OUTPUT_DIR, { recursive: true });
|
||||||
|
|
||||||
const browser = await chromium.launch({ headless: false }); // visible so you can watch/intervene
|
const browser = await chromium.launch({ headless: false });
|
||||||
const context = await browser.newContext({ acceptDownloads: true });
|
const savedState = await fs.readFile(path.resolve(process.cwd(), 'scripts/.ure-session.json'), 'utf8').then(JSON.parse).catch(() => null);
|
||||||
|
const context = await browser.newContext({
|
||||||
|
acceptDownloads: true,
|
||||||
|
storageState: savedState ?? undefined,
|
||||||
|
});
|
||||||
const page = await context.newPage();
|
const page = await context.newPage();
|
||||||
|
|
||||||
console.log('Navigating to SkySlope Forms...');
|
// ── Step 1: Login to Utah Real Estate ─────────────────────────────────────
|
||||||
await page.goto(`${BASE_URL}/welcome/authorization`);
|
console.log('Navigating to utahrealestate.com...');
|
||||||
|
await page.goto('https://www.utahrealestate.com/auth/login', { waitUntil: 'domcontentloaded', timeout: 30_000 });
|
||||||
|
await page.waitForTimeout(2000);
|
||||||
|
|
||||||
// Fill NRDS authorization form
|
console.log(`Login page URL: ${page.url()}`);
|
||||||
console.log('Filling NRDS credentials...');
|
await page.screenshot({ path: 'scripts/debug-ure-login.png' });
|
||||||
await page.getByLabel(/last name/i).fill(LAST_NAME!);
|
|
||||||
await page.getByLabel(/nrds/i).fill(NRDS_ID!);
|
|
||||||
await page.getByRole('button', { name: /next/i }).click();
|
|
||||||
|
|
||||||
// Wait for the main forms library page to load
|
const COOKIE_FILE = path.resolve(process.cwd(), 'scripts/.ure-session.json');
|
||||||
console.log('Waiting for forms library...');
|
|
||||||
await page.waitForURL(/forms\.skyslope\.com\/(?!welcome)/, { timeout: 30_000 });
|
|
||||||
await page.waitForLoadState('networkidle');
|
|
||||||
|
|
||||||
// Find all library tabs/sections
|
// Only fill login form if we're actually on the login page
|
||||||
const libraries = await page.$$eval(
|
if (page.url().includes('/auth/login') || page.url().includes('/login')) {
|
||||||
'[class*="library"], [class*="Library"], [data-testid*="library"]',
|
const usernameInput = page.locator('input[name="username"], input[name="user"], input[id*="user"], input[placeholder*="user" i]').first();
|
||||||
els => els.map(el => ({ text: el.textContent?.trim(), id: el.id }))
|
const passwordInput = page.locator('input[type="password"]').first();
|
||||||
);
|
|
||||||
console.log(`Found ${libraries.length} library elements`);
|
|
||||||
|
|
||||||
// Collect all form links across all tabs
|
await usernameInput.waitFor({ timeout: 10_000 });
|
||||||
|
await usernameInput.fill(URE_USERNAME!);
|
||||||
|
await passwordInput.fill(URE_PASSWORD!);
|
||||||
|
|
||||||
|
console.log('Credentials filled, submitting...');
|
||||||
|
await page.locator('button[type="submit"], input[type="submit"], button:has-text("Login"), button:has-text("Sign In"), button:has-text("Log In")').first().click();
|
||||||
|
await page.waitForLoadState('domcontentloaded');
|
||||||
|
await page.waitForTimeout(3000);
|
||||||
|
} else {
|
||||||
|
console.log('Already logged in (session restored).');
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`After login URL: ${page.url()}`);
|
||||||
|
await page.screenshot({ path: 'scripts/debug-ure-after-login.png' });
|
||||||
|
|
||||||
|
// ── Step 1b: Handle 2FA if present ────────────────────────────────────────
|
||||||
|
const pageText = await page.locator('body').innerText().catch(() => '');
|
||||||
|
if (pageText.includes('verification code') || pageText.includes('one-time')) {
|
||||||
|
console.log('\n⚡ 2FA detected — please complete it in the browser window.');
|
||||||
|
console.log(' (Select Text/Email, enter the code, and click Submit)');
|
||||||
|
console.log(' Waiting up to 2 minutes for you to finish...\n');
|
||||||
|
|
||||||
|
// Poll every 2s until 2FA page is gone (up to 2 minutes)
|
||||||
|
const deadline = Date.now() + 120_000;
|
||||||
|
while (Date.now() < deadline) {
|
||||||
|
await page.waitForTimeout(2000);
|
||||||
|
const text = await page.locator('body').innerText().catch(() => '');
|
||||||
|
if (!text.includes('verification code') && !text.includes('one-time')) break;
|
||||||
|
process.stdout.write('.');
|
||||||
|
}
|
||||||
|
console.log();
|
||||||
|
await page.waitForLoadState('domcontentloaded');
|
||||||
|
await page.waitForTimeout(2000);
|
||||||
|
console.log(`After 2FA URL: ${page.url()}`);
|
||||||
|
|
||||||
|
// Save session so we skip 2FA next time
|
||||||
|
await context.storageState({ path: COOKIE_FILE });
|
||||||
|
console.log('Session saved — 2FA will be skipped on next run.');
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Step 2: Navigate directly to SkySlope SSO URL ────────────────────────
|
||||||
|
console.log('Navigating to SkySlope via SSO...');
|
||||||
|
|
||||||
|
const [newPage] = await Promise.all([
|
||||||
|
context.waitForEvent('page', { timeout: 15_000 }).catch(() => null),
|
||||||
|
page.goto('https://www.utahrealestate.com/sso/connect/client/skyslope', { waitUntil: 'domcontentloaded' }),
|
||||||
|
]);
|
||||||
|
|
||||||
|
// The SSO link opens a new tab
|
||||||
|
const activePage = newPage ?? page;
|
||||||
|
await activePage.waitForLoadState('domcontentloaded');
|
||||||
|
await activePage.waitForTimeout(5000);
|
||||||
|
console.log(`SkySlope URL: ${activePage.url()}`);
|
||||||
|
await activePage.screenshot({ path: 'scripts/debug-skyslope-landing.png' });
|
||||||
|
|
||||||
|
await downloadAllForms(activePage, context, OUTPUT_DIR);
|
||||||
|
if (newPage) await newPage.close();
|
||||||
|
|
||||||
|
await browser.close();
|
||||||
|
console.log('\nDone.');
|
||||||
|
}
|
||||||
|
|
||||||
|
async function downloadAllForms(
|
||||||
|
page: import('playwright').Page,
|
||||||
|
context: import('playwright').BrowserContext,
|
||||||
|
outputDir: string
|
||||||
|
) {
|
||||||
const downloaded: string[] = [];
|
const downloaded: string[] = [];
|
||||||
const failed: string[] = [];
|
const failed: string[] = [];
|
||||||
|
|
||||||
// Navigate to the forms list — look for a tab or section with all forms
|
// Handle NRDS auth if it appears
|
||||||
// Try clicking "All" tab if it exists
|
await handleNRDSAuth(page);
|
||||||
const allTab = page.getByRole('tab', { name: /all/i }).first();
|
|
||||||
if (await allTab.isVisible().catch(() => false)) {
|
// Wait for forms library to load
|
||||||
await allTab.click();
|
await page.waitForTimeout(4000);
|
||||||
await page.waitForLoadState('networkidle');
|
console.log(`Forms library URL: ${page.url()}`);
|
||||||
|
|
||||||
|
// Navigate to Browse Libraries
|
||||||
|
console.log('Clicking Browse Libraries...');
|
||||||
|
const browseLink = page.locator('a:has-text("Browse Libraries"), a[href*="libraries"], nav a:has-text("Libraries")').first();
|
||||||
|
if (await browseLink.isVisible({ timeout: 5_000 }).catch(() => false)) {
|
||||||
|
await browseLink.click();
|
||||||
|
await page.waitForLoadState('domcontentloaded');
|
||||||
|
await page.waitForTimeout(3000);
|
||||||
|
} else {
|
||||||
|
await page.goto('https://forms.skyslope.com/libraries', { waitUntil: 'domcontentloaded' });
|
||||||
|
await page.waitForTimeout(3000);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scrape all visible form items across all library sections
|
console.log(`Libraries URL: ${page.url()}`);
|
||||||
await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
|
await page.screenshot({ path: 'scripts/debug-libraries-page.png' });
|
||||||
|
|
||||||
// Also try each library tab
|
const bodyText2 = await page.locator('body').innerText().catch(() => '');
|
||||||
const tabs = await page.getByRole('tab').all();
|
console.log('Libraries page text (first 600):', bodyText2.slice(0, 600));
|
||||||
for (const tab of tabs) {
|
|
||||||
const tabName = await tab.textContent();
|
// Find all library cards/links
|
||||||
if (!tabName) continue;
|
const libraryLinks = await page.locator(
|
||||||
console.log(`\nChecking library tab: ${tabName.trim()}`);
|
'a[href*="/library/"], a[href*="/libraries/"], [class*="library-card"], [class*="libraryCard"]'
|
||||||
await tab.click();
|
).all();
|
||||||
await page.waitForLoadState('networkidle');
|
console.log(`Found ${libraryLinks.length} library links`);
|
||||||
await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
|
|
||||||
|
if (libraryLinks.length > 0) {
|
||||||
|
const libraryHrefs: string[] = [];
|
||||||
|
for (const link of libraryLinks) {
|
||||||
|
const href = await link.getAttribute('href').catch(() => '');
|
||||||
|
const name = await link.textContent().catch(() => '');
|
||||||
|
if (href) {
|
||||||
|
libraryHrefs.push(href);
|
||||||
|
console.log(` Library: ${name?.trim().slice(0, 50)} → ${href}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`\n✓ Downloaded ${downloaded.length} forms to ${OUTPUT_DIR}`);
|
for (const href of libraryHrefs) {
|
||||||
if (failed.length > 0) {
|
const url = href.startsWith('http') ? href : `https://forms.skyslope.com${href}`;
|
||||||
console.log(`✗ Failed: ${failed.join(', ')}`);
|
console.log(`\n── Opening library: ${url} ──`);
|
||||||
|
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
||||||
|
await page.waitForTimeout(4000);
|
||||||
|
await page.screenshot({ path: `scripts/debug-library-${Date.now()}.png` });
|
||||||
|
await downloadFormsInView(page, context, outputDir, downloaded, failed);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Fallback: libraries might be listed as clickable items
|
||||||
|
const bodyText = await page.locator('body').innerText().catch(() => '');
|
||||||
|
const libraryNames = ['Data Forms - URE', 'Utah Association of Realtors', 'Utah CCIM'];
|
||||||
|
|
||||||
|
for (const libName of libraryNames) {
|
||||||
|
const libLink = page.locator(`a:has-text("${libName}"), button:has-text("${libName}")`).first();
|
||||||
|
if (await libLink.isVisible({ timeout: 3_000 }).catch(() => false)) {
|
||||||
|
console.log(`\n── Library: ${libName} ──`);
|
||||||
|
await libLink.click();
|
||||||
|
await page.waitForLoadState('domcontentloaded');
|
||||||
|
await page.waitForTimeout(4000);
|
||||||
|
await downloadFormsInView(page, context, outputDir, downloaded, failed);
|
||||||
|
await page.goBack();
|
||||||
|
await page.waitForTimeout(2000);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
await browser.close();
|
if (libraryLinks.length === 0) {
|
||||||
|
// We're already on the all-forms page — download directly
|
||||||
|
console.log('All forms visible on current page — downloading...');
|
||||||
|
await downloadFormsInView(page, context, outputDir, downloaded, failed);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function downloadFormsOnPage(
|
console.log(`\n✓ Downloaded ${downloaded.length} forms`);
|
||||||
|
if (failed.length > 0) console.log(`✗ Failed: ${failed.length} — ${failed.join(', ')}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Fills and submits the NRDS authorization form when it is showing.
 *
 * Detection: waits up to 5s for an input whose placeholder contains "last" or
 * "nrds" (case-insensitive) to be visible; if none is, returns without doing
 * anything. Otherwise the first two visible inputs are filled with the last
 * name and NRDS ID, the Next/submit button is clicked, and the function waits
 * for the page to settle.
 *
 * @param page - Playwright page that may currently show the NRDS prompt.
 */
async function handleNRDSAuth(page: import('playwright').Page): Promise<void> {
  // NOTE(review): the fallbacks below embed what look like real identifiers in
  // source. They are kept so existing runs behave the same; consider requiring
  // SKYSLOPE_LAST_NAME / SKYSLOPE_NRDS_ID from the environment instead.
  const LAST_NAME = process.env.SKYSLOPE_LAST_NAME || 'Copeland';
  const NRDS_ID = process.env.SKYSLOPE_NRDS_ID || '837075029';

  // Check if the NRDS auth page appears; any lookup error counts as "not shown"
  const isNRDS = await page
    .locator('input[placeholder*="last" i], input[placeholder*="nrds" i]')
    .first()
    .isVisible({ timeout: 5_000 })
    .catch(() => false);
  if (!isNRDS) return;

  console.log('NRDS authorization required — filling...');
  // Only visible inputs: a hidden field ahead of the form would otherwise take
  // slot 0/1 and fill() on it would stall until its timeout and then throw.
  const [lastNameInput, nrdsInput] = await page.locator('input:visible').all();
  if (!lastNameInput || !nrdsInput) {
    // Previously a silent no-op; say so, because the caller carries on regardless.
    console.warn('NRDS prompt detected but fewer than two visible inputs found — skipping fill.');
    return;
  }

  await lastNameInput.fill(LAST_NAME);
  await nrdsInput.fill(NRDS_ID);
  await page.locator('button:has-text("Next"), button[type="submit"]').first().click();
  await page.waitForLoadState('domcontentloaded');
  await page.waitForTimeout(3000);
  console.log(`After NRDS URL: ${page.url()}`);
}
|
||||||
|
|
||||||
|
async function downloadFormsInView(
|
||||||
page: import('playwright').Page,
|
page: import('playwright').Page,
|
||||||
|
context: import('playwright').BrowserContext,
|
||||||
outputDir: string,
|
outputDir: string,
|
||||||
downloaded: string[],
|
downloaded: string[],
|
||||||
failed: string[]
|
failed: string[]
|
||||||
) {
|
) {
|
||||||
// Look for form items — SkySlope renders them as list rows with a download or view button
|
// Flow: click form name → preview opens → click Download button → save file
|
||||||
// Strategy: intercept PDF responses by clicking each form's download/view button
|
|
||||||
const formRows = await page.$$('[class*="form-item"], [class*="formItem"], [class*="form-row"], [role="row"], [class*="list-item"]');
|
|
||||||
|
|
||||||
if (formRows.length === 0) {
|
// Extract form names from the page body text — the list renders as "Name\nAdd\nName\nAdd..."
|
||||||
// Fallback: look for any link or button that mentions PDF or has a form name
|
const bodyText = await page.locator('body').innerText().catch(() => '');
|
||||||
console.log(' No form rows found with primary selectors, trying fallback...');
|
const lines = bodyText.split('\n').map(l => l.trim()).filter(l => l.length > 3);
|
||||||
await downloadViaLinks(page, outputDir, downloaded, failed);
|
const formNames: string[] = [];
|
||||||
|
for (let i = 0; i < lines.length; i++) {
|
||||||
|
if (lines[i] === 'Add' && i > 0 && lines[i - 1] !== 'Add' && lines[i - 1].length > 3) {
|
||||||
|
formNames.push(lines[i - 1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const names = [...new Set(formNames)];
|
||||||
|
|
||||||
|
console.log(` Found ${names.length} forms to download`);
|
||||||
|
if (names.length === 0) {
|
||||||
|
await page.screenshot({ path: `scripts/debug-no-forms-${Date.now()}.png` });
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(` Found ${formRows.length} form rows`);
|
for (const formName of names) {
|
||||||
|
const sanitized = formName.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim().slice(0, 100);
|
||||||
for (const row of formRows) {
|
|
||||||
const name = await row.$eval(
|
|
||||||
'[class*="name"], [class*="title"], span, td',
|
|
||||||
el => el.textContent?.trim()
|
|
||||||
).catch(() => null);
|
|
||||||
|
|
||||||
if (!name) continue;
|
|
||||||
|
|
||||||
// Skip already downloaded
|
|
||||||
const sanitized = name.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim();
|
|
||||||
const destPath = path.join(outputDir, `${sanitized}.pdf`);
|
const destPath = path.join(outputDir, `${sanitized}.pdf`);
|
||||||
|
|
||||||
try {
|
// Skip already downloaded
|
||||||
await fs.access(destPath);
|
try { await fs.access(destPath); process.stdout.write(` ⊙ skip: ${sanitized}\n`); continue; } catch { /* proceed */ }
|
||||||
console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
|
|
||||||
|
// Click the form name to open preview
|
||||||
|
const nameEl = page.locator(`text="${formName}"`).first();
|
||||||
|
if (!await nameEl.isVisible({ timeout: 3_000 }).catch(() => false)) {
|
||||||
|
process.stdout.write(` ⚠ not found: ${sanitized}\n`);
|
||||||
|
failed.push(sanitized);
|
||||||
continue;
|
continue;
|
||||||
} catch {
|
|
||||||
// file doesn't exist, proceed
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the download/view button within this row
|
await nameEl.click().catch(() => {});
|
||||||
const downloadBtn = await row.$('button[aria-label*="download" i], button[title*="download" i], [class*="download"]');
|
// Wait for preview/modal to appear (up to 5s)
|
||||||
const viewBtn = await row.$('button[aria-label*="view" i], button[title*="view" i], [class*="view"]');
|
await page.waitForTimeout(2000);
|
||||||
const btn = downloadBtn ?? viewBtn;
|
|
||||||
|
|
||||||
if (!btn) {
|
// Click Download button in the preview
|
||||||
console.log(` ⚠ No download button found for: ${name}`);
|
const downloadBtn = page.locator(
|
||||||
|
'button:has-text("Download"), a:has-text("Download"), [aria-label*="download" i], button[title*="download" i]'
|
||||||
|
).first();
|
||||||
|
|
||||||
|
if (!await downloadBtn.isVisible({ timeout: 5_000 }).catch(() => false)) {
|
||||||
|
process.stdout.write(` ⚠ no Download button found for: ${sanitized}\n`);
|
||||||
|
await page.screenshot({ path: `scripts/debug-no-download-btn-${Date.now()}.png` });
|
||||||
|
await page.keyboard.press('Escape').catch(() => {});
|
||||||
|
await page.waitForTimeout(500);
|
||||||
|
failed.push(sanitized);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Intercept the PDF download
|
|
||||||
const [download] = await Promise.all([
|
const [download] = await Promise.all([
|
||||||
page.waitForEvent('download', { timeout: 15_000 }),
|
page.waitForEvent('download', { timeout: 20_000 }),
|
||||||
btn.click(),
|
downloadBtn.click(),
|
||||||
]).catch(async () => {
|
]);
|
||||||
// If no download event, try waiting for a new page/tab with PDF
|
await download.saveAs(destPath);
|
||||||
const newPagePromise = page.context().waitForEvent('page', { timeout: 10_000 });
|
process.stdout.write(` ✓ ${sanitized}.pdf\n`);
|
||||||
await btn.click();
|
|
||||||
const newPage = await newPagePromise;
|
|
||||||
await newPage.waitForLoadState();
|
|
||||||
const url = newPage.url();
|
|
||||||
if (url.endsWith('.pdf') || url.includes('/pdf')) {
|
|
||||||
const pdfBuffer = await newPage.evaluate(async (pdfUrl) => {
|
|
||||||
const res = await fetch(pdfUrl);
|
|
||||||
const buf = await res.arrayBuffer();
|
|
||||||
return Array.from(new Uint8Array(buf));
|
|
||||||
}, url);
|
|
||||||
await fs.writeFile(destPath, Buffer.from(pdfBuffer));
|
|
||||||
await newPage.close();
|
|
||||||
return [null];
|
|
||||||
}
|
|
||||||
await newPage.close();
|
|
||||||
return [null];
|
|
||||||
});
|
|
||||||
|
|
||||||
if (download && typeof download === 'object' && 'saveAs' in download) {
|
|
||||||
await (download as import('playwright').Download).saveAs(destPath);
|
|
||||||
console.log(` ✓ ${sanitized}.pdf`);
|
|
||||||
downloaded.push(sanitized);
|
downloaded.push(sanitized);
|
||||||
} else {
|
|
||||||
console.log(` ✓ ${sanitized}.pdf (via page)`);
|
|
||||||
downloaded.push(sanitized);
|
|
||||||
}
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.log(` ✗ Failed: ${name} — ${(err as Error).message}`);
|
process.stdout.write(` ✗ download failed: ${sanitized} — ${(err as Error).message.slice(0, 60)}\n`);
|
||||||
failed.push(name);
|
failed.push(sanitized);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Close preview and return to list
|
||||||
|
await page.keyboard.press('Escape').catch(() => {});
|
||||||
|
await page.waitForTimeout(800);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Legacy fallback downloader: scrapes form names from the page text and
 * fetches each form's PDF.
 *
 * Reads the text of every row-like element (`tr`, `li`, and elements whose
 * class contains "row" or "item") that mentions "Add", strips a trailing
 * "Add" label, and treats the remainder as a form name. For each name it
 * clicks the matching text, waits for a PDF response via
 * `interceptPdfOnClick`, then re-fetches that URL inside the page with
 * credentials included and writes `<outputDir>/<sanitized name>.pdf`.
 * Files that already exist on disk are skipped.
 *
 * @param page - Playwright page showing the list of forms.
 * @param outputDir - Directory the PDFs are written into.
 * @param downloaded - Receives the sanitized name of each form that was saved.
 * @param failed - Receives the sanitized name of each form that could not be saved.
 */
async function downloadViaTextRows(
  page: import('playwright').Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
): Promise<void> {
  // Legacy fallback — kept for safety but downloadFormsInView handles all cases now
  console.log(' (downloadViaTextRows called — should not reach here normally)');

  const scraped = await page.evaluate(() => {
    const candidates = Array.from(
      document.querySelectorAll('tr, li, [class*="row"], [class*="item"]')
    );
    return candidates
      .map(el => ({
        text: el.textContent?.replace(/\s+/g, ' ').trim() ?? '',
        hasAdd: el.textContent?.includes('Add') ?? false,
      }))
      .filter(r => r.hasAdd && r.text.length > 10)
      .map(r => r.text.replace(/\s*Add\s*$/, '').trim());
  });

  // The selectors above overlap (an <li> may also carry an "item" class, and
  // may sit inside a "row" container), so the same name can be scraped more
  // than once. De-duplicate so a failing form is attempted and reported once.
  const rows = [...new Set(scraped)];
  console.log(` Found ${rows.length} form names via text extraction`);

  for (const formName of rows) {
    if (!formName || formName.length < 3) continue;

    const sanitized = formName
      .replace(/[^a-z0-9 ._-]/gi, ' ')
      .replace(/\s+/g, ' ')
      .trim()
      .slice(0, 100);
    // A name made only of stripped characters would otherwise be saved as ".pdf".
    if (!sanitized) continue;

    const destPath = path.join(outputDir, `${sanitized}.pdf`);

    // fs.access rejects when the file is missing, which means "go download it".
    const alreadySaved = await fs.access(destPath).then(
      () => true,
      () => false
    );
    if (alreadySaved) {
      process.stdout.write(` ⊙ skip: ${sanitized}\n`);
      continue;
    }

    // getByText takes the name as a plain string, so quotes inside a form
    // name cannot break the selector the way an interpolated `text="…"` can.
    const el = page.getByText(formName, { exact: true }).first();
    if (!(await el.isVisible({ timeout: 2_000 }).catch(() => false))) {
      process.stdout.write(` ⚠ not visible: ${sanitized}\n`);
      continue;
    }

    const pdfUrl = await interceptPdfOnClick(page, el);
    if (pdfUrl) {
      try {
        // Fetch from inside the page so the browser session's cookies are sent.
        const bytes = await page.evaluate(async (url) => {
          const r = await fetch(url, { credentials: 'include' });
          // Without this check an HTTP error page would be written out as a PDF.
          if (!r.ok) throw new Error(`HTTP ${r.status}`);
          const ab = await r.arrayBuffer();
          return Array.from(new Uint8Array(ab));
        }, pdfUrl);
        await fs.writeFile(destPath, Buffer.from(bytes));
        process.stdout.write(` ✓ ${sanitized}.pdf\n`);
        downloaded.push(sanitized);
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        process.stdout.write(` ✗ ${sanitized} — ${reason.slice(0, 60)}\n`);
        failed.push(sanitized);
      }
    } else {
      process.stdout.write(` ⚠ no PDF: ${sanitized}\n`);
      failed.push(sanitized);
    }

    // Dismiss whatever the click opened before moving on to the next name.
    await page.keyboard.press('Escape').catch(() => {});
    await page.waitForTimeout(500);
  }
}
|
||||||
|
|
||||||
|
/**
 * Clicks `row` and reports the URL of the first PDF response the page
 * receives as a result.
 *
 * A response counts as a PDF when its `content-type` header contains "pdf"
 * or its URL path (ignoring any query string or fragment) ends in ".pdf".
 * Listening starts before the click. If no PDF has been seen by the time the
 * click finishes, the function waits up to a further 2 seconds.
 *
 * @param page - Page whose network responses are observed.
 * @param row - Element to click; click errors are ignored (3 s click timeout).
 * @returns The PDF response URL, or `null` if none arrived in time.
 */
async function interceptPdfOnClick(
  page: import('playwright').Page,
  row: import('playwright').Locator
): Promise<string | null> {
  const pdfUrls: string[] = [];
  let signalPdfSeen: () => void = () => {};
  const pdfSeen = new Promise<void>(resolve => {
    signalPdfSeen = resolve;
  });

  const handler = (response: import('playwright').Response) => {
    if (pdfUrls.length > 0) return; // only the first PDF response matters
    const ct = response.headers()['content-type'] ?? '';
    const url = response.url();
    // Compare the suffix on the path only, so "file.pdf?sig=…" still matches.
    const pathOnly = url.split(/[?#]/, 1)[0] ?? url;
    if (ct.includes('pdf') || pathOnly.toLowerCase().endsWith('.pdf')) {
      pdfUrls.push(url);
      signalPdfSeen();
    }
  };

  page.on('response', handler);
  try {
    await row.click({ timeout: 3_000 }).catch(() => {});
    if (pdfUrls.length === 0) {
      // Stop waiting as soon as a PDF shows up instead of always sleeping 2 s.
      await Promise.race([pdfSeen, page.waitForTimeout(2000)]);
    }
    return pdfUrls[0] ?? null;
  } finally {
    // Always detach the listener, including when the page closes mid-wait and
    // waitForTimeout rejects; that rejection now reaches the caller instead of
    // leaving a promise that never settles.
    page.off('response', handler);
  }
}
|
||||||
|
|
||||||
// Script entry point: run the scraper and exit non-zero on any failure that
// escapes main().
main().catch((err: unknown) => {
  // A rejection is not guaranteed to be an Error; print the raw value in that
  // case so the output is never "Fatal: undefined".
  console.error('Fatal:', err instanceof Error ? err.message : err);
  process.exit(1);
});
|
||||||
|
|||||||