wip: skyslope scraper — fix name extraction via body text parsing, preview+download flow ready

This commit is contained in:
Chandler Copeland
2026-03-19 23:06:17 -06:00
parent 1983f2c8cd
commit ac5b98fe33
21 changed files with 497 additions and 156 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 170 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 MiB

View File

@@ -0,0 +1,43 @@
import { chromium } from 'playwright';
import { config } from 'dotenv';
import path from 'path';
config({ path: path.resolve(process.cwd(), '.env.local') });
/**
 * Debug helper: opens SkySlope through the Utah Real Estate SSO link using the
 * saved session (if any), navigates to the browse-libraries page, and prints the
 * tag, class, and leading text of the first "Add" button and up to five of its
 * ancestors.
 */
(async () => {
  // A missing or unparsable session file yields null, i.e. a fresh context.
  const savedState = await import('node:fs/promises').then(fs =>
    fs.readFile(path.resolve(process.cwd(), 'scripts/.ure-session.json'), 'utf8').then(JSON.parse).catch(() => null)
  );
  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({ storageState: savedState ?? undefined });
    // The page has to be created from `context`: browser.newPage() makes its own
    // context, which would drop the saved session and never raise a 'page'
    // event on `context`.
    const page = await context.newPage();
    // Register the popup listener before navigating so a tab opened during the
    // SSO redirect is not missed; a timeout simply means no new tab was opened.
    const [newPage] = await Promise.all([
      context.waitForEvent('page', { timeout: 10_000 }).catch(() => null),
      page.goto('https://www.utahrealestate.com/sso/connect/client/skyslope', { waitUntil: 'domcontentloaded' }),
    ]);
    const activePage = newPage ?? page;
    await activePage.waitForLoadState('domcontentloaded');
    await activePage.waitForTimeout(3000);
    // Navigate to browse-libraries
    await activePage.goto('https://forms.skyslope.com/browse-libraries', { waitUntil: 'domcontentloaded' });
    await activePage.waitForTimeout(4000);
    // Inspect the first Add button's ancestors
    const result = await activePage.evaluate(() => {
      const addBtns = Array.from(document.querySelectorAll('button')).filter(b => b.textContent?.trim() === 'Add');
      if (addBtns.length === 0) return 'No Add buttons found';
      const btn = addBtns[0];
      // Walk up the tree and record each element's tag, class, and text
      // (index 0 is the button itself).
      const info: string[] = [];
      let el: Element | null = btn;
      for (let i = 0; i < 6; i++) {
        if (!el) break;
        info.push(`[${i}] <${el.tagName.toLowerCase()} class="${el.className}"> text="${el.textContent?.replace(/\s+/g,' ').trim().slice(0,80)}"`);
        el = el.parentElement;
      }
      return info.join('\n');
    });
    console.log('Add button ancestry:\n', result);
  } finally {
    // Always release the browser process, even when a step above throws.
    await browser.close();
  }
})();

View File

@@ -0,0 +1,47 @@
import { chromium } from 'playwright';
import { config } from 'dotenv';
import path from 'path';
config({ path: path.resolve(process.cwd(), '.env.local') });
/**
 * Debug helper: opens SkySlope through the Utah Real Estate SSO link in a
 * visible browser, dumps button texts and row-like elements from the
 * browse-libraries page, then clicks one known form name and records a
 * screenshot plus the buttons present afterwards.
 */
(async () => {
  // A missing or unparsable session file yields null, i.e. a fresh context.
  const savedState = await import('node:fs/promises').then(fs =>
    fs.readFile(path.resolve(process.cwd(), 'scripts/.ure-session.json'), 'utf8').then(JSON.parse).catch(() => null)
  );
  const browser = await chromium.launch({ headless: false });
  try {
    const context = await browser.newContext({ storageState: savedState ?? undefined, acceptDownloads: true });
    const page = await context.newPage();
    // Register the popup listener before navigating so a tab opened during the
    // SSO redirect is not missed; a timeout simply means no new tab was opened.
    const [newPage] = await Promise.all([
      context.waitForEvent('page', { timeout: 10_000 }).catch(() => null),
      page.goto('https://www.utahrealestate.com/sso/connect/client/skyslope', { waitUntil: 'domcontentloaded' }),
    ]);
    const activePage = newPage ?? page;
    await activePage.waitForLoadState('domcontentloaded');
    await activePage.waitForTimeout(3000);
    await activePage.goto('https://forms.skyslope.com/browse-libraries', { waitUntil: 'domcontentloaded' });
    await activePage.waitForTimeout(5000);
    // Count all buttons and get the text of the first 10
    const result = await activePage.evaluate(() => {
      const allBtns = Array.from(document.querySelectorAll('button'));
      const btnTexts = allBtns.slice(0, 10).map(b => `"${b.textContent?.trim()}"`);
      // Also get the first three row-like elements
      const rows = Array.from(document.querySelectorAll('tr, [class*="row"], [class*="item"]'))
        .slice(0, 3)
        .map(el => `<${el.tagName} class="${el.className.toString().slice(0,50)}"> "${el.textContent?.replace(/\s+/g,' ').trim().slice(0,60)}"`);
      return { totalBtns: allBtns.length, btnTexts, rows };
    });
    console.log(JSON.stringify(result, null, 2));
    // Try clicking first form name and screenshot
    const firstFormName = activePage.locator('text="Acknowledgement of Third Party Approval Addendum to REPC - UAR"').first();
    // isVisible() ignores its timeout option and returns immediately, so wait
    // explicitly for the element to become visible instead.
    const formNameVisible = await firstFormName
      .waitFor({ state: 'visible', timeout: 3000 })
      .then(() => true)
      .catch(() => false);
    if (formNameVisible) {
      console.log('Clicking first form name...');
      await firstFormName.click();
      await activePage.waitForTimeout(3000);
      await activePage.screenshot({ path: 'scripts/debug-after-click.png' });
      console.log('Screenshot saved: debug-after-click.png');
      // Get all buttons visible now
      const btns = await activePage.locator('button').allTextContents();
      console.log('Buttons after click:', btns.slice(0, 15));
    }
  } finally {
    // Always release the browser process, even when a step above throws.
    await browser.close();
  }
})();

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 118 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 260 KiB

View File

@@ -0,0 +1,30 @@
import { chromium } from 'playwright';
import { config } from 'dotenv';
import path from 'path';
config({ path: path.resolve(process.cwd(), '.env.local') });
/**
 * Debug helper: loads the SkySlope forms-login page, logs the identifying
 * attributes of every <input> and the text/href of every login-like link,
 * then saves a viewport screenshot.
 */
(async function inspectLoginPage() {
  const browser = await chromium.launch({ headless: true });
  const page = await browser.newPage();
  await page.goto('https://skyslope.com/forms-login/');
  await page.waitForLoadState('networkidle');

  // Dump every input on the page.
  const fields = await page.locator('input').all();
  console.log('Input count:', fields.length);
  for (const field of fields) {
    const described = await field.evaluate((node: HTMLInputElement) => {
      const { type, name, id, placeholder } = node;
      return { type, name, id, placeholder };
    });
    console.log('Input:', described);
  }

  // Dump every anchor whose href hints at a login entry point.
  const loginSelector = 'a[href*="login"], a[href*="signin"], a[href*="forms.skyslope"]';
  const anchors = await page.locator(loginSelector).all();
  for (const anchor of anchors) {
    const label = await anchor.textContent();
    const target = await anchor.getAttribute('href');
    console.log('Link:', label?.trim().slice(0, 60), '→', target);
  }

  await page.screenshot({ path: 'scripts/debug-inspect.png', fullPage: false });
  await browser.close();
})();

View File

@@ -0,0 +1,28 @@
import { chromium } from 'playwright';
import { config } from 'dotenv';
import path from 'path';
config({ path: path.resolve(process.cwd(), '.env.local') });
/**
 * Debug helper: opens forms.skyslope.com directly, saves a screenshot, and
 * logs every link or button whose text or href looks like a login entry point.
 */
(async () => {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    // Try the direct forms app URL first
    await page.goto('https://forms.skyslope.com', { waitUntil: 'domcontentloaded', timeout: 30_000 });
    await page.waitForTimeout(3000);
    console.log('URL after goto forms.skyslope.com:', page.url());
    await page.screenshot({ path: 'scripts/debug-forms-home.png' });
    // Look for any login/sign-in link
    const loginPattern = /login|sign.?in|log.?in|get started|access/i;
    const loginLinks = await page.locator('a, button').all();
    for (const el of loginLinks) {
      // Both reads can yield null (e.g. buttons have no href) or fail if the
      // element detaches; normalise to '' so the pattern is never tested
      // against the literal text "null".
      const text = (await el.textContent().catch(() => null)) ?? '';
      const href = (await el.getAttribute('href').catch(() => null)) ?? '';
      // The separating space prevents a match that straddles the text/href boundary.
      if (loginPattern.test(`${text} ${href}`)) {
        console.log('Found login element:', text.trim().slice(0,60), href.slice(0,80));
      }
    }
  } finally {
    // Always release the browser process, even when a step above throws.
    await browser.close();
  }
})();

View File

@@ -0,0 +1,38 @@
import { chromium } from 'playwright';
import { config } from 'dotenv';
import path from 'path';
config({ path: path.resolve(process.cwd(), '.env.local') });
/**
 * Debug helper: opens the SkySlope forms "all" tab in a visible browser, waits
 * for client-side rendering, then logs the URL, a body-text excerpt, a count of
 * form-like elements, whether an NRDS/authorization prompt is present, and the
 * text of every tab.
 */
(async function inspectRenderedForms() {
  const browser = await chromium.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('https://forms.skyslope.com/?tab=all', { waitUntil: 'domcontentloaded', timeout: 30_000 });

  // Give the client-side app time to render before inspecting anything.
  await page.waitForTimeout(5000);
  console.log('URL:', page.url());
  await page.screenshot({ path: 'scripts/debug-forms-rendered.png', fullPage: false });
  console.log('Screenshot saved');

  // Visible page text; falls back to '' if the body cannot be read.
  const visibleText = await page.locator('body').innerText().catch(() => '');
  console.log('Page text (first 500 chars):', visibleText.slice(0, 500));

  // Anything whose class name suggests a form list entry or card.
  const formLikeSelector = '[class*="form"], [class*="Form"], [class*="item"], [class*="card"]';
  const formLikeCount = await page.locator(formLikeSelector).count();
  console.log('Form-like elements:', formLikeCount);

  // Case-insensitive check for the NRDS/authorization prompt.
  const lowered = visibleText.toLowerCase();
  const promptShown = ['nrds', 'authorization'].some(keyword => lowered.includes(keyword));
  console.log('Has NRDS/auth prompt:', promptShown);

  // List every tab that is rendered.
  const tabLocators = await page.locator('[role="tab"]').all();
  for (let idx = 0; idx < tabLocators.length; idx++) {
    const tabText = await tabLocators[idx].textContent();
    console.log('Tab:', tabText);
  }

  await page.waitForTimeout(2000);
  await browser.close();
})();

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 118 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 118 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 139 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 737 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 147 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 141 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 139 KiB

View File

@@ -1,230 +1,385 @@
/**
* SkySlope Forms Scraper
* SkySlope Forms Scraper — via Utah Real Estate SSO
*
* Downloads all PDFs from your SkySlope form libraries into seeds/forms/.
* Run: DOTENV_CONFIG_PATH=.env.local npx tsx scripts/scrape-skyslope-forms.ts
* Flow: utahrealestate.com login → Forms → MLS Forms → SkySlope → download all library PDFs
*
* Credentials are read from env vars:
* SKYSLOPE_LAST_NAME — last name on NAR records (e.g. Copeland)
* SKYSLOPE_NRDS_ID — 9-digit NRDS ID (e.g. 837075029)
* Run: npm run scrape:forms
*
* Credentials read from .env.local:
* URE_USERNAME — Utah Real Estate username
* URE_PASSWORD — Utah Real Estate password
*/
import { chromium } from 'playwright';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { config } from 'dotenv';
const LAST_NAME = process.env.SKYSLOPE_LAST_NAME;
const NRDS_ID = process.env.SKYSLOPE_NRDS_ID;
config({ path: path.resolve(process.cwd(), '.env.local') });
const URE_USERNAME = process.env.URE_USERNAME;
const URE_PASSWORD = process.env.URE_PASSWORD;
const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');
const BASE_URL = 'https://forms.skyslope.com';
if (!LAST_NAME || !NRDS_ID) {
console.error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
if (!URE_USERNAME || !URE_PASSWORD) {
console.error('Missing required env vars: URE_USERNAME, URE_PASSWORD');
process.exit(1);
}
async function main() {
await fs.mkdir(OUTPUT_DIR, { recursive: true });
const browser = await chromium.launch({ headless: false }); // visible so you can watch/intervene
const context = await browser.newContext({ acceptDownloads: true });
const browser = await chromium.launch({ headless: false });
const savedState = await fs.readFile(path.resolve(process.cwd(), 'scripts/.ure-session.json'), 'utf8').then(JSON.parse).catch(() => null);
const context = await browser.newContext({
acceptDownloads: true,
storageState: savedState ?? undefined,
});
const page = await context.newPage();
console.log('Navigating to SkySlope Forms...');
await page.goto(`${BASE_URL}/welcome/authorization`);
// ── Step 1: Login to Utah Real Estate ─────────────────────────────────────
console.log('Navigating to utahrealestate.com...');
await page.goto('https://www.utahrealestate.com/auth/login', { waitUntil: 'domcontentloaded', timeout: 30_000 });
await page.waitForTimeout(2000);
// Fill NRDS authorization form
console.log('Filling NRDS credentials...');
await page.getByLabel(/last name/i).fill(LAST_NAME!);
await page.getByLabel(/nrds/i).fill(NRDS_ID!);
await page.getByRole('button', { name: /next/i }).click();
console.log(`Login page URL: ${page.url()}`);
await page.screenshot({ path: 'scripts/debug-ure-login.png' });
// Wait for the main forms library page to load
console.log('Waiting for forms library...');
await page.waitForURL(/forms\.skyslope\.com\/(?!welcome)/, { timeout: 30_000 });
await page.waitForLoadState('networkidle');
const COOKIE_FILE = path.resolve(process.cwd(), 'scripts/.ure-session.json');
// Find all library tabs/sections
const libraries = await page.$$eval(
'[class*="library"], [class*="Library"], [data-testid*="library"]',
els => els.map(el => ({ text: el.textContent?.trim(), id: el.id }))
);
console.log(`Found ${libraries.length} library elements`);
// Only fill login form if we're actually on the login page
if (page.url().includes('/auth/login') || page.url().includes('/login')) {
const usernameInput = page.locator('input[name="username"], input[name="user"], input[id*="user"], input[placeholder*="user" i]').first();
const passwordInput = page.locator('input[type="password"]').first();
// Collect all form links across all tabs
await usernameInput.waitFor({ timeout: 10_000 });
await usernameInput.fill(URE_USERNAME!);
await passwordInput.fill(URE_PASSWORD!);
console.log('Credentials filled, submitting...');
await page.locator('button[type="submit"], input[type="submit"], button:has-text("Login"), button:has-text("Sign In"), button:has-text("Log In")').first().click();
await page.waitForLoadState('domcontentloaded');
await page.waitForTimeout(3000);
} else {
console.log('Already logged in (session restored).');
}
console.log(`After login URL: ${page.url()}`);
await page.screenshot({ path: 'scripts/debug-ure-after-login.png' });
// ── Step 1b: Handle 2FA if present ────────────────────────────────────────
const pageText = await page.locator('body').innerText().catch(() => '');
if (pageText.includes('verification code') || pageText.includes('one-time')) {
console.log('\n⚡ 2FA detected — please complete it in the browser window.');
console.log(' (Select Text/Email, enter the code, and click Submit)');
console.log(' Waiting up to 2 minutes for you to finish...\n');
// Poll every 2s until 2FA page is gone (up to 2 minutes)
const deadline = Date.now() + 120_000;
while (Date.now() < deadline) {
await page.waitForTimeout(2000);
const text = await page.locator('body').innerText().catch(() => '');
if (!text.includes('verification code') && !text.includes('one-time')) break;
process.stdout.write('.');
}
console.log();
await page.waitForLoadState('domcontentloaded');
await page.waitForTimeout(2000);
console.log(`After 2FA URL: ${page.url()}`);
// Save session so we skip 2FA next time
await context.storageState({ path: COOKIE_FILE });
console.log('Session saved — 2FA will be skipped on next run.');
}
// ── Step 2: Navigate directly to SkySlope SSO URL ────────────────────────
console.log('Navigating to SkySlope via SSO...');
const [newPage] = await Promise.all([
context.waitForEvent('page', { timeout: 15_000 }).catch(() => null),
page.goto('https://www.utahrealestate.com/sso/connect/client/skyslope', { waitUntil: 'domcontentloaded' }),
]);
// The SSO link opens a new tab
const activePage = newPage ?? page;
await activePage.waitForLoadState('domcontentloaded');
await activePage.waitForTimeout(5000);
console.log(`SkySlope URL: ${activePage.url()}`);
await activePage.screenshot({ path: 'scripts/debug-skyslope-landing.png' });
await downloadAllForms(activePage, context, OUTPUT_DIR);
if (newPage) await newPage.close();
await browser.close();
console.log('\nDone.');
}
async function downloadAllForms(
page: import('playwright').Page,
context: import('playwright').BrowserContext,
outputDir: string
) {
const downloaded: string[] = [];
const failed: string[] = [];
// Navigate to the forms list — look for a tab or section with all forms
// Try clicking "All" tab if it exists
const allTab = page.getByRole('tab', { name: /all/i }).first();
if (await allTab.isVisible().catch(() => false)) {
await allTab.click();
await page.waitForLoadState('networkidle');
// Handle NRDS auth if it appears
await handleNRDSAuth(page);
// Wait for forms library to load
await page.waitForTimeout(4000);
console.log(`Forms library URL: ${page.url()}`);
// Navigate to Browse Libraries
console.log('Clicking Browse Libraries...');
const browseLink = page.locator('a:has-text("Browse Libraries"), a[href*="libraries"], nav a:has-text("Libraries")').first();
if (await browseLink.isVisible({ timeout: 5_000 }).catch(() => false)) {
await browseLink.click();
await page.waitForLoadState('domcontentloaded');
await page.waitForTimeout(3000);
} else {
await page.goto('https://forms.skyslope.com/libraries', { waitUntil: 'domcontentloaded' });
await page.waitForTimeout(3000);
}
// Scrape all visible form items across all library sections
await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
console.log(`Libraries URL: ${page.url()}`);
await page.screenshot({ path: 'scripts/debug-libraries-page.png' });
// Also try each library tab
const tabs = await page.getByRole('tab').all();
for (const tab of tabs) {
const tabName = await tab.textContent();
if (!tabName) continue;
console.log(`\nChecking library tab: ${tabName.trim()}`);
await tab.click();
await page.waitForLoadState('networkidle');
await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
const bodyText2 = await page.locator('body').innerText().catch(() => '');
console.log('Libraries page text (first 600):', bodyText2.slice(0, 600));
// Find all library cards/links
const libraryLinks = await page.locator(
'a[href*="/library/"], a[href*="/libraries/"], [class*="library-card"], [class*="libraryCard"]'
).all();
console.log(`Found ${libraryLinks.length} library links`);
if (libraryLinks.length > 0) {
const libraryHrefs: string[] = [];
for (const link of libraryLinks) {
const href = await link.getAttribute('href').catch(() => '');
const name = await link.textContent().catch(() => '');
if (href) {
libraryHrefs.push(href);
console.log(` Library: ${name?.trim().slice(0, 50)}${href}`);
}
}
console.log(`\n✓ Downloaded ${downloaded.length} forms to ${OUTPUT_DIR}`);
if (failed.length > 0) {
console.log(`✗ Failed: ${failed.join(', ')}`);
for (const href of libraryHrefs) {
const url = href.startsWith('http') ? href : `https://forms.skyslope.com${href}`;
console.log(`\n── Opening library: ${url} ──`);
await page.goto(url, { waitUntil: 'domcontentloaded' });
await page.waitForTimeout(4000);
await page.screenshot({ path: `scripts/debug-library-${Date.now()}.png` });
await downloadFormsInView(page, context, outputDir, downloaded, failed);
}
} else {
// Fallback: libraries might be listed as clickable items
const bodyText = await page.locator('body').innerText().catch(() => '');
const libraryNames = ['Data Forms - URE', 'Utah Association of Realtors', 'Utah CCIM'];
for (const libName of libraryNames) {
const libLink = page.locator(`a:has-text("${libName}"), button:has-text("${libName}")`).first();
if (await libLink.isVisible({ timeout: 3_000 }).catch(() => false)) {
console.log(`\n── Library: ${libName} ──`);
await libLink.click();
await page.waitForLoadState('domcontentloaded');
await page.waitForTimeout(4000);
await downloadFormsInView(page, context, outputDir, downloaded, failed);
await page.goBack();
await page.waitForTimeout(2000);
}
}
await browser.close();
if (libraryLinks.length === 0) {
// We're already on the all-forms page — download directly
console.log('All forms visible on current page — downloading...');
await downloadFormsInView(page, context, outputDir, downloaded, failed);
}
}
console.log(`\n✓ Downloaded ${downloaded.length} forms`);
if (failed.length > 0) console.log(`✗ Failed: ${failed.length}${failed.join(', ')}`);
}
async function downloadFormsOnPage(
/**
 * Fills and submits SkySlope's NRDS authorization form when it is shown.
 *
 * Waits up to 5 seconds for an input whose placeholder mentions "last" or
 * "nrds"; if none becomes visible the function returns without touching the
 * page. Otherwise the first two inputs on the page are filled with the last
 * name and NRDS ID and the form is submitted.
 *
 * @param page Page that may be showing the NRDS authorization prompt.
 */
async function handleNRDSAuth(page: import('playwright').Page): Promise<void> {
  // NOTE(review): these fallbacks hard-code personal identifiers in source
  // control — consider requiring SKYSLOPE_LAST_NAME / SKYSLOPE_NRDS_ID instead.
  const LAST_NAME = process.env.SKYSLOPE_LAST_NAME || 'Copeland';
  const NRDS_ID = process.env.SKYSLOPE_NRDS_ID || '837075029';
  // Check if NRDS auth page appears. isVisible() ignores its timeout option and
  // returns immediately, so wait explicitly to catch a prompt that renders late.
  const isNRDS = await page
    .locator('input[placeholder*="last" i], input[placeholder*="nrds" i]')
    .first()
    .waitFor({ state: 'visible', timeout: 5_000 })
    .then(() => true)
    .catch(() => false);
  if (!isNRDS) return;
  console.log('NRDS authorization required — filling...');
  const inputs = await page.locator('input').all();
  if (inputs.length < 2) {
    // The form is filled positionally, so it cannot be completed without two fields.
    console.log(`NRDS form has ${inputs.length} input(s), expected at least 2 — skipping.`);
    return;
  }
  // assumes the first input is last name and the second is NRDS ID — TODO confirm against the live form
  await inputs[0].fill(LAST_NAME);
  await inputs[1].fill(NRDS_ID);
  await page.locator('button:has-text("Next"), button[type="submit"]').first().click();
  await page.waitForLoadState('domcontentloaded');
  await page.waitForTimeout(3000);
  console.log(`After NRDS URL: ${page.url()}`);
}
async function downloadFormsInView(
page: import('playwright').Page,
context: import('playwright').BrowserContext,
outputDir: string,
downloaded: string[],
failed: string[]
) {
// Look for form items — SkySlope renders them as list rows with a download or view button
// Strategy: intercept PDF responses by clicking each form's download/view button
const formRows = await page.$$('[class*="form-item"], [class*="formItem"], [class*="form-row"], [role="row"], [class*="list-item"]');
// Flow: click form name → preview opens → click Download button → save file
if (formRows.length === 0) {
// Fallback: look for any link or button that mentions PDF or has a form name
console.log(' No form rows found with primary selectors, trying fallback...');
await downloadViaLinks(page, outputDir, downloaded, failed);
// Extract form names from the page body text — the list renders as "Name\nAdd\nName\nAdd..."
const bodyText = await page.locator('body').innerText().catch(() => '');
const lines = bodyText.split('\n').map(l => l.trim()).filter(l => l.length > 3);
const formNames: string[] = [];
for (let i = 0; i < lines.length; i++) {
if (lines[i] === 'Add' && i > 0 && lines[i - 1] !== 'Add' && lines[i - 1].length > 3) {
formNames.push(lines[i - 1]);
}
}
const names = [...new Set(formNames)];
console.log(` Found ${names.length} forms to download`);
if (names.length === 0) {
await page.screenshot({ path: `scripts/debug-no-forms-${Date.now()}.png` });
return;
}
console.log(` Found ${formRows.length} form rows`);
for (const row of formRows) {
const name = await row.$eval(
'[class*="name"], [class*="title"], span, td',
el => el.textContent?.trim()
).catch(() => null);
if (!name) continue;
// Skip already downloaded
const sanitized = name.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim();
for (const formName of names) {
const sanitized = formName.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim().slice(0, 100);
const destPath = path.join(outputDir, `${sanitized}.pdf`);
try {
await fs.access(destPath);
console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
// Skip already downloaded
try { await fs.access(destPath); process.stdout.write(` ⊙ skip: ${sanitized}\n`); continue; } catch { /* proceed */ }
// Click the form name to open preview
const nameEl = page.locator(`text="${formName}"`).first();
if (!await nameEl.isVisible({ timeout: 3_000 }).catch(() => false)) {
process.stdout.write(` ⚠ not found: ${sanitized}\n`);
failed.push(sanitized);
continue;
} catch {
// file doesn't exist, proceed
}
// Find the download/view button within this row
const downloadBtn = await row.$('button[aria-label*="download" i], button[title*="download" i], [class*="download"]');
const viewBtn = await row.$('button[aria-label*="view" i], button[title*="view" i], [class*="view"]');
const btn = downloadBtn ?? viewBtn;
await nameEl.click().catch(() => {});
// Wait for preview/modal to appear (up to 5s)
await page.waitForTimeout(2000);
if (!btn) {
console.log(` ⚠ No download button found for: ${name}`);
// Click Download button in the preview
const downloadBtn = page.locator(
'button:has-text("Download"), a:has-text("Download"), [aria-label*="download" i], button[title*="download" i]'
).first();
if (!await downloadBtn.isVisible({ timeout: 5_000 }).catch(() => false)) {
process.stdout.write(` ⚠ no Download button found for: ${sanitized}\n`);
await page.screenshot({ path: `scripts/debug-no-download-btn-${Date.now()}.png` });
await page.keyboard.press('Escape').catch(() => {});
await page.waitForTimeout(500);
failed.push(sanitized);
continue;
}
try {
// Intercept the PDF download
const [download] = await Promise.all([
page.waitForEvent('download', { timeout: 15_000 }),
btn.click(),
]).catch(async () => {
// If no download event, try waiting for a new page/tab with PDF
const newPagePromise = page.context().waitForEvent('page', { timeout: 10_000 });
await btn.click();
const newPage = await newPagePromise;
await newPage.waitForLoadState();
const url = newPage.url();
if (url.endsWith('.pdf') || url.includes('/pdf')) {
const pdfBuffer = await newPage.evaluate(async (pdfUrl) => {
const res = await fetch(pdfUrl);
const buf = await res.arrayBuffer();
return Array.from(new Uint8Array(buf));
}, url);
await fs.writeFile(destPath, Buffer.from(pdfBuffer));
await newPage.close();
return [null];
}
await newPage.close();
return [null];
});
if (download && typeof download === 'object' && 'saveAs' in download) {
await (download as import('playwright').Download).saveAs(destPath);
console.log(`${sanitized}.pdf`);
page.waitForEvent('download', { timeout: 20_000 }),
downloadBtn.click(),
]);
await download.saveAs(destPath);
process.stdout.write(`${sanitized}.pdf\n`);
downloaded.push(sanitized);
} else {
console.log(`${sanitized}.pdf (via page)`);
downloaded.push(sanitized);
}
} catch (err) {
console.log(`Failed: ${name}${(err as Error).message}`);
failed.push(name);
process.stdout.write(`download failed: ${sanitized}${(err as Error).message.slice(0, 60)}\n`);
failed.push(sanitized);
}
// Close preview and return to list
await page.keyboard.press('Escape').catch(() => {});
await page.waitForTimeout(800);
}
}
async function downloadViaLinks(
async function downloadViaTextRows(
page: import('playwright').Page,
outputDir: string,
downloaded: string[],
failed: string[]
) {
// Intercept all PDF network requests triggered by clicking form items
const pdfResponses: { url: string; name: string }[] = [];
page.on('response', async (response) => {
const contentType = response.headers()['content-type'] ?? '';
if (contentType.includes('pdf') && response.status() === 200) {
const url = response.url();
const name = path.basename(new URL(url).pathname, '.pdf') || `form-${Date.now()}`;
pdfResponses.push({ url, name });
}
// Legacy fallback — kept for safety but downloadFormsInView handles all cases now
console.log(' (downloadViaTextRows called — should not reach here normally)');
const rows = await page.evaluate(() => {
const candidates = Array.from(document.querySelectorAll('tr, li, [class*="row"], [class*="item"]'));
return candidates
.map(el => ({
text: el.textContent?.replace(/\s+/g, ' ').trim() ?? '',
hasAdd: el.textContent?.includes('Add') ?? false,
}))
.filter(r => r.hasAdd && r.text.length > 10)
.map(r => r.text.replace(/\s*Add\s*$/, '').trim());
});
// Click each item that looks like a form
const items = await page.$$('li, [role="listitem"], [class*="item"]');
for (const item of items.slice(0, 100)) { // cap at 100 to avoid infinite loops
const text = await item.textContent().catch(() => '');
if (!text?.trim()) continue;
console.log(` Found ${rows.length} form names via text extraction`);
const before = pdfResponses.length;
await item.click({ timeout: 3_000 }).catch(() => {});
await page.waitForTimeout(1_000);
if (pdfResponses.length > before) {
const { url, name } = pdfResponses[pdfResponses.length - 1];
const sanitized = text.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim().slice(0, 80);
for (const formName of rows) {
if (!formName || formName.length < 3) continue;
const sanitized = formName.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim().slice(0, 100);
const destPath = path.join(outputDir, `${sanitized}.pdf`);
try { await fs.access(destPath); process.stdout.write(` ⊙ skip: ${sanitized}\n`); continue; } catch { /* proceed */ }
const el = page.locator(`text="${formName}"`).first();
if (!await el.isVisible({ timeout: 2_000 }).catch(() => false)) {
process.stdout.write(` ⚠ not visible: ${sanitized}\n`);
continue;
}
const pdfUrl = await interceptPdfOnClick(page, el);
if (pdfUrl) {
try {
const res = await page.evaluate(async (pdfUrl) => {
const r = await fetch(pdfUrl);
const buf = await r.arrayBuffer();
return Array.from(new Uint8Array(buf));
}, url);
await fs.writeFile(destPath, Buffer.from(res));
console.log(`${sanitized}.pdf`);
const buf = await page.evaluate(async (url) => {
const r = await fetch(url, { credentials: 'include' });
const ab = await r.arrayBuffer();
return Array.from(new Uint8Array(ab));
}, pdfUrl);
await fs.writeFile(destPath, Buffer.from(buf));
process.stdout.write(`${sanitized}.pdf\n`);
downloaded.push(sanitized);
} catch (err) {
console.log(` ✗ Failed saving ${name}`);
failed.push(name);
} catch {
process.stdout.write(`${sanitized}\n`);
failed.push(sanitized);
}
} else {
process.stdout.write(` ⚠ no PDF: ${sanitized}\n`);
failed.push(sanitized);
}
await page.keyboard.press('Escape').catch(() => {});
await page.waitForTimeout(500);
}
}
/**
 * Clicks `row` and reports the URL of the first PDF response the page receives.
 *
 * A response counts as a PDF when its content-type header contains "pdf" or
 * its URL ends in ".pdf". The observation window is the click itself (up to
 * 3 seconds) plus a further 2 seconds.
 *
 * @param page Page whose network responses are observed.
 * @param row  Element to click in order to trigger the PDF request.
 * @returns The PDF response URL, or null if none arrived within the window.
 */
async function interceptPdfOnClick(
  page: import('playwright').Page,
  row: import('playwright').Locator
): Promise<string | null> {
  // The executor runs synchronously, so resolveUrl is assigned before any
  // response can be delivered to the handler below.
  let resolveUrl: (url: string) => void = () => {};
  const firstPdfUrl = new Promise<string>((resolve) => {
    resolveUrl = resolve;
  });
  const handler = (response: import('playwright').Response) => {
    const ct = response.headers()['content-type'] ?? '';
    if (ct.includes('pdf') || response.url().endsWith('.pdf')) {
      // Resolving more than once is a no-op, so only the first PDF URL wins.
      resolveUrl(response.url());
    }
  };
  // Click, then leave a short grace period for the PDF request to be issued.
  // Failures (click timeout, page closed mid-wait) are absorbed so this
  // branch always settles with null and never rejects unhandled.
  const windowElapsed = (async (): Promise<null> => {
    await row.click({ timeout: 3_000 }).catch(() => {});
    await page.waitForTimeout(2000).catch(() => {});
    return null;
  })();
  page.on('response', handler);
  try {
    return await Promise.race([firstPdfUrl, windowElapsed]);
  } finally {
    // Detach the listener on every path so repeated calls do not pile up handlers.
    page.off('response', handler);
  }
}
main().catch(err => {
console.error('Fatal:', err);
console.error('Fatal:', err.message);
process.exit(1);
});