feat: add Playwright script to scrape SkySlope form libraries into seeds/forms/
This commit is contained in:
230
teressa-copeland-homes/scripts/scrape-skyslope-forms.ts
Normal file
230
teressa-copeland-homes/scripts/scrape-skyslope-forms.ts
Normal file
@@ -0,0 +1,230 @@
|
||||
/**
|
||||
* SkySlope Forms Scraper
|
||||
*
|
||||
* Downloads all PDFs from your SkySlope form libraries into seeds/forms/.
|
||||
* Run: DOTENV_CONFIG_PATH=.env.local npx tsx scripts/scrape-skyslope-forms.ts
|
||||
*
|
||||
* Credentials are read from env vars:
|
||||
* SKYSLOPE_LAST_NAME — last name on NAR records (e.g. Copeland)
|
||||
* SKYSLOPE_NRDS_ID — 9-digit NRDS ID (e.g. 837075029)
|
||||
*/
|
||||
|
||||
import { chromium } from 'playwright';
|
||||
import * as fs from 'node:fs/promises';
|
||||
import * as path from 'node:path';
|
||||
|
||||
// Script configuration: NRDS credentials come from the environment, the
// download directory is resolved against the current working directory.
const { SKYSLOPE_LAST_NAME: LAST_NAME, SKYSLOPE_NRDS_ID: NRDS_ID } = process.env;
const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');
const BASE_URL = 'https://forms.skyslope.com';

// Abort before any browser work when either credential is absent or empty.
if (!(LAST_NAME && NRDS_ID)) {
  console.error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
  process.exit(1);
}
|
||||
|
||||
/**
 * Entry point: authorizes against SkySlope Forms with the NRDS credentials,
 * then walks the library tabs and downloads every form found into OUTPUT_DIR.
 *
 * The browser is launched headed so a person can watch or intervene, and it
 * is always closed when this function settles, including when a step throws.
 *
 * @throws Error when the credentials are missing, or when any navigation or
 *   interaction step outside the per-form download handling fails.
 */
async function main(): Promise<void> {
  // Guard locally so the credentials narrow to `string` inside this function
  // without relying on non-null assertions.
  const lastName = LAST_NAME;
  const nrdsId = NRDS_ID;
  if (!lastName || !nrdsId) {
    throw new Error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
  }

  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const browser = await chromium.launch({ headless: false }); // visible so you can watch/intervene
  try {
    const context = await browser.newContext({ acceptDownloads: true });
    const page = await context.newPage();

    console.log('Navigating to SkySlope Forms...');
    await page.goto(`${BASE_URL}/welcome/authorization`);

    // Fill NRDS authorization form
    console.log('Filling NRDS credentials...');
    await page.getByLabel(/last name/i).fill(lastName);
    await page.getByLabel(/nrds/i).fill(nrdsId);
    await page.getByRole('button', { name: /next/i }).click();

    // Wait until the URL leaves the /welcome flow, i.e. the library has loaded
    console.log('Waiting for forms library...');
    await page.waitForURL(/forms\.skyslope\.com\/(?!welcome)/, { timeout: 30_000 });
    await page.waitForLoadState('networkidle');

    // Diagnostic only: how many library-looking elements the page exposes
    const libraryCount = await page.$$eval(
      '[class*="library"], [class*="Library"], [data-testid*="library"]',
      els => els.length
    );
    console.log(`Found ${libraryCount} library elements`);

    // Shared accumulators, appended to by every download pass below
    const downloaded: string[] = [];
    const failed: string[] = [];

    // Try clicking an "All" tab if one exists; a failed visibility probe is
    // treated as "no such tab"
    const allTab = page.getByRole('tab', { name: /all/i }).first();
    if (await allTab.isVisible().catch(() => false)) {
      await allTab.click();
      await page.waitForLoadState('networkidle');
    }

    // First pass over whatever is currently displayed
    await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);

    // Then visit each tab in turn and run a pass on it
    const tabs = await page.getByRole('tab').all();
    for (const tab of tabs) {
      const tabName = await tab.textContent();
      if (!tabName) continue;

      console.log(`\nChecking library tab: ${tabName.trim()}`);
      await tab.click();
      await page.waitForLoadState('networkidle');
      await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
    }

    console.log(`\n✓ Downloaded ${downloaded.length} forms to ${OUTPUT_DIR}`);
    if (failed.length > 0) {
      console.log(`✗ Failed: ${failed.join(', ')}`);
    }
  } finally {
    // Release the browser even when a step above threw
    await browser.close();
  }
}
|
||||
|
||||
/**
 * Downloads every form listed on the currently displayed page.
 *
 * Each matching row is reduced to a file-system-safe name, rows whose target
 * file already exists are skipped, and the row's download (or view) button is
 * clicked. A browser download event is tried first; if none arrives the click
 * is retried while waiting for a new tab, and the PDF shown there is saved.
 * When no rows match the primary selectors the work is delegated to
 * downloadViaLinks.
 *
 * @param page - Page currently showing a forms list.
 * @param outputDir - Directory that receives the `<name>.pdf` files.
 * @param downloaded - Accumulator; sanitized names of saved forms are appended.
 * @param failed - Accumulator; display names of forms that could not be saved
 *   are appended.
 */
async function downloadFormsOnPage(
  page: import('playwright').Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
) {
  // Look for form items: list rows that carry a download or view button
  const formRows = await page.$$('[class*="form-item"], [class*="formItem"], [class*="form-row"], [role="row"], [class*="list-item"]');

  if (formRows.length === 0) {
    // Fallback: click generic list items and watch for PDF responses
    console.log(' No form rows found with primary selectors, trying fallback...');
    await downloadViaLinks(page, outputDir, downloaded, failed);
    return;
  }

  console.log(` Found ${formRows.length} form rows`);

  for (const row of formRows) {
    const name = await row.$eval(
      '[class*="name"], [class*="title"], span, td',
      el => el.textContent?.trim()
    ).catch(() => null);

    if (!name) continue;

    const sanitized = name.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim();
    if (!sanitized) {
      // A name made only of stripped characters would produce a bare ".pdf"
      console.log(` ⚠ No usable file name for: ${name}`);
      continue;
    }
    const destPath = path.join(outputDir, `${sanitized}.pdf`);

    // Skip already downloaded
    try {
      await fs.access(destPath);
      console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
      continue;
    } catch {
      // file doesn't exist, proceed
    }

    // Find the download/view button within this row; download wins over view
    const downloadBtn = await row.$('button[aria-label*="download" i], button[title*="download" i], [class*="download"]');
    const viewBtn = await row.$('button[aria-label*="view" i], button[title*="view" i], [class*="view"]');
    const btn = downloadBtn ?? viewBtn;

    if (!btn) {
      console.log(` ⚠ No download button found for: ${name}`);
      continue;
    }

    try {
      // First attempt: a regular browser download triggered by the click
      let download: import('playwright').Download | null = null;
      try {
        [download] = await Promise.all([
          page.waitForEvent('download', { timeout: 15_000 }),
          btn.click(),
        ]);
      } catch {
        download = null; // no download event; fall through to the new-tab path
      }

      if (download) {
        await download.saveAs(destPath);
        console.log(` ✓ ${sanitized}.pdf`);
        downloaded.push(sanitized);
      } else if (await savePdfFromNewTab(page, btn, destPath)) {
        console.log(` ✓ ${sanitized}.pdf (via page)`);
        downloaded.push(sanitized);
      } else {
        // The click opened a tab, but nothing was written: report it as a
        // failure rather than a success
        throw new Error('opened page is not a PDF');
      }
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.log(` ✗ Failed: ${name} — ${reason}`);
      failed.push(name);
    }
  }
}

/**
 * Clicks `btn` and, if that opens a new tab whose URL looks like a PDF,
 * fetches the PDF inside that tab and writes it to `destPath`.
 *
 * The opened tab is always closed. Resolves to true only when a file was
 * written; rejects when no tab opens in time or the fetch/write fails.
 */
async function savePdfFromNewTab(
  page: import('playwright').Page,
  btn: { click(): Promise<void> },
  destPath: string
): Promise<boolean> {
  const newPagePromise = page.context().waitForEvent('page', { timeout: 10_000 });
  // If click() throws we never await newPagePromise; mark its rejection handled
  newPagePromise.catch(() => {});

  await btn.click();
  const newPage = await newPagePromise;
  try {
    await newPage.waitForLoadState();
    const url = newPage.url();
    if (!(url.endsWith('.pdf') || url.includes('/pdf'))) {
      return false;
    }

    // Fetch from inside the tab so the request is made from the tab's own context
    const pdfBytes = await newPage.evaluate(async (pdfUrl) => {
      const res = await fetch(pdfUrl);
      if (!res.ok) throw new Error(`HTTP ${res.status} while fetching PDF`);
      const buf = await res.arrayBuffer();
      return Array.from(new Uint8Array(buf));
    }, url);
    await fs.writeFile(destPath, Buffer.from(pdfBytes));
    return true;
  } finally {
    await newPage.close();
  }
}
|
||||
|
||||
/**
 * Fallback downloader used when no recognizable form rows are found.
 *
 * Clicks up to 100 list-like elements one by one while listening for HTTP 200
 * responses whose content type mentions "pdf". When a click produces such a
 * response, the URL is re-fetched from inside the page and saved under a name
 * derived from the clicked element's text.
 *
 * @param page - Page currently showing a forms list.
 * @param outputDir - Directory that receives the `<name>.pdf` files.
 * @param downloaded - Accumulator; sanitized names of saved forms are appended.
 * @param failed - Accumulator; URL-derived names of PDFs that could not be
 *   saved are appended.
 */
async function downloadViaLinks(
  page: import('playwright').Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
) {
  // PDF responses observed so far, in arrival order
  const pdfResponses: { url: string; name: string }[] = [];

  const onResponse = (response: import('playwright').Response) => {
    const contentType = response.headers()['content-type'] ?? '';
    if (contentType.includes('pdf') && response.status() === 200) {
      const url = response.url();
      const name = path.basename(new URL(url).pathname, '.pdf') || `form-${Date.now()}`;
      pdfResponses.push({ url, name });
    }
  };
  page.on('response', onResponse);

  try {
    // Click each item that looks like a form
    const items = await page.$$('li, [role="listitem"], [class*="item"]');
    for (const item of items.slice(0, 100)) { // cap at 100 to avoid infinite loops
      const text = await item.textContent().catch(() => '');
      if (!text?.trim()) continue;

      const before = pdfResponses.length;
      // Best-effort click: items that cannot be clicked are simply skipped
      await item.click({ timeout: 3_000 }).catch(() => {});
      await page.waitForTimeout(1_000);

      if (pdfResponses.length <= before) continue;
      const latest = pdfResponses[pdfResponses.length - 1];
      if (!latest) continue;
      const { url, name } = latest;

      // Fall back to the URL-derived name when the item text sanitizes to
      // nothing, which would otherwise produce a bare ".pdf"
      const sanitized =
        text.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim().slice(0, 80) || name;
      const destPath = path.join(outputDir, `${sanitized}.pdf`);

      try {
        const bytes = await page.evaluate(async (pdfUrl) => {
          const r = await fetch(pdfUrl);
          const buf = await r.arrayBuffer();
          return Array.from(new Uint8Array(buf));
        }, url);
        await fs.writeFile(destPath, Buffer.from(bytes));
        console.log(` ✓ ${sanitized}.pdf`);
        downloaded.push(sanitized);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.log(` ✗ Failed saving ${name} — ${reason}`);
        failed.push(name);
      }
    }
  } finally {
    // This function runs once per tab; without removal the listeners pile up
    page.off('response', onResponse);
  }
}
|
||||
|
||||
// Start the scraper; any rejection escaping main() is reported and turned
// into a non-zero exit status.
main().then(undefined, (fatalError) => {
  console.error('Fatal:', fatalError);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user