feat: add Playwright script to scrape SkySlope form libraries into seeds/forms/

This commit is contained in:
Chandler Copeland
2026-03-19 22:27:41 -06:00
parent ebb0bf03b7
commit 27462a0ebb
2 changed files with 233 additions and 1 deletions

View File

@@ -11,7 +11,8 @@
"db:migrate": "drizzle-kit migrate",
"db:seed": "tsx scripts/seed.ts",
"seed:forms": "DOTENV_CONFIG_PATH=.env.local npx tsx scripts/seed-forms.ts",
"db:studio": "drizzle-kit studio"
"db:studio": "drizzle-kit studio",
"scrape:forms": "DOTENV_CONFIG_PATH=.env.local npx tsx scripts/scrape-skyslope-forms.ts"
},
"dependencies": {
"@vercel/blob": "^2.3.1",
@@ -38,6 +39,7 @@
"drizzle-kit": "^0.31.10",
"eslint": "^9",
"eslint-config-next": "16.2.0",
"playwright": "^1.58.2",
"tailwindcss": "^4",
"tsx": "^4.21.0",
"typescript": "^5"

View File

@@ -0,0 +1,230 @@
/**
* SkySlope Forms Scraper
*
* Downloads all PDFs from your SkySlope form libraries into seeds/forms/.
* Run: DOTENV_CONFIG_PATH=.env.local npx tsx scripts/scrape-skyslope-forms.ts
*
* Credentials are read from env vars:
* SKYSLOPE_LAST_NAME — last name on NAR records (e.g. Doe)
* SKYSLOPE_NRDS_ID — 9-digit NRDS ID (e.g. 123456789)
*/
import { chromium } from 'playwright';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
// Site to scrape and the local directory (relative to the working directory)
// that receives the PDFs.
const BASE_URL = 'https://forms.skyslope.com';
const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');

// NRDS login values come straight from the process environment.
// NOTE(review): nothing visible in this file loads .env.local (there is no
// `dotenv/config` import), so DOTENV_CONFIG_PATH alone may have no effect —
// confirm how the variables reach the process.
const { SKYSLOPE_LAST_NAME: LAST_NAME, SKYSLOPE_NRDS_ID: NRDS_ID } = process.env;

// Stop before launching a browser when either value is missing or empty.
if (!(LAST_NAME && NRDS_ID)) {
  console.error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
  process.exit(1);
}
/**
 * Signs in to SkySlope Forms with the NRDS credentials from the environment,
 * then visits the library tabs and saves every form it can find to OUTPUT_DIR.
 *
 * The browser is launched headed so the operator can watch or intervene.
 * It is always closed on the way out, including when login or scraping throws.
 *
 * @throws Error when a credential is missing, or when login/navigation fails.
 */
async function main(): Promise<void> {
  // Narrow the credentials locally instead of relying on non-null assertions.
  const lastName = LAST_NAME;
  const nrdsId = NRDS_ID;
  if (!lastName || !nrdsId) {
    throw new Error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
  }

  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const browser = await chromium.launch({ headless: false }); // visible so you can watch/intervene
  try {
    const context = await browser.newContext({ acceptDownloads: true });
    const page = await context.newPage();

    console.log('Navigating to SkySlope Forms...');
    await page.goto(`${BASE_URL}/welcome/authorization`);

    // Fill NRDS authorization form
    console.log('Filling NRDS credentials...');
    await page.getByLabel(/last name/i).fill(lastName);
    await page.getByLabel(/nrds/i).fill(nrdsId);
    await page.getByRole('button', { name: /next/i }).click();

    // The login is done once the URL leaves the /welcome flow.
    console.log('Waiting for forms library...');
    await page.waitForURL(/forms\.skyslope\.com\/(?!welcome)/, { timeout: 30_000 });
    await page.waitForLoadState('networkidle');

    // Purely informational: how many library-looking elements the page exposes.
    const libraryCount = await page
      .locator('[class*="library"], [class*="Library"], [data-testid*="library"]')
      .count();
    console.log(`Found ${libraryCount} library elements`);

    const downloaded: string[] = [];
    const failed: string[] = [];

    // Prefer an "All" tab when present. The pattern is anchored so that tabs
    // which merely contain "all" (e.g. "California") are not picked instead.
    const allTab = page.getByRole('tab', { name: /^\s*all\b/i }).first();
    if (await allTab.isVisible().catch(() => false)) {
      await allTab.click();
      await page.waitForLoadState('networkidle');
    }

    // Scrape whatever is visible right now, then walk every tab.
    await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);

    const tabs = await page.getByRole('tab').all();
    for (const tab of tabs) {
      const tabName = (await tab.textContent())?.trim();
      if (!tabName) continue;
      console.log(`\nChecking library tab: ${tabName}`);
      try {
        await tab.click();
        await page.waitForLoadState('networkidle');
        await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
      } catch (err) {
        // One unclickable or slow tab must not discard the rest of the run.
        const reason = err instanceof Error ? err.message : String(err);
        console.log(` ✗ Skipping tab ${tabName}: ${reason}`);
        failed.push(`tab: ${tabName}`);
      }
    }

    console.log(`\n✓ Downloaded ${downloaded.length} forms to ${OUTPUT_DIR}`);
    if (failed.length > 0) {
      console.log(`✗ Failed: ${failed.join(', ')}`);
    }
  } finally {
    // Always release the browser, even when something above threw.
    await browser.close();
  }
}
/** What a click on a row's button produced: a browser download or a new tab. */
type FormClickOutcome =
  | { kind: 'download'; download: import('playwright').Download }
  | { kind: 'popup'; popup: import('playwright').Page };

/** Turns a form's display name into a file-name stem (ASCII letters, digits, space, `.`, `_`, `-`). */
function sanitizeFormFileStem(name: string): string {
  return name.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim();
}

/** Extracts a printable message from an arbitrary thrown value. */
function describeScrapeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Saves the PDF shown in a freshly opened tab to `destPath`, then closes the tab.
 *
 * The bytes are fetched through the browser context's request API, which
 * shares cookies with the pages, rather than being serialized out of the page.
 *
 * @throws Error when the tab does not point at a PDF or the fetch is not 2xx.
 */
async function savePdfFromPopup(
  popup: import('playwright').Page,
  context: import('playwright').BrowserContext,
  destPath: string
): Promise<void> {
  try {
    await popup.waitForLoadState();
    const url = popup.url();

    // Compare against the path only, so "form.pdf?token=…" is still recognised.
    let pathname = url;
    try {
      pathname = new URL(url).pathname;
    } catch {
      // not a parseable URL — fall back to the raw string
    }
    const looksLikePdf = pathname.toLowerCase().endsWith('.pdf') || url.includes('/pdf');
    if (!looksLikePdf) {
      throw new Error(`opened tab is not a PDF (${url})`);
    }

    const response = await context.request.get(url);
    if (!response.ok()) {
      throw new Error(`HTTP ${response.status()} while fetching ${url}`);
    }
    await fs.writeFile(destPath, await response.body());
  } finally {
    // Never leave the extra tab open, whatever happened above.
    await popup.close().catch(() => undefined);
  }
}

/**
 * Downloads every form listed as a row on the current page.
 *
 * Each row is expected to expose a name plus a download or view button. The
 * button is clicked once, and the form is saved either from the resulting
 * browser download or from a PDF opened in a new tab. Forms whose target file
 * already exists are skipped. When no rows match, the work is delegated to
 * `downloadViaLinks`.
 *
 * @param page       Page currently showing a forms list.
 * @param outputDir  Directory that receives `<sanitized name>.pdf`.
 * @param downloaded Receives the sanitized name of each form actually saved.
 * @param failed     Receives the display name of each form that could not be saved.
 */
async function downloadFormsOnPage(
  page: import('playwright').Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
): Promise<void> {
  const formRows = await page.$$(
    '[class*="form-item"], [class*="formItem"], [class*="form-row"], [role="row"], [class*="list-item"]'
  );
  if (formRows.length === 0) {
    // Fallback: look for any link or button that mentions PDF or has a form name
    console.log(' No form rows found with primary selectors, trying fallback...');
    await downloadViaLinks(page, outputDir, downloaded, failed);
    return;
  }
  console.log(` Found ${formRows.length} form rows`);

  const context = page.context();
  const clickTimeoutMs = 15_000;

  for (const row of formRows) {
    const name = await row
      .$eval('[class*="name"], [class*="title"], span, td', (el) => el.textContent?.trim())
      .catch(() => null);
    if (!name) continue;

    const sanitized = sanitizeFormFileStem(name);
    if (!sanitized) {
      // Nothing left after sanitizing: do not create a file literally named ".pdf".
      console.log(` ⚠ No usable file name for: ${name}`);
      continue;
    }
    const destPath = path.join(outputDir, `${sanitized}.pdf`);

    // Skip already downloaded
    const alreadyExists = await fs.access(destPath).then(
      () => true,
      () => false
    );
    if (alreadyExists) {
      console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
      continue;
    }

    // Find the download/view button within this row
    const downloadBtn = await row.$(
      'button[aria-label*="download" i], button[title*="download" i], [class*="download"]'
    );
    const viewBtn = await row.$('button[aria-label*="view" i], button[title*="view" i], [class*="view"]');
    const btn = downloadBtn ?? viewBtn;
    if (!btn) {
      console.log(` ⚠ No download button found for: ${name}`);
      continue;
    }

    try {
      // Arm both listeners BEFORE the single click, so the click is never
      // repeated and whichever of "download" / "new tab" happens is caught.
      // A timeout resolves to null instead of rejecting.
      const pendingDownload: Promise<FormClickOutcome | null> = page
        .waitForEvent('download', { timeout: clickTimeoutMs })
        .then(
          (download): FormClickOutcome => ({ kind: 'download', download }),
          () => null
        );
      const pendingPopup: Promise<FormClickOutcome | null> = context
        .waitForEvent('page', { timeout: clickTimeoutMs })
        .then(
          (popup): FormClickOutcome => ({ kind: 'popup', popup }),
          () => null
        );

      await btn.click();

      // First listener to settle wins; if it settled with null, wait for the other.
      const outcome =
        (await Promise.race([pendingDownload, pendingPopup])) ??
        (await pendingDownload) ??
        (await pendingPopup);

      if (outcome === null) {
        throw new Error('click produced neither a download nor a new tab');
      }

      if (outcome.kind === 'download') {
        await outcome.download.saveAs(destPath);
        console.log(`${sanitized}.pdf`);
      } else {
        await savePdfFromPopup(outcome.popup, context, destPath);
        console.log(`${sanitized}.pdf (via page)`);
      }
      // Reached only when a file was really written.
      downloaded.push(sanitized);
    } catch (err) {
      console.log(` ✗ Failed: ${name}: ${describeScrapeError(err)}`);
      failed.push(name);
    }
  }
}
/**
 * Fallback scraper for pages without recognisable form rows.
 *
 * Clicks up to 100 list-like items one by one while watching the page's
 * network responses. When a click is followed by a 200 response whose
 * content type mentions "pdf", that URL is fetched again and saved under the
 * item's (sanitized, max 80 character) text. Items whose target file already
 * exists are skipped without clicking, matching the row-based path.
 *
 * @param page       Page currently showing a forms list.
 * @param outputDir  Directory that receives `<sanitized text>.pdf`.
 * @param downloaded Receives the sanitized name of each form actually saved.
 * @param failed     Receives the sanitized name of each form that could not be saved.
 */
async function downloadViaLinks(
  page: import('playwright').Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
): Promise<void> {
  // URLs of PDF responses seen since the listener was attached, in arrival order.
  const pdfUrls: string[] = [];
  const onResponse = (response: import('playwright').Response): void => {
    const contentType = (response.headers()['content-type'] ?? '').toLowerCase();
    if (contentType.includes('pdf') && response.status() === 200) {
      pdfUrls.push(response.url());
    }
  };

  page.on('response', onResponse);
  try {
    const items = await page.$$('li, [role="listitem"], [class*="item"]');
    for (const item of items.slice(0, 100)) { // cap at 100 to bound the run time
      const text = await item.textContent().catch(() => '');
      // The second trim drops a space that slice() may leave at the end.
      const sanitized = (text ?? '')
        .replace(/[^a-z0-9 ._-]/gi, ' ')
        .replace(/\s+/g, ' ')
        .trim()
        .slice(0, 80)
        .trim();
      if (!sanitized) continue;

      const destPath = path.join(outputDir, `${sanitized}.pdf`);
      const alreadyExists = await fs.access(destPath).then(
        () => true,
        () => false
      );
      if (alreadyExists) {
        console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
        continue;
      }

      const seenBefore = pdfUrls.length;
      // Best effort: many matched items are not clickable at all.
      await item.click({ timeout: 3_000 }).catch(() => undefined);
      await page.waitForTimeout(1_000);
      if (pdfUrls.length <= seenBefore) continue;

      const url = pdfUrls[pdfUrls.length - 1];
      if (url === undefined) continue;

      try {
        // Fetch through the context's request API, which shares the pages' cookies.
        const response = await page.context().request.get(url);
        if (!response.ok()) {
          throw new Error(`HTTP ${response.status()} while fetching ${url}`);
        }
        await fs.writeFile(destPath, await response.body());
        console.log(`${sanitized}.pdf`);
        downloaded.push(sanitized);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.log(` ✗ Failed saving ${sanitized}: ${reason}`);
        failed.push(sanitized);
      }
    }
  } finally {
    // This function runs once per tab; without this the listeners pile up.
    page.off('response', onResponse);
  }
}
// Script entry point: run the scraper; any rejection is printed and turned
// into a non-zero exit status.
function reportFatal(reason: unknown): void {
  console.error('Fatal:', reason);
  process.exit(1);
}

main().catch(reportFatal);