231 lines
8.1 KiB
TypeScript
231 lines
8.1 KiB
TypeScript
|
|
/**
 * SkySlope Forms Scraper
 *
 * Downloads all PDFs from your SkySlope form libraries into seeds/forms/.
 * Run: DOTENV_CONFIG_PATH=.env.local npx tsx scripts/scrape-skyslope-forms.ts
 *
 * Credentials are read from env vars:
 *   SKYSLOPE_LAST_NAME — last name on NAR records (e.g. Copeland)
 *   SKYSLOPE_NRDS_ID   — 9-digit NRDS ID (e.g. 837075029)
 */
|
||
|
|
|
||
|
|
import { chromium } from 'playwright';
|
||
|
|
import * as fs from 'node:fs/promises';
|
||
|
|
import * as path from 'node:path';
|
||
|
|
|
||
|
|
// Credentials for the NRDS authorization form, taken from the environment.
const { SKYSLOPE_LAST_NAME: LAST_NAME, SKYSLOPE_NRDS_ID: NRDS_ID } = process.env;

// PDFs are written to seeds/forms, resolved against the current working directory.
const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');

// Origin of the SkySlope Forms web app; page paths are appended to it.
const BASE_URL = 'https://forms.skyslope.com';

// Stop immediately (exit code 1) unless both credentials are present and non-empty.
if (!(LAST_NAME && NRDS_ID)) {
  console.error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
  process.exit(1);
}
|
||
|
|
|
||
|
|
/**
 * Script entry point: signs in to SkySlope Forms with the NRDS credentials and
 * downloads every form it can find into OUTPUT_DIR.
 *
 * Flow:
 *  1. Create OUTPUT_DIR and launch a visible Chromium window.
 *  2. Open the authorization page, fill last name + NRDS ID, press "Next".
 *  3. Wait until the URL leaves the /welcome flow and the network goes idle.
 *  4. Click an "All" tab when one is visible, scrape the current view, then
 *     click through every tab on the page and scrape each one.
 *  5. Print a summary of downloaded and failed forms.
 *
 * The browser is closed in a `finally`, so it is shut down even when a step
 * throws; the error itself still propagates to the caller.
 *
 * @throws Error when either credential is missing, or when navigation/login fails.
 */
async function main(): Promise<void> {
  // Guard instead of `!` assertions so both values are narrowed to `string` here.
  if (!LAST_NAME || !NRDS_ID) {
    throw new Error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
  }

  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const browser = await chromium.launch({ headless: false }); // visible so you can watch/intervene
  try {
    const context = await browser.newContext({ acceptDownloads: true });
    const page = await context.newPage();

    console.log('Navigating to SkySlope Forms...');
    await page.goto(`${BASE_URL}/welcome/authorization`);

    // Fill NRDS authorization form
    console.log('Filling NRDS credentials...');
    await page.getByLabel(/last name/i).fill(LAST_NAME);
    await page.getByLabel(/nrds/i).fill(NRDS_ID);
    await page.getByRole('button', { name: /next/i }).click();

    // Wait for the main forms library page to load (any path outside /welcome)
    console.log('Waiting for forms library...');
    await page.waitForURL(/forms\.skyslope\.com\/(?!welcome)/, { timeout: 30_000 });
    await page.waitForLoadState('networkidle');

    // Diagnostic only: how many elements look like library containers.
    const libraryCount = await page.$$eval(
      '[class*="library"], [class*="Library"], [data-testid*="library"]',
      els => els.length
    );
    console.log(`Found ${libraryCount} library elements`);

    // Shared accumulators filled in by the download helpers.
    const downloaded: string[] = [];
    const failed: string[] = [];

    // Try clicking "All" tab if it exists
    const allTab = page.getByRole('tab', { name: /all/i }).first();
    if (await allTab.isVisible().catch(() => false)) {
      await allTab.click();
      await page.waitForLoadState('networkidle');
    }

    // Scrape whatever is visible right now
    await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);

    // Also try each library tab. A tab that cannot be opened is skipped so one
    // broken tab does not abort the remaining ones.
    const tabs = await page.getByRole('tab').all();
    for (const tab of tabs) {
      try {
        const tabName = await tab.textContent();
        if (!tabName) continue;
        console.log(`\nChecking library tab: ${tabName.trim()}`);
        await tab.click();
        await page.waitForLoadState('networkidle');
        await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.log(`  ⚠ Skipping tab: ${reason}`);
      }
    }

    console.log(`\n✓ Downloaded ${downloaded.length} forms to ${OUTPUT_DIR}`);
    if (failed.length > 0) {
      console.log(`✗ Failed: ${failed.join(', ')}`);
    }
  } finally {
    // Always release the browser process, including on errors above.
    await browser.close();
  }
}
|
||
|
|
|
||
|
|
/**
 * Downloads the PDF behind every form row currently rendered on `page`.
 *
 * Rows are located with a set of class/role selectors. When none match, the
 * work is delegated to `downloadViaLinks`. For each row the display name is
 * turned into a file name, rows whose file already exists in `outputDir` are
 * skipped, and the row's download (preferred) or view button is clicked.
 *
 * @param page       Page showing the forms list.
 * @param outputDir  Directory the PDFs are written to.
 * @param downloaded Accumulator; the sanitized name of each saved form is appended.
 * @param failed     Accumulator; the display name of each form that could not be saved is appended.
 */
async function downloadFormsOnPage(
  page: import('playwright').Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
) {
  const formRows = await page.$$('[class*="form-item"], [class*="formItem"], [class*="form-row"], [role="row"], [class*="list-item"]');

  if (formRows.length === 0) {
    // Fallback: look for any link or button that mentions PDF or has a form name
    console.log(' No form rows found with primary selectors, trying fallback...');
    await downloadViaLinks(page, outputDir, downloaded, failed);
    return;
  }

  console.log(` Found ${formRows.length} form rows`);

  for (const row of formRows) {
    // First element in the row that looks like a name/title cell; rows without one are ignored.
    const name = await row.$eval(
      '[class*="name"], [class*="title"], span, td',
      el => el.textContent?.trim()
    ).catch(() => null);

    if (!name) continue;

    const sanitized = sanitizeFormRowName(name);
    if (!sanitized) {
      // Nothing left after stripping unsafe characters; would otherwise write ".pdf".
      console.log(` ⚠ No usable file name for: ${name}`);
      continue;
    }
    const destPath = path.join(outputDir, `${sanitized}.pdf`);

    // Skip already downloaded
    try {
      await fs.access(destPath);
      console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
      continue;
    } catch {
      // file doesn't exist, proceed
    }

    // Find the download/view button within this row
    const downloadBtn = await row.$('button[aria-label*="download" i], button[title*="download" i], [class*="download"]');
    const viewBtn = await row.$('button[aria-label*="view" i], button[title*="view" i], [class*="view"]');
    const btn = downloadBtn ?? viewBtn;

    if (!btn) {
      console.log(` ⚠ No download button found for: ${name}`);
      continue;
    }

    try {
      const via = await saveFormFromButton(page, btn, destPath);
      console.log(via === 'download' ? ` ✓ ${sanitized}.pdf` : ` ✓ ${sanitized}.pdf (via page)`);
      downloaded.push(sanitized);
    } catch (err) {
      // Only reached when no file was written, so the form is reported as failed.
      const reason = err instanceof Error ? err.message : String(err);
      console.log(` ✗ Failed: ${name} — ${reason}`);
      failed.push(name);
    }
  }
}

/** Replaces characters outside `a-z 0-9 space . _ -` with spaces and collapses whitespace runs. */
function sanitizeFormRowName(name: string): string {
  return name.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim();
}

/**
 * Clicks `btn` once and saves whatever it produces to `destPath`.
 *
 * Both a browser download and a newly opened page are awaited around a single
 * click; whichever happens first wins. A new page is only accepted when its
 * URL looks like a PDF, in which case the bytes are fetched from inside that
 * page and written to disk.
 *
 * @returns `'download'` when a download event was saved, `'page'` when a PDF page was saved.
 * @throws Error when neither event occurs in time, the opened page is not a PDF,
 *         or the PDF request fails.
 */
async function saveFormFromButton(
  page: import('playwright').Page,
  btn: { click(): Promise<void> },
  destPath: string
): Promise<'download' | 'page'> {
  const waitMs = 15_000;

  const downloadEvent = page
    .waitForEvent('download', { timeout: waitMs })
    .then(download => ({ kind: 'download' as const, download }));
  const popupEvent = page
    .context()
    .waitForEvent('page', { timeout: waitMs })
    .then(popup => ({ kind: 'popup' as const, popup }));

  // The waiter that loses the race times out later; mark it handled so it
  // does not surface as an unhandled rejection.
  downloadEvent.catch(() => undefined);
  popupEvent.catch(() => undefined);

  await btn.click();
  const outcome = await Promise.race([downloadEvent, popupEvent]);

  if (outcome.kind === 'download') {
    await outcome.download.saveAs(destPath);
    return 'download';
  }

  const popup = outcome.popup;
  try {
    await popup.waitForLoadState();
    const url = popup.url();
    if (!(url.endsWith('.pdf') || url.includes('/pdf'))) {
      throw new Error(`opened page is not a PDF: ${url}`);
    }

    // Fetch inside the popup so the request runs in the page's own session.
    const bytes = await popup.evaluate(async (pdfUrl) => {
      const res = await fetch(pdfUrl);
      if (!res.ok) throw new Error(`HTTP ${res.status} fetching ${pdfUrl}`);
      const buf = await res.arrayBuffer();
      return Array.from(new Uint8Array(buf));
    }, url);

    await fs.writeFile(destPath, Buffer.from(bytes));
    return 'page';
  } finally {
    // Close the extra tab on success and on failure alike.
    await popup.close().catch(() => undefined);
  }
}
|
||
|
|
|
||
|
|
/**
 * Fallback downloader used when no form rows are recognised on the page.
 *
 * Listens for HTTP 200 responses whose content-type mentions "pdf", then
 * clicks up to 100 list-like elements one at a time. When a click is followed
 * by a new PDF response, that URL is fetched from inside the page and written
 * to `outputDir` under a file name derived from the clicked element's text.
 *
 * The response listener is removed before returning, so repeated calls on the
 * same page do not accumulate listeners.
 *
 * @param page       Page to scan and click through.
 * @param outputDir  Directory the PDFs are written to.
 * @param downloaded Accumulator; the file name stem of each saved PDF is appended.
 * @param failed     Accumulator; the URL-derived name of each PDF that could not be saved is appended.
 */
async function downloadViaLinks(
  page: import('playwright').Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
) {
  // PDF responses observed so far, in arrival order.
  const pdfResponses: { url: string; name: string }[] = [];

  const onResponse = (response: import('playwright').Response): void => {
    const contentType = response.headers()['content-type'] ?? '';
    if (contentType.includes('pdf') && response.status() === 200) {
      const url = response.url();
      const name = path.basename(new URL(url).pathname, '.pdf') || `form-${Date.now()}`;
      pdfResponses.push({ url, name });
    }
  };

  page.on('response', onResponse);
  try {
    // Click each item that looks like a form
    const items = await page.$$('li, [role="listitem"], [class*="item"]');
    for (const item of items.slice(0, 100)) { // cap at 100 to bound the run time
      const text = await item.textContent().catch(() => '');
      if (!text?.trim()) continue;

      const before = pdfResponses.length;
      await item.click({ timeout: 3_000 }).catch(() => {}); // best-effort: unclickable items are ignored
      await page.waitForTimeout(1_000); // give the click time to trigger a PDF response

      const latest = pdfResponses[pdfResponses.length - 1];
      if (pdfResponses.length <= before || !latest) continue;
      const { url, name } = latest;

      // Truncate before the final trim so the 80-char cut cannot leave a trailing space;
      // fall back to the URL-derived name when the text has no usable characters.
      const fromText = text.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim().slice(0, 80).trim();
      const sanitized = fromText || name;
      const destPath = path.join(outputDir, `${sanitized}.pdf`);

      // Skip files already on disk, same as the primary row-based path does.
      try {
        await fs.access(destPath);
        console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
        continue;
      } catch {
        // file doesn't exist, proceed
      }

      try {
        const res = await page.evaluate(async (pdfUrl) => {
          const r = await fetch(pdfUrl);
          if (!r.ok) throw new Error(`HTTP ${r.status} fetching ${pdfUrl}`);
          const buf = await r.arrayBuffer();
          return Array.from(new Uint8Array(buf));
        }, url);
        await fs.writeFile(destPath, Buffer.from(res));
        console.log(` ✓ ${sanitized}.pdf`);
        downloaded.push(sanitized);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.log(` ✗ Failed saving ${name} — ${reason}`);
        failed.push(name);
      }
    }
  } finally {
    page.off('response', onResponse);
  }
}
|
||
|
|
|
||
|
|
// Kick off the scraper; any error escaping main() is reported and turns into exit code 1.
void main().then(undefined, (fatalError: unknown) => {
  console.error('Fatal:', fatalError);
  process.exit(1);
});
|