/**
 * SkySlope Forms Scraper
 *
 * Downloads all PDFs from your SkySlope form libraries into seeds/forms/.
 * Run: DOTENV_CONFIG_PATH=.env.local npx tsx scripts/scrape-skyslope-forms.ts
 *
 * Credentials are read from env vars (never hard-code real values here):
 *   SKYSLOPE_LAST_NAME — last name on NAR records
 *   SKYSLOPE_NRDS_ID   — 9-digit NRDS ID
 */
import { chromium } from 'playwright';
import type { Download, ElementHandle, Page, Response as PlaywrightResponse } from 'playwright';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';

const LAST_NAME = process.env.SKYSLOPE_LAST_NAME;
const NRDS_ID = process.env.SKYSLOPE_NRDS_ID;
const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');
const BASE_URL = 'https://forms.skyslope.com';

if (!LAST_NAME || !NRDS_ID) {
  console.error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
  process.exit(1);
}

/** What a click on a row's download/view button turned into. */
type ClickOutcome =
  | { kind: 'download'; download: Download }
  | { kind: 'popup'; popup: Page };

/**
 * Turns arbitrary UI text into a file-name stem: every character outside
 * `a-z0-9 ._-` becomes a space, whitespace runs collapse to one space, and
 * the result is trimmed. When `maxLength` is given the stem is cut to that
 * many characters and trimmed again so it cannot end in a space.
 */
function sanitizeFileName(raw: string, maxLength = Number.POSITIVE_INFINITY): string {
  return raw
    .replace(/[^a-z0-9 ._-]/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .slice(0, maxLength)
    .trim();
}

/** Resolves to true when `filePath` is accessible, false otherwise. */
async function fileExists(filePath: string): Promise<boolean> {
  try {
    await fs.access(filePath);
    return true;
  } catch {
    return false;
  }
}

/** Extracts a printable message from a caught value of unknown type. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Fetches `url` from inside `page` and returns the response body as a Buffer.
 *
 * @throws if the in-page fetch rejects or the response status is not 2xx, so
 *   an error page is never written to disk as a `.pdf`.
 */
async function fetchPdfBytes(page: Page, url: string): Promise<Buffer> {
  const bytes = await page.evaluate(async (pdfUrl) => {
    const res = await fetch(pdfUrl);
    if (!res.ok) {
      throw new Error(`HTTP ${res.status} while fetching ${pdfUrl}`);
    }
    // Binary data cannot cross the evaluate boundary directly; send plain numbers.
    return Array.from(new Uint8Array(await res.arrayBuffer()));
  }, url);
  return Buffer.from(bytes);
}

/**
 * Resolves with the first non-null value produced by `candidates`, or with
 * null once every candidate has settled without producing one.
 */
function firstNonNull<T>(candidates: readonly Promise<T | null>[]): Promise<T | null> {
  return new Promise<T | null>((resolve) => {
    let pending = candidates.length;
    if (pending === 0) {
      resolve(null);
      return;
    }
    const settleEmpty = (): void => {
      pending -= 1;
      if (pending === 0) resolve(null);
    };
    for (const candidate of candidates) {
      candidate.then((value) => {
        if (value !== null) resolve(value);
        else settleEmpty();
      }, settleEmpty);
    }
  });
}

/**
 * Clicks `button` exactly once and saves whatever PDF the click produces to
 * `destPath`.
 *
 * Both waiters are armed before the click so neither event can be missed,
 * and the button is never clicked a second time.
 *
 * @returns `'download'` when a download event was saved, `'page'` when the
 *   PDF was fetched from a newly opened tab.
 * @throws if the click yields neither event within the timeout, if the new
 *   tab's URL does not look like a PDF, or if saving fails.
 */
async function savePdfFromClick(
  page: Page,
  button: Pick<ElementHandle, 'click'>,
  destPath: string
): Promise<'download' | 'page'> {
  const timeout = 15_000;
  // Timeouts are mapped to null so the losing waiter never rejects unhandled.
  const downloadWait = page.waitForEvent('download', { timeout }).then(
    (download): ClickOutcome => ({ kind: 'download', download }),
    () => null
  );
  const popupWait = page.context().waitForEvent('page', { timeout }).then(
    (popup): ClickOutcome => ({ kind: 'popup', popup }),
    () => null
  );

  await button.click();
  const outcome = await firstNonNull<ClickOutcome>([downloadWait, popupWait]);

  if (outcome === null) {
    throw new Error('click produced neither a download nor a new tab');
  }
  if (outcome.kind === 'download') {
    await outcome.download.saveAs(destPath);
    return 'download';
  }

  const { popup } = outcome;
  try {
    await popup.waitForLoadState();
    const url = popup.url();
    if (!(url.endsWith('.pdf') || url.includes('/pdf'))) {
      throw new Error(`new tab did not open a PDF: ${url}`);
    }
    await fs.writeFile(destPath, await fetchPdfBytes(popup, url));
    return 'page';
  } finally {
    // Always close the tab, including on failure, so tabs do not pile up.
    await popup.close().catch(() => {});
  }
}

/**
 * Logs in with the NRDS credentials, then walks the "All" tab (when present)
 * and every library tab, downloading the forms found on each. The browser is
 * closed even when a step throws.
 */
async function main(): Promise<void> {
  // `main` is a hoisted declaration, so the module-level env check does not
  // narrow these constants here; re-check locally instead of asserting with `!`.
  if (!LAST_NAME || !NRDS_ID) {
    throw new Error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
  }

  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const browser = await chromium.launch({ headless: false }); // visible so you can watch/intervene
  try {
    const context = await browser.newContext({ acceptDownloads: true });
    const page = await context.newPage();

    console.log('Navigating to SkySlope Forms...');
    await page.goto(`${BASE_URL}/welcome/authorization`);

    // Fill NRDS authorization form
    console.log('Filling NRDS credentials...');
    await page.getByLabel(/last name/i).fill(LAST_NAME);
    await page.getByLabel(/nrds/i).fill(NRDS_ID);
    await page.getByRole('button', { name: /next/i }).click();

    // Wait for the main forms library page to load
    console.log('Waiting for forms library...');
    await page.waitForURL(/forms\.skyslope\.com\/(?!welcome)/, { timeout: 30_000 });
    await page.waitForLoadState('networkidle');

    // Find all library tabs/sections (only the count is reported)
    const libraries = await page.$$eval(
      '[class*="library"], [class*="Library"], [data-testid*="library"]',
      (els) => els.map((el) => ({ text: el.textContent?.trim(), id: el.id }))
    );
    console.log(`Found ${libraries.length} library elements`);

    // Collect all form links across all tabs
    const downloaded: string[] = [];
    const failed: string[] = [];

    // Try clicking "All" tab if it exists
    const allTab = page.getByRole('tab', { name: /all/i }).first();
    if (await allTab.isVisible().catch(() => false)) {
      await allTab.click();
      await page.waitForLoadState('networkidle');
    }

    // Scrape all visible form items across all library sections
    await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);

    // Also try each library tab
    const tabs = await page.getByRole('tab').all();
    for (const tab of tabs) {
      const tabName = await tab.textContent();
      if (!tabName) continue;
      console.log(`\nChecking library tab: ${tabName.trim()}`);
      await tab.click();
      await page.waitForLoadState('networkidle');
      await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
    }

    console.log(`\n✓ Downloaded ${downloaded.length} forms to ${OUTPUT_DIR}`);
    if (failed.length > 0) {
      console.log(`✗ Failed: ${failed.join(', ')}`);
    }
  } finally {
    await browser.close();
  }
}

/**
 * Downloads every form row visible on the current page into `outputDir`.
 *
 * Rows whose target file already exists are skipped. When no rows match the
 * primary selectors, the work is delegated to `downloadViaLinks`.
 *
 * @param page       page currently showing a forms list
 * @param outputDir  directory the PDFs are written to
 * @param downloaded receives the sanitized name of each saved form (mutated)
 * @param failed     receives the display name of each form that failed (mutated)
 */
async function downloadFormsOnPage(
  page: Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
): Promise<void> {
  // Look for form items — SkySlope renders them as list rows with a download or view button
  const formRows = await page.$$(
    '[class*="form-item"], [class*="formItem"], [class*="form-row"], [role="row"], [class*="list-item"]'
  );

  if (formRows.length === 0) {
    // Fallback: look for any link or button that mentions PDF or has a form name
    console.log(' No form rows found with primary selectors, trying fallback...');
    await downloadViaLinks(page, outputDir, downloaded, failed);
    return;
  }

  console.log(` Found ${formRows.length} form rows`);

  for (const row of formRows) {
    const name = await row
      .$eval('[class*="name"], [class*="title"], span, td', (el) => el.textContent?.trim())
      .catch(() => null);
    if (!name) continue;

    const sanitized = sanitizeFileName(name);
    if (!sanitized) {
      // A name made only of stripped characters would otherwise become ".pdf".
      console.log(` ⚠ No usable file name for: ${name}`);
      continue;
    }

    const destPath = path.join(outputDir, `${sanitized}.pdf`);
    if (await fileExists(destPath)) {
      console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
      continue;
    }

    // Find the download/view button within this row (download preferred)
    const downloadBtn = await row.$(
      'button[aria-label*="download" i], button[title*="download" i], [class*="download"]'
    );
    const viewBtn = await row.$(
      'button[aria-label*="view" i], button[title*="view" i], [class*="view"]'
    );
    const btn = downloadBtn ?? viewBtn;
    if (!btn) {
      console.log(` ⚠ No download button found for: ${name}`);
      continue;
    }

    try {
      const via = await savePdfFromClick(page, btn, destPath);
      console.log(via === 'download' ? ` ✓ ${sanitized}.pdf` : ` ✓ ${sanitized}.pdf (via page)`);
      downloaded.push(sanitized);
    } catch (err) {
      console.log(` ✗ Failed: ${name} — ${errorMessage(err)}`);
      failed.push(name);
    }
  }
}

/**
 * Fallback scraper: clicks up to 100 list-like elements and, whenever a click
 * triggers a PDF response, fetches that URL and saves it under a name derived
 * from the element's text (capped at 80 characters).
 *
 * The response listener is removed before returning, so repeated calls (one
 * per library tab) do not stack listeners on the page.
 *
 * @param page       page currently showing a forms list
 * @param outputDir  directory the PDFs are written to
 * @param downloaded receives the sanitized name of each saved form (mutated)
 * @param failed     receives the URL-derived name of each form that failed (mutated)
 */
async function downloadViaLinks(
  page: Page,
  outputDir: string,
  downloaded: string[],
  failed: string[]
): Promise<void> {
  // Record every successful PDF response seen while items are being clicked
  const pdfResponses: { url: string; name: string }[] = [];
  const onResponse = (response: PlaywrightResponse): void => {
    const contentType = response.headers()['content-type'] ?? '';
    if (contentType.includes('pdf') && response.status() === 200) {
      const url = response.url();
      const name = path.basename(new URL(url).pathname, '.pdf') || `form-${Date.now()}`;
      pdfResponses.push({ url, name });
    }
  };

  page.on('response', onResponse);
  try {
    // Click each item that looks like a form
    const items = await page.$$('li, [role="listitem"], [class*="item"]');
    for (const item of items.slice(0, 100)) {
      // cap at 100 to avoid infinite loops
      const text = await item.textContent().catch(() => '');
      if (!text?.trim()) continue;

      const sanitized = sanitizeFileName(text, 80);
      if (!sanitized) continue;

      const destPath = path.join(outputDir, `${sanitized}.pdf`);
      if (await fileExists(destPath)) {
        // Same policy as the primary path; avoids re-downloading on every tab.
        console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
        continue;
      }

      const before = pdfResponses.length;
      // Best-effort click: many matched elements are not clickable forms.
      await item.click({ timeout: 3_000 }).catch(() => {});
      await page.waitForTimeout(1_000);

      const latest =
        pdfResponses.length > before ? pdfResponses[pdfResponses.length - 1] : undefined;
      if (!latest) continue;

      try {
        await fs.writeFile(destPath, await fetchPdfBytes(page, latest.url));
        console.log(` ✓ ${sanitized}.pdf`);
        downloaded.push(sanitized);
      } catch (err) {
        console.log(` ✗ Failed saving ${latest.name} — ${errorMessage(err)}`);
        failed.push(latest.name);
      }
    }
  } finally {
    page.off('response', onResponse);
  }
}

main().catch((err) => {
  console.error('Fatal:', err);
  process.exit(1);
});