From 27462a0ebbc16d405b37947cfa601ae66d31c937 Mon Sep 17 00:00:00 2001
From: Chandler Copeland
Date: Thu, 19 Mar 2026 22:27:41 -0600
Subject: [PATCH] feat: add Playwright script to scrape SkySlope form libraries into seeds/forms/

---
 teressa-copeland-homes/package.json | 4 +-
 .../scripts/scrape-skyslope-forms.ts | 311 ++++++++++++++++++
 2 files changed, 314 insertions(+), 1 deletion(-)
 create mode 100644 teressa-copeland-homes/scripts/scrape-skyslope-forms.ts

diff --git a/teressa-copeland-homes/package.json b/teressa-copeland-homes/package.json
index eb61722..9715653 100644
--- a/teressa-copeland-homes/package.json
+++ b/teressa-copeland-homes/package.json
@@ -11,7 +11,8 @@
     "db:migrate": "drizzle-kit migrate",
     "db:seed": "tsx scripts/seed.ts",
     "seed:forms": "DOTENV_CONFIG_PATH=.env.local npx tsx scripts/seed-forms.ts",
-    "db:studio": "drizzle-kit studio"
+    "db:studio": "drizzle-kit studio",
+    "scrape:forms": "DOTENV_CONFIG_PATH=.env.local npx tsx scripts/scrape-skyslope-forms.ts"
   },
   "dependencies": {
     "@vercel/blob": "^2.3.1",
@@ -38,6 +39,7 @@
     "drizzle-kit": "^0.31.10",
     "eslint": "^9",
     "eslint-config-next": "16.2.0",
+    "playwright": "^1.58.2",
     "tailwindcss": "^4",
     "tsx": "^4.21.0",
     "typescript": "^5"
diff --git a/teressa-copeland-homes/scripts/scrape-skyslope-forms.ts b/teressa-copeland-homes/scripts/scrape-skyslope-forms.ts
new file mode 100644
index 0000000..72fe98c
--- /dev/null
+++ b/teressa-copeland-homes/scripts/scrape-skyslope-forms.ts
@@ -0,0 +1,311 @@
+/**
+ * SkySlope Forms Scraper
+ *
+ * Downloads all PDFs from your SkySlope form libraries into seeds/forms/.
+ * Run: DOTENV_CONFIG_PATH=.env.local npx tsx scripts/scrape-skyslope-forms.ts
+ *
+ * Credentials are read from env vars:
+ *   SKYSLOPE_LAST_NAME  — last name on NAR records (e.g. Smith)
+ *   SKYSLOPE_NRDS_ID    — 9-digit NRDS ID (e.g. 123456789)
+ *
+ * NOTE(review): this file reads process.env directly and never loads dotenv,
+ * so DOTENV_CONFIG_PATH only helps if something else preloads dotenv — confirm,
+ * or export the two variables in the shell before running.
+ */
+
+import { chromium } from 'playwright';
+import type { Download, ElementHandle, Page, Response as PlaywrightResponse } from 'playwright';
+import * as fs from 'node:fs/promises';
+import * as path from 'node:path';
+
+const LAST_NAME = process.env.SKYSLOPE_LAST_NAME;
+const NRDS_ID = process.env.SKYSLOPE_NRDS_ID;
+const OUTPUT_DIR = path.resolve(process.cwd(), 'seeds/forms');
+const BASE_URL = 'https://forms.skyslope.com';
+/** How long to wait for a click to produce a download or a new tab. */
+const CLICK_RESULT_TIMEOUT_MS = 15_000;
+
+if (!LAST_NAME || !NRDS_ID) {
+  console.error('Missing required env vars: SKYSLOPE_LAST_NAME, SKYSLOPE_NRDS_ID');
+  process.exit(1);
+}
+
+/** Element handle type returned by `page.$$` / `row.$` with a string selector. */
+type FormElement = ElementHandle<SVGElement | HTMLElement>;
+
+/** What a single click on a download/view button produced. */
+type ClickOutcome =
+  | { kind: 'download'; download: Download }
+  | { kind: 'popup'; popup: Page }
+  | { kind: 'none' };
+
+/** Extracts a printable message from an unknown thrown value. */
+function errorMessage(err: unknown): string {
+  return err instanceof Error ? err.message : String(err);
+}
+
+/**
+ * Turns a form title into a filesystem-safe base name (no extension).
+ * Returns '' when the title contains no usable characters.
+ */
+function sanitizeFileName(raw: string, maxLength?: number): string {
+  const cleaned = raw.replace(/[^a-z0-9 ._-]/gi, ' ').replace(/\s+/g, ' ').trim();
+  return maxLength === undefined ? cleaned : cleaned.slice(0, maxLength);
+}
+
+/** Resolves to true when `filePath` is accessible on disk. */
+async function fileExists(filePath: string): Promise<boolean> {
+  try {
+    await fs.access(filePath);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Fetches `url` with `fetch()` evaluated inside `page` and returns the body
+ * bytes. Throws when the response status is not OK.
+ */
+async function fetchBytesInPage(page: Page, url: string): Promise<Buffer> {
+  const bytes = await page.evaluate(async (pdfUrl) => {
+    const res = await fetch(pdfUrl);
+    if (!res.ok) throw new Error(`HTTP ${res.status} fetching ${pdfUrl}`);
+    // evaluate() results must be serializable, hence number[] not ArrayBuffer.
+    return Array.from(new Uint8Array(await res.arrayBuffer()));
+  }, url);
+  return Buffer.from(bytes);
+}
+
+/**
+ * Clicks `btn` exactly once and saves the resulting PDF to `destPath`.
+ *
+ * Both the download listener and the new-tab listener are armed before the
+ * click, so a button that opens the PDF in a new tab is not clicked twice.
+ *
+ * @returns true when a file was written, false when the click produced
+ *          neither a download nor a tab whose URL looks like a PDF.
+ */
+async function clickAndSavePdf(page: Page, btn: FormElement, destPath: string): Promise<boolean> {
+  const none: ClickOutcome = { kind: 'none' };
+  const downloadWait = page
+    .waitForEvent('download', { timeout: CLICK_RESULT_TIMEOUT_MS })
+    .then((download): ClickOutcome => ({ kind: 'download', download }), () => none);
+  const popupWait = page
+    .context()
+    .waitForEvent('page', { timeout: CLICK_RESULT_TIMEOUT_MS })
+    .then((popup): ClickOutcome => ({ kind: 'popup', popup }), () => none);
+
+  await btn.click();
+  const outcome = await Promise.race([downloadWait, popupWait]);
+
+  if (outcome.kind === 'download') {
+    await outcome.download.saveAs(destPath);
+    return true;
+  }
+  if (outcome.kind === 'none') return false;
+
+  const { popup } = outcome;
+  try {
+    await popup.waitForLoadState();
+    const url = popup.url();
+    if (!url.endsWith('.pdf') && !url.includes('/pdf')) return false;
+    await fs.writeFile(destPath, await fetchBytesInPage(popup, url));
+    return true;
+  } finally {
+    // Always close the extra tab, even when saving failed.
+    await popup.close();
+  }
+}
+
+/**
+ * Logs in with the NRDS credentials, then visits the "All" tab (when present)
+ * and every other tab, saving each form PDF into OUTPUT_DIR.
+ */
+async function main(lastName: string, nrdsId: string): Promise<void> {
+  await fs.mkdir(OUTPUT_DIR, { recursive: true });
+
+  const browser = await chromium.launch({ headless: false }); // visible so you can watch/intervene
+  try {
+    const context = await browser.newContext({ acceptDownloads: true });
+    const page = await context.newPage();
+
+    console.log('Navigating to SkySlope Forms...');
+    await page.goto(`${BASE_URL}/welcome/authorization`);
+
+    // Fill NRDS authorization form
+    console.log('Filling NRDS credentials...');
+    await page.getByLabel(/last name/i).fill(lastName);
+    await page.getByLabel(/nrds/i).fill(nrdsId);
+    await page.getByRole('button', { name: /next/i }).click();
+
+    // Wait for the main forms library page to load
+    console.log('Waiting for forms library...');
+    await page.waitForURL(/forms\.skyslope\.com\/(?!welcome)/, { timeout: 30_000 });
+    await page.waitForLoadState('networkidle');
+
+    // Count library tabs/sections (logged for diagnostics only)
+    const libraryCount = await page.$$eval(
+      '[class*="library"], [class*="Library"], [data-testid*="library"]',
+      els => els.length
+    );
+    console.log(`Found ${libraryCount} library elements`);
+
+    const downloaded: string[] = [];
+    const failed: string[] = [];
+
+    // Start from the "All" tab if one exists
+    const allTab = page.getByRole('tab', { name: /all/i }).first();
+    if (await allTab.isVisible().catch(() => false)) {
+      await allTab.click();
+      await page.waitForLoadState('networkidle');
+    }
+    await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
+
+    // Also try each library tab; already-saved files are skipped by name
+    const tabs = await page.getByRole('tab').all();
+    for (const tab of tabs) {
+      const tabName = await tab.textContent();
+      if (!tabName) continue;
+      console.log(`\nChecking library tab: ${tabName.trim()}`);
+      await tab.click();
+      await page.waitForLoadState('networkidle');
+      await downloadFormsOnPage(page, OUTPUT_DIR, downloaded, failed);
+    }
+
+    console.log(`\n✓ Downloaded ${downloaded.length} forms to ${OUTPUT_DIR}`);
+    if (failed.length > 0) {
+      console.log(`✗ Failed: ${failed.join(', ')}`);
+    }
+  } finally {
+    // Close the browser even when a step above throws.
+    await browser.close();
+  }
+}
+
+/**
+ * Downloads every form row visible on the current page into `outputDir`.
+ * Falls back to {@link downloadViaLinks} when no row matches the selectors.
+ *
+ * @param downloaded  receives the sanitized name of each saved form
+ * @param failed      receives the display name of each form that could not be saved
+ */
+async function downloadFormsOnPage(
+  page: Page,
+  outputDir: string,
+  downloaded: string[],
+  failed: string[]
+): Promise<void> {
+  const formRows = await page.$$('[class*="form-item"], [class*="formItem"], [class*="form-row"], [role="row"], [class*="list-item"]');
+
+  if (formRows.length === 0) {
+    console.log(' No form rows found with primary selectors, trying fallback...');
+    await downloadViaLinks(page, outputDir, downloaded, failed);
+    return;
+  }
+
+  console.log(` Found ${formRows.length} form rows`);
+
+  for (const row of formRows) {
+    const name = await row
+      .$eval('[class*="name"], [class*="title"], span, td', el => el.textContent?.trim())
+      .catch(() => null);
+    if (!name) continue;
+
+    const sanitized = sanitizeFileName(name);
+    if (!sanitized) continue; // nothing filename-safe left; would become ".pdf"
+    const destPath = path.join(outputDir, `${sanitized}.pdf`);
+
+    if (await fileExists(destPath)) {
+      console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
+      continue;
+    }
+
+    // Prefer an explicit download button, fall back to a view button
+    const btn =
+      (await row.$('button[aria-label*="download" i], button[title*="download" i], [class*="download"]')) ??
+      (await row.$('button[aria-label*="view" i], button[title*="view" i], [class*="view"]'));
+    if (!btn) {
+      console.log(` ⚠ No download button found for: ${name}`);
+      continue;
+    }
+
+    try {
+      if (await clickAndSavePdf(page, btn, destPath)) {
+        console.log(` ✓ ${sanitized}.pdf`);
+        downloaded.push(sanitized);
+      } else {
+        // A click that yields no PDF is a failure, not a silent success.
+        console.log(` ✗ Failed: ${name} — no PDF produced`);
+        failed.push(name);
+      }
+    } catch (err) {
+      console.log(` ✗ Failed: ${name} — ${errorMessage(err)}`);
+      failed.push(name);
+    }
+  }
+}
+
+/**
+ * Fallback for pages without recognizable form rows: clicks up to 100
+ * list-like items and saves any PDF response the click triggers.
+ *
+ * @param downloaded  receives the sanitized name of each saved form
+ * @param failed      receives the sanitized name of each form that could not be saved
+ */
+async function downloadViaLinks(
+  page: Page,
+  outputDir: string,
+  downloaded: string[],
+  failed: string[]
+): Promise<void> {
+  // Record the URL of every successful PDF response seen while clicking
+  const pdfUrls: string[] = [];
+  const onResponse = (response: PlaywrightResponse): void => {
+    const contentType = response.headers()['content-type'] ?? '';
+    if (contentType.includes('pdf') && response.status() === 200) {
+      pdfUrls.push(response.url());
+    }
+  };
+  page.on('response', onResponse);
+
+  try {
+    const items = await page.$$('li, [role="listitem"], [class*="item"]');
+    for (const item of items.slice(0, 100)) { // cap at 100 to bound the run time
+      const text = await item.textContent().catch(() => null);
+      const sanitized = sanitizeFileName(text ?? '', 80);
+      if (!sanitized) continue;
+
+      const destPath = path.join(outputDir, `${sanitized}.pdf`);
+      if (await fileExists(destPath)) {
+        console.log(` ⊙ Skipping (already exists): ${sanitized}.pdf`);
+        continue;
+      }
+
+      const before = pdfUrls.length;
+      await item.click({ timeout: 3_000 }).catch(() => {});
+      await page.waitForTimeout(1_000);
+
+      const url = pdfUrls.length > before ? pdfUrls[pdfUrls.length - 1] : undefined;
+      if (!url) continue;
+
+      try {
+        await fs.writeFile(destPath, await fetchBytesInPage(page, url));
+        console.log(` ✓ ${sanitized}.pdf`);
+        downloaded.push(sanitized);
+      } catch (err) {
+        console.log(` ✗ Failed saving ${sanitized} — ${errorMessage(err)}`);
+        failed.push(sanitized);
+      }
+    }
+  } finally {
+    // Detach the listener; this function can run once per tab and would
+    // otherwise stack a new handler on the page every time.
+    page.off('response', onResponse);
+  }
+}
+
+main(LAST_NAME, NRDS_ID).catch(err => {
+  console.error('Fatal:', err);
+  process.exit(1);
+});