diff --git a/teressa-copeland-homes/Dockerfile b/teressa-copeland-homes/Dockerfile index d404ce5..2ce764c 100644 --- a/teressa-copeland-homes/Dockerfile +++ b/teressa-copeland-homes/Dockerfile @@ -30,6 +30,9 @@ COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static # Copy seeds/forms for form library import feature (runtime dependency) COPY --from=builder --chown=nextjs:nodejs /app/seeds ./seeds +# Copy polyfill script (required via NODE_OPTIONS before server starts) +COPY --from=builder --chown=nextjs:nodejs /app/scripts/polyfill-dom.cjs ./scripts/polyfill-dom.cjs + # Create uploads directory (will be mounted as volume) RUN mkdir -p uploads && chown nextjs:nodejs uploads diff --git a/teressa-copeland-homes/docker-compose.yml b/teressa-copeland-homes/docker-compose.yml index bd60353..2346c20 100644 --- a/teressa-copeland-homes/docker-compose.yml +++ b/teressa-copeland-homes/docker-compose.yml @@ -26,7 +26,7 @@ services: env_file: - .env.production environment: - - NODE_OPTIONS=--dns-result-order=ipv4first + - NODE_OPTIONS=--dns-result-order=ipv4first --require /app/scripts/polyfill-dom.cjs dns: - 8.8.8.8 - 1.1.1.1 diff --git a/teressa-copeland-homes/scripts/polyfill-dom.cjs b/teressa-copeland-homes/scripts/polyfill-dom.cjs new file mode 100644 index 0000000..fcc709b --- /dev/null +++ b/teressa-copeland-homes/scripts/polyfill-dom.cjs @@ -0,0 +1,11 @@ +// Polyfills for browser globals required by pdfjs-dist in Node.js. +// Required via NODE_OPTIONS=--require so it runs before any module evaluation. 
+if (typeof globalThis.DOMMatrix === 'undefined') { + globalThis.DOMMatrix = class DOMMatrix { constructor() { return this; } }; +} +if (typeof globalThis.ImageData === 'undefined') { + globalThis.ImageData = class ImageData { constructor() { return this; } }; +} +if (typeof globalThis.Path2D === 'undefined') { + globalThis.Path2D = class Path2D { constructor() { return this; } }; +} diff --git a/teressa-copeland-homes/src/instrumentation.ts b/teressa-copeland-homes/src/instrumentation.ts new file mode 100644 index 0000000..cd80d8f --- /dev/null +++ b/teressa-copeland-homes/src/instrumentation.ts @@ -0,0 +1,23 @@ +/** + * Next.js instrumentation hook — runs once before any route handler. + * Polyfills browser globals required by pdfjs-dist at module load time. + * @napi-rs/canvas normally provides these but may be absent in some server + * environments (e.g. linux/amd64 Docker image on ARM build host). + * Text extraction never actually calls these — pdfjs needs the classes defined. + */ +export async function register() { + if (process.env.NEXT_RUNTIME === 'nodejs') { + if (typeof globalThis.DOMMatrix === 'undefined') { + // @ts-expect-error minimal stub + globalThis.DOMMatrix = class DOMMatrix { constructor() { return this; } }; + } + if (typeof globalThis.ImageData === 'undefined') { + // @ts-expect-error minimal stub + globalThis.ImageData = class ImageData { constructor() { return this; } }; + } + if (typeof globalThis.Path2D === 'undefined') { + // @ts-expect-error minimal stub + globalThis.Path2D = class Path2D { constructor() { return this; } }; + } + } +} diff --git a/teressa-copeland-homes/src/lib/ai/extract-text.ts b/teressa-copeland-homes/src/lib/ai/extract-text.ts index 312e6a8..863327f 100644 --- a/teressa-copeland-homes/src/lib/ai/extract-text.ts +++ b/teressa-copeland-homes/src/lib/ai/extract-text.ts @@ -1,66 +1,310 @@ // server-only — never import from client components -// This module uses pdfjs-dist legacy build in Node.js fake-worker mode (no 
browser worker). -// The client components (PdfViewer.tsx, PreviewModal.tsx) set workerSrc independently. +// Extracts blank field positions from the PDF text layer by finding underscore sequences. +// No image rendering needed — coordinates come directly from pdfjs in PDF user-space. // @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs'; -import { createCanvas } from '@napi-rs/canvas'; import { readFile } from 'node:fs/promises'; import { join } from 'node:path'; -// pdfjs-dist 5.x fake-worker mode: must point workerSrc to the actual worker file so -// _setupFakeWorkerGlobal can `await import(this.workerSrc)` in Node.js. -// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs. GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`; -/** A rendered page image, ready to send to GPT-4o vision. */ -export interface PageImage { - page: number; // 1-indexed - width: number; // original PDF width in points (scale 1.0) - height: number; // original PDF height in points (scale 1.0) - base64: string; // JPEG base64 of the rendered page (no data: prefix) +/** + * A blank field detected from the PDF text layer. + * Coordinates are in PDF user-space (bottom-left origin, points). + */ +export interface BlankField { + page: number; + x: number; + y: number; + width: number; + fontSize: number; + contextBefore: string; + contextAfter: string; + contextAbove: string; + contextBelow: string; + /** 1-indexed position of this blank among all pure-underscore blanks on its line (Strategy 1 only). */ + rowIndex?: number; + /** Total pure-underscore blanks on this line (Strategy 1 only). 
*/ + rowTotal?: number; } -// Legacy type alias kept for callers that still reference PageText -export type PageText = PageImage; +const MIN_UNDERSCORES = 2; -const RENDER_SCALE = 1.5; // 72dpi × 1.5 = 108dpi — good for vision without huge payloads +// Two consecutive underscore items are considered the SAME blank when their gap is ≤ this. +// Font-split items (same blank rendered as multiple runs) have gap ~0-2pt. +// Column separators in signature blocks are typically 5-30pt — kept separate. +const MAX_MERGE_GAP_PT = 3; -export async function extractPdfText(filePath: string): Promise<PageImage[]> { +interface RawTextItem { + str: string; + transform: number[]; + width: number; +} + +function isPureUnderscoreItem(item: RawTextItem): boolean { + const count = (item.str.match(/_/g) ?? []).length; + if (count < MIN_UNDERSCORES) return false; + return item.str.replace(/[_ ]/g, '').length <= 1; +} + +function groupIntoLines(items: RawTextItem[]): Array<{ y: number; items: RawTextItem[] }> { + const sorted = [...items].sort((a, b) => b.transform[5] - a.transform[5]); + const groups: Array<{ y: number; items: RawTextItem[] }> = []; + + for (const item of sorted) { + const itemY = item.transform[5]; + const last = groups[groups.length - 1]; + if (last && Math.abs(last.y - itemY) <= 5) { + last.items.push(item); + } else { + groups.push({ y: itemY, items: [item] }); + } + } + + for (const g of groups) g.items.sort((a, b) => a.transform[4] - b.transform[4]); + + return groups; +} + +/** + * Build context string from line items, excluding pure-underscore items + * (they're adjacent blanks, not label text) and stripping stray bracket chars. + */ +function lineContext(items: RawTextItem[]): string { + return items + .map(i => i.str.replace(/[\[\]]/g, '').trim()) // strip [ ] bracket chars + .filter(s => s.replace(/[_ ]/g, '').length > 0) // drop pure-underscore/space items + .join(' ') + .trim(); +} + +/** + * Extract all blank fields from a PDF using four detection strategies. 
+ * + * Strategy 1 — pure underscore runs with geometric merging: + * Consecutive underscore items with a gap ≤ MAX_MERGE_GAP_PT are one blank. + * Large gaps (column spacing) keep blanks separate. + * + * Strategy 2 — embedded underscore runs: + * Items like "Date ___________" containing 5+ underscores mixed with label text. + * Position estimated by character ratio. + * + * Strategy 3 — single-item bracket blanks: + * "[ ]" (3+ internal spaces) in one text item. + * Width estimated from the bracket span within the item, not the full item width. + * + * Strategy 4 — multi-item bracket blanks: + * "Seller's Initials [" … spaces … "]" split across consecutive items. + * Width measured as the gap between the bracket items. + */ +export async function extractBlanks(filePath: string): Promise<BlankField[]> { const data = new Uint8Array(await readFile(filePath)); const pdf = await getDocument({ data }).promise; - const pages: PageImage[] = []; + const allBlanks: BlankField[] = []; for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) { const page = await pdf.getPage(pageNum); - const viewport = page.getViewport({ scale: RENDER_SCALE }); + const textContent = await page.getTextContent(); + const items = (textContent.items as RawTextItem[]).filter(i => i.str.length > 0); + const lines = groupIntoLines(items); - // Create an @napi-rs/canvas and render the PDF page into it - const canvas = createCanvas(Math.round(viewport.width), Math.round(viewport.height)); - const ctx = canvas.getContext('2d'); + for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) { + const { y: lineY, items: lineItems } = lines[lineIdx]; - // @ts-ignore — @napi-rs/canvas context is compatible at runtime but types diverge - await page.render({ canvasContext: ctx, viewport }).promise; + // Skip lines that are purely underscores (blank rows) so that stacked signature + // rows still find the "(Seller's Signature) (Address/Phone) (Date)" label line. 
+ const contextAbove = (() => { + for (let li = lineIdx - 1; li >= 0; li--) { + const ctx = lineContext(lines[li].items); + if (ctx) return ctx.slice(0, 100); + } + return ''; + })(); + const contextBelow = (() => { + for (let li = lineIdx + 1; li < lines.length; li++) { + const ctx = lineContext(lines[li].items); + if (ctx) return ctx.slice(0, 100); + } + return ''; + })(); - // Stamp a visible page number in the top-left corner so GPT-4o can correlate - // each image to the correct page number when multiple images are sent in one prompt. - const label = `PAGE ${pageNum}`; - const fontSize = Math.round(viewport.height * 0.025); - ctx.fillStyle = 'rgba(220,30,30,0.85)'; - ctx.fillRect(0, 0, fontSize * (label.length * 0.65), fontSize * 1.5); - ctx.fillStyle = '#ffffff'; - ctx.font = `bold ${fontSize}px sans-serif`; - ctx.fillText(label, 4, fontSize * 1.15); + // ── Strategy 1: pure underscore runs (geometric merge) ─────────────────── + // Two-pass approach: first collect all run boundaries on this line, then emit + // each blank with its row position (rowIndex / rowTotal) so the AI can + // reliably identify first / middle / last blanks on multi-blank rows. 
+ { + // Pass 1: collect run boundaries + const runs: Array<{ startIdx: number; endIdx: number }> = []; + let runStart = -1; + let lastUnderscoreIdx = -1; - const jpegBuffer = canvas.toBuffer('image/jpeg' as never, 85); + for (let i = 0; i < lineItems.length; i++) { + if (!isPureUnderscoreItem(lineItems[i])) continue; - pages.push({ - page: pageNum, - width: page.getViewport({ scale: 1.0 }).width, - height: page.getViewport({ scale: 1.0 }).height, - base64: jpegBuffer.toString('base64'), - }); + if (runStart < 0) { + runStart = i; + lastUnderscoreIdx = i; + } else { + const prev = lineItems[lastUnderscoreIdx]; + const gap = lineItems[i].transform[4] - (prev.transform[4] + prev.width); + if (gap <= MAX_MERGE_GAP_PT) { + lastUnderscoreIdx = i; // extend run + } else { + runs.push({ startIdx: runStart, endIdx: lastUnderscoreIdx }); + runStart = i; + lastUnderscoreIdx = i; + } + } + } + if (runStart >= 0) runs.push({ startIdx: runStart, endIdx: lastUnderscoreIdx }); + + // Pass 2: emit blanks with row position metadata + const rowTotal = runs.length; + for (let r = 0; r < rowTotal; r++) { + const { startIdx, endIdx } = runs[r]; + const first = lineItems[startIdx]; + const last = lineItems[endIdx]; + allBlanks.push({ + page: pageNum, + x: first.transform[4], + y: lineY, + width: (last.transform[4] + last.width) - first.transform[4], + fontSize: Math.abs(first.transform[0]) || 10, + contextBefore: lineContext(lineItems.slice(0, startIdx)).slice(-80), + contextAfter: lineContext(lineItems.slice(endIdx + 1)).slice(0, 80), + contextAbove, + contextBelow, + rowIndex: r + 1, + rowTotal, + }); + } + } + + // ── Strategy 2: embedded underscore runs ───────────────────────────────── + for (let i = 0; i < lineItems.length; i++) { + const item = lineItems[i]; + if (isPureUnderscoreItem(item)) continue; + + const match = item.str.match(/_{5,}/); + if (!match || match.index === undefined) continue; + + const runIdx = match.index; + const runLen = match[0].length; + const totalLen = 
item.str.length; + const xEst = item.transform[4] + (runIdx / totalLen) * item.width; + const widthEst = Math.max((runLen / totalLen) * item.width, 30); + + const prefixLabel = item.str.slice(0, runIdx).trim(); + const suffixLabel = item.str.slice(runIdx + runLen).trim(); + + allBlanks.push({ + page: pageNum, + x: xEst, + y: lineY, + width: widthEst, + fontSize: Math.abs(item.transform[0]) || 10, + contextBefore: [...lineContext(lineItems.slice(0, i)).split(' '), prefixLabel] + .filter(Boolean).join(' ').trim().slice(-80), + contextAfter: [suffixLabel, lineContext(lineItems.slice(i + 1))] + .filter(Boolean).join(' ').trim().slice(0, 80), + contextAbove, + contextBelow, + }); + } + + // ── Strategy 3: single-item bracket blanks ─────────────────────────────── + // Estimate x/width from the bracket span within the item string, not item.width, + // so a footer like "Seller's Initials [ ] Date ___" gets the correct narrow width. + for (let i = 0; i < lineItems.length; i++) { + const item = lineItems[i]; + const match = item.str.match(/\[(\s{3,})\]/); + if (!match || match.index === undefined) continue; + + const bracketStart = match.index + 1; // char index of first space inside [ + const bracketEnd = match.index + match[0].length - 1; // char index of ] + const totalLen = item.str.length; + const xEst = item.transform[4] + (bracketStart / totalLen) * item.width; + const widthEst = Math.max(((bracketEnd - bracketStart) / totalLen) * item.width, 20); + + const prefixLabel = item.str.slice(0, match.index).trim(); + const suffixLabel = item.str.slice(match.index + match[0].length).trim(); + + allBlanks.push({ + page: pageNum, + x: xEst, + y: lineY, + width: widthEst, + fontSize: Math.abs(item.transform[0]) || 10, + contextBefore: [...lineContext(lineItems.slice(0, i)).split(' '), prefixLabel] + .filter(Boolean).join(' ').trim().slice(-80), + contextAfter: [suffixLabel, lineContext(lineItems.slice(i + 1))] + .filter(Boolean).join(' ').trim().slice(0, 80), + contextAbove, + 
contextBelow, + }); + } + + // ── Strategy 4: multi-item bracket blanks ──────────────────────────────── + // "Seller's Initials [" … whitespace items … "]" + { + let openIdx = -1; + + for (let i = 0; i < lineItems.length; i++) { + const item = lineItems[i]; + + if (openIdx < 0) { + if (item.str.trimEnd().endsWith('[')) openIdx = i; + } else { + const isWhitespace = item.str.trim().length === 0; + const closesHere = item.str.trimStart().startsWith(']'); + + if (closesHere) { + const openItem = lineItems[openIdx]; + const closeItem = lineItems[i]; + const blankX = openItem.transform[4] + openItem.width; + const blankWidth = closeItem.transform[4] - blankX; + + if (blankWidth > 5) { + const prefixBracket = openItem.str.replace(/\[$/, '').trim(); + const suffixBracket = closeItem.str.replace(/^\]/, '').trim(); + allBlanks.push({ + page: pageNum, + x: blankX, + y: lineY, + width: blankWidth, + fontSize: Math.abs(openItem.transform[0]) || 10, + contextBefore: [...lineContext(lineItems.slice(0, openIdx)).split(' '), prefixBracket] + .filter(Boolean).join(' ').trim().slice(-80), + contextAfter: [suffixBracket, lineContext(lineItems.slice(i + 1))] + .filter(Boolean).join(' ').trim().slice(0, 80), + contextAbove, + contextBelow, + }); + } + openIdx = -1; + } else if (!isWhitespace) { + openIdx = -1; + } + } + } + } + } } - return pages; + // Deduplicate: Strategy 3 (single-item bracket) and Strategy 4 (multi-item bracket) + // can both fire on the same blank. Use x-range overlap to catch them regardless of + // how far apart their estimated x positions are. + const deduped: BlankField[] = []; + for (const b of allBlanks) { + const isDupe = deduped.some( + d => + d.page === b.page && + Math.abs(d.y - b.y) < 4 && + d.x < b.x + b.width && // ranges overlap + b.x < d.x + d.width, + ); + if (!isDupe) deduped.push(b); + } + return deduped; }