fix(docker): polyfill DOMMatrix/ImageData/Path2D for pdfjs-dist in linux/amd64 container via NODE_OPTIONS --require

2026-04-03 18:02:39 -06:00
parent f15e538f5c
commit 2d2a43a3c9
5 changed files with 322 additions and 41 deletions
--- a/teressa-copeland-homes/src/instrumentation.ts
+++ b/teressa-copeland-homes/src/instrumentation.ts
@@ -0,0 +1,23 @@
+/**
+ * Next.js instrumentation hook — `register` runs once before any route handler.
+ * Installs empty stand-ins for browser globals pdfjs-dist needs at module load time.
+ * @napi-rs/canvas normally provides these but may be absent in some server
+ * environments (e.g. linux/amd64 Docker image on ARM build host).
+ * Text extraction never calls them, so the classes only need to be defined.
+ */
+export async function register() {
+  // Stubs are installed only when NEXT_RUNTIME is 'nodejs'.
+  if (process.env.NEXT_RUNTIME !== 'nodejs') return;
+
+  // Browser globals that pdfjs-dist expects to be defined.
+  const stubNames = ['DOMMatrix', 'ImageData', 'Path2D'];
+
+  for (const stubName of stubNames) {
+    // Keep any implementation that is already installed.
+    if (typeof Reflect.get(globalThis, stubName) !== 'undefined') continue;
+
+    // The computed key names the empty class after the global it stands in for;
+    // Object.assign performs the same property write as `globalThis.X = …`.
+    Object.assign(globalThis, { [stubName]: class {} });
+  }
+}
--- a/teressa-copeland-homes/src/lib/ai/extract-text.ts
+++ b/teressa-copeland-homes/src/lib/ai/extract-text.ts
@@ -1,66 +1,310 @@
 // server-only — never import from client components
-// This module uses pdfjs-dist legacy build in Node.js fake-worker mode (no browser worker).
-// The client components (PdfViewer.tsx, PreviewModal.tsx) set workerSrc independently.
+// Extracts blank field positions from the PDF text layer by finding underscore runs and
+// bracketed gaps. No image rendering needed — coordinates come from pdfjs in PDF user-space.

 // @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
 import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
-import { createCanvas } from '@napi-rs/canvas';
 import { readFile } from 'node:fs/promises';
 import { join } from 'node:path';

-// pdfjs-dist 5.x fake-worker mode: must point workerSrc to the actual worker file so
-// _setupFakeWorkerGlobal can `await import(this.workerSrc)` in Node.js.
-// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
 GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;

-/** A rendered page image, ready to send to GPT-4o vision. */
-export interface PageImage {
-  page: number;       // 1-indexed
-  width: number;      // original PDF width in points (scale 1.0)
-  height: number;     // original PDF height in points (scale 1.0)
-  base64: string;     // JPEG base64 of the rendered page (no data: prefix)
+/**
+ * A blank field detected from the PDF text layer.
+ * Coordinates are in PDF user-space (bottom-left origin, points).
+ */
+export interface BlankField {
+  page: number;           // 1-indexed page number
+  x: number;              // x where the blank starts
+  y: number;              // y of the text line the blank sits on
+  width: number;          // horizontal extent of the blank
+  fontSize: number;       // |transform[0]| of the source text item; 10 when that is 0
+  contextBefore: string;  // label text before the blank on its line (last 80 chars)
+  contextAfter: string;   // label text after the blank on its line (first 80 chars)
+  contextAbove: string;   // nearest line above that has label text (first 100 chars)
+  contextBelow: string;   // nearest line below that has label text (first 100 chars)
+  /** 1-indexed position of this blank among all pure-underscore blanks on its line (Strategy 1 only). */
+  rowIndex?: number;
+  /** Total pure-underscore blanks on this line (Strategy 1 only). */
+  rowTotal?: number;
 }

-// Legacy type alias kept for callers that still reference PageText
-export type PageText = PageImage;
+const MIN_UNDERSCORES = 2;  // fewest '_' characters a text item needs to count as a pure-underscore blank

-const RENDER_SCALE = 1.5;  // 72dpi × 1.5 = 108dpi — good for vision without huge payloads
+// Two consecutive underscore items are considered the SAME blank when their gap is ≤ this.
+// Font-split items (same blank rendered as multiple runs) have gap ~0-2pt.
+// Column separators in signature blocks are typically 5-30pt — kept separate.
+const MAX_MERGE_GAP_PT = 3;

-export async function extractPdfText(filePath: string): Promise<PageImage[]> {
+interface RawTextItem {  // the fields of a pdfjs text-content item that this module reads
+  str: string;          // text of the item
+  transform: number[];  // this file reads [0] as font size, [4] as x and [5] as y
+  width: number;        // item width; x + width is used as the item's right edge
+}
+
+function isPureUnderscoreItem(item: RawTextItem): boolean {
+  // Pure blank: at least MIN_UNDERSCORES underscores, at most one char that is not '_' or ' '.
+  const underscoreCount = item.str.match(/_/g)?.length ?? 0;
+  return underscoreCount >= MIN_UNDERSCORES && item.str.replace(/[_ ]/g, '').length <= 1;
+}
+
+function groupIntoLines(items: RawTextItem[]): Array<{ y: number; items: RawTextItem[] }> {
+  // Visit items top-to-bottom: with a bottom-left origin, a larger transform[5] is higher up.
+  const topDown = items.slice().sort((a, b) => b.transform[5] - a.transform[5]);
+  const lines: Array<{ y: number; items: RawTextItem[] }> = [];
+  let current: { y: number; items: RawTextItem[] } | undefined;
+  for (const entry of topDown) {
+    const baseline = entry.transform[5];
+    // Open a new line unless this item sits within 5pt of the current line's first item.
+    if (current === undefined || !(Math.abs(current.y - baseline) <= 5)) {
+      current = { y: baseline, items: [] };
+      lines.push(current);
+    }
+    current.items.push(entry);
+  }
+  // Inside each line, order the items left-to-right by transform[4].
+  for (const line of lines) line.items.sort((a, b) => a.transform[4] - b.transform[4]);
+  return lines;
+}
+
+/**
+ * Label text of a line as one string; brackets stripped, underscore-only items dropped.
+ */
+function lineContext(items: RawTextItem[]): string {
+  const labels: string[] = [];
+  for (const { str } of items) {
+    const cleaned = str.replace(/[\[\]]/g, '').trim();                  // remove [ and ] chars
+    if (cleaned.replace(/[_ ]/g, '').length > 0) labels.push(cleaned);  // keep real label text
+  }
+  return labels.join(' ').trim();
+}
+
+/**
+ * Extract all blank fields from a PDF's text layer using four detection strategies.
+ *
+ * Strategy 1 — pure underscore runs with geometric merging:
+ *   Consecutive underscore items with a gap ≤ MAX_MERGE_GAP_PT are one blank.
+ *   Large gaps (column spacing) keep blanks separate.
+ * Strategy 2 — embedded underscore runs:
+ *   Items like "Date ___________" containing 5+ underscores mixed with label text.
+ *   Position estimated by character ratio; only the first run in an item is reported.
+ * Strategy 3 — single-item bracket blanks:
+ *   "[     ]" (3+ internal spaces) in one text item; only the first match per item.
+ *   Width estimated from the bracket span within the item, not the full item width.
+ * Strategy 4 — multi-item bracket blanks:
+ *   "Seller's Initials [" … spaces … "]" split across consecutive items.
+ *   Width measured as the gap between the bracket items.
+ *
+ * @param filePath Path of the PDF file to read.
+ * @returns Detected blanks, de-duplicated by overlapping x-range on the same page and line.
+ */
+export async function extractBlanks(filePath: string): Promise<BlankField[]> {
  const data = new Uint8Array(await readFile(filePath));
  const pdf = await getDocument({ data }).promise;
-  const pages: PageImage[] = [];
+  const allBlanks: BlankField[] = [];

  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const page = await pdf.getPage(pageNum);
-    const viewport = page.getViewport({ scale: RENDER_SCALE });
+    const textContent = await page.getTextContent();
+    const items = (textContent.items as RawTextItem[]).filter(i => i.str.length > 0);  // ignore empty-string items
+    const lines = groupIntoLines(items);  // top-to-bottom lines, each sorted left-to-right

-    // Create an @napi-rs/canvas and render the PDF page into it
-    const canvas = createCanvas(Math.round(viewport.width), Math.round(viewport.height));
-    const ctx = canvas.getContext('2d');
+    for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
+      const { y: lineY, items: lineItems } = lines[lineIdx];

-    // @ts-ignore — @napi-rs/canvas context is compatible at runtime but types diverge
-    await page.render({ canvasContext: ctx, viewport }).promise;
+      // Skip lines that are purely underscores (blank rows) so that stacked signature
+      // rows still find the "(Seller's Signature) (Address/Phone) (Date)" label line.
+      const contextAbove = (() => {
+        for (let li = lineIdx - 1; li >= 0; li--) {
+          const ctx = lineContext(lines[li].items);
+          if (ctx) return ctx.slice(0, 100);
+        }
+        return '';
+      })();
+      const contextBelow = (() => {
+        for (let li = lineIdx + 1; li < lines.length; li++) {
+          const ctx = lineContext(lines[li].items);
+          if (ctx) return ctx.slice(0, 100);
+        }
+        return '';
+      })();

-    // Stamp a visible page number in the top-left corner so GPT-4o can correlate
-    // each image to the correct page number when multiple images are sent in one prompt.
-    const label = `PAGE ${pageNum}`;
-    const fontSize = Math.round(viewport.height * 0.025);
-    ctx.fillStyle = 'rgba(220,30,30,0.85)';
-    ctx.fillRect(0, 0, fontSize * (label.length * 0.65), fontSize * 1.5);
-    ctx.fillStyle = '#ffffff';
-    ctx.font = `bold ${fontSize}px sans-serif`;
-    ctx.fillText(label, 4, fontSize * 1.15);
+      // ── Strategy 1: pure underscore runs (geometric merge) ───────────────────
+      // Two-pass approach: first collect all run boundaries on this line, then emit
+      // each blank with its row position (rowIndex / rowTotal) so the AI can
+      // reliably identify first / middle / last blanks on multi-blank rows.
+      {
+        // Pass 1: collect run boundaries
+        const runs: Array<{ startIdx: number; endIdx: number }> = [];
+        let runStart = -1;
+        let lastUnderscoreIdx = -1;

-    const jpegBuffer = canvas.toBuffer('image/jpeg' as never, 85);
+        for (let i = 0; i < lineItems.length; i++) {
+          if (!isPureUnderscoreItem(lineItems[i])) continue;

-    pages.push({
-      page: pageNum,
-      width: page.getViewport({ scale: 1.0 }).width,
-      height: page.getViewport({ scale: 1.0 }).height,
-      base64: jpegBuffer.toString('base64'),
-    });
+          if (runStart < 0) {
+            runStart = i;
+            lastUnderscoreIdx = i;
+          } else {
+            const prev = lineItems[lastUnderscoreIdx];
+            const gap  = lineItems[i].transform[4] - (prev.transform[4] + prev.width);
+            if (gap <= MAX_MERGE_GAP_PT) {
+              lastUnderscoreIdx = i;  // extend run
+            } else {
+              runs.push({ startIdx: runStart, endIdx: lastUnderscoreIdx });
+              runStart = i;
+              lastUnderscoreIdx = i;
+            }
+          }
+        }
+        if (runStart >= 0) runs.push({ startIdx: runStart, endIdx: lastUnderscoreIdx });
+
+        // Pass 2: emit blanks with row position metadata
+        const rowTotal = runs.length;
+        for (let r = 0; r < rowTotal; r++) {
+          const { startIdx, endIdx } = runs[r];
+          const first = lineItems[startIdx];
+          const last  = lineItems[endIdx];
+          allBlanks.push({
+            page: pageNum,
+            x: first.transform[4],
+            y: lineY,
+            width: (last.transform[4] + last.width) - first.transform[4],
+            fontSize: Math.abs(first.transform[0]) || 10,  // 10 when transform[0] is 0
+            contextBefore: lineContext(lineItems.slice(0, startIdx)).slice(-80),
+            contextAfter:  lineContext(lineItems.slice(endIdx + 1)).slice(0, 80),
+            contextAbove,
+            contextBelow,
+            rowIndex: r + 1,
+            rowTotal,
+          });
+        }
+      }
+
+      // ── Strategy 2: embedded underscore runs ─────────────────────────────────
+      for (let i = 0; i < lineItems.length; i++) {
+        const item = lineItems[i];
+        if (isPureUnderscoreItem(item)) continue;
+
+        const match = item.str.match(/_{5,}/);  // first run of 5+ underscores only
+        if (!match || match.index === undefined) continue;
+
+        const runIdx   = match.index;
+        const runLen   = match[0].length;
+        const totalLen = item.str.length;
+        const xEst     = item.transform[4] + (runIdx / totalLen) * item.width;
+        const widthEst = Math.max((runLen / totalLen) * item.width, 30);  // at least 30pt wide
+
+        const prefixLabel = item.str.slice(0, runIdx).trim();
+        const suffixLabel = item.str.slice(runIdx + runLen).trim();
+
+        allBlanks.push({
+          page: pageNum,
+          x: xEst,
+          y: lineY,
+          width: widthEst,
+          fontSize: Math.abs(item.transform[0]) || 10,
+          contextBefore: [...lineContext(lineItems.slice(0, i)).split(' '), prefixLabel]
+            .filter(Boolean).join(' ').trim().slice(-80),
+          contextAfter: [suffixLabel, lineContext(lineItems.slice(i + 1))]
+            .filter(Boolean).join(' ').trim().slice(0, 80),
+          contextAbove,
+          contextBelow,
+        });
+      }
+
+      // ── Strategy 3: single-item bracket blanks ───────────────────────────────
+      // Estimate x/width from the bracket span within the item string, not item.width,
+      // so a footer like "Seller's Initials [     ] Date ___" gets the correct narrow width.
+      for (let i = 0; i < lineItems.length; i++) {
+        const item = lineItems[i];
+        const match = item.str.match(/\[(\s{3,})\]/);  // "[" + 3+ whitespace + "]", first match only
+        if (!match || match.index === undefined) continue;
+
+        const bracketStart = match.index + 1;  // char index of first space inside [
+        const bracketEnd   = match.index + match[0].length - 1;  // char index of ]
+        const totalLen     = item.str.length;
+        const xEst         = item.transform[4] + (bracketStart / totalLen) * item.width;
+        const widthEst     = Math.max(((bracketEnd - bracketStart) / totalLen) * item.width, 20);  // at least 20pt wide
+
+        const prefixLabel = item.str.slice(0, match.index).trim();
+        const suffixLabel = item.str.slice(match.index + match[0].length).trim();
+
+        allBlanks.push({
+          page: pageNum,
+          x: xEst,
+          y: lineY,
+          width: widthEst,
+          fontSize: Math.abs(item.transform[0]) || 10,
+          contextBefore: [...lineContext(lineItems.slice(0, i)).split(' '), prefixLabel]
+            .filter(Boolean).join(' ').trim().slice(-80),
+          contextAfter: [suffixLabel, lineContext(lineItems.slice(i + 1))]
+            .filter(Boolean).join(' ').trim().slice(0, 80),
+          contextAbove,
+          contextBelow,
+        });
+      }
+
+      // ── Strategy 4: multi-item bracket blanks ────────────────────────────────
+      // "Seller's Initials [" … whitespace items … "]"
+      {
+        let openIdx = -1;
+
+        for (let i = 0; i < lineItems.length; i++) {
+          const item = lineItems[i];
+
+          if (openIdx < 0) {
+            if (item.str.trimEnd().endsWith('[')) openIdx = i;  // remember the item that ends with "["
+          } else {
+            const isWhitespace = item.str.trim().length === 0;
+            const closesHere   = item.str.trimStart().startsWith(']');
+
+            if (closesHere) {
+              const openItem  = lineItems[openIdx];
+              const closeItem = lineItems[i];
+              const blankX     = openItem.transform[4] + openItem.width;
+              const blankWidth = closeItem.transform[4] - blankX;
+
+              if (blankWidth > 5) {  // skip gaps of 5pt or less
+                const prefixBracket = openItem.str.replace(/\[$/, '').trim();
+                const suffixBracket = closeItem.str.replace(/^\]/, '').trim();
+                allBlanks.push({
+                  page: pageNum,
+                  x: blankX,
+                  y: lineY,
+                  width: blankWidth,
+                  fontSize: Math.abs(openItem.transform[0]) || 10,
+                  contextBefore: [...lineContext(lineItems.slice(0, openIdx)).split(' '), prefixBracket]
+                    .filter(Boolean).join(' ').trim().slice(-80),
+                  contextAfter: [suffixBracket, lineContext(lineItems.slice(i + 1))]
+                    .filter(Boolean).join(' ').trim().slice(0, 80),
+                  contextAbove,
+                  contextBelow,
+                });
+              }
+              openIdx = -1;
+            } else if (!isWhitespace) {  // real text before "]" cancels the open bracket
+              openIdx = -1;
+            }
+          }
+        }
+      }
+    }
  }

-  return pages;
+  // Deduplicate: Strategy 3 (single-item bracket) and Strategy 4 (multi-item bracket)
+  // can both fire on the same blank. Any blank whose x-range overlaps an already-kept
+  // blank on the same page within 4pt of its y is dropped, whichever strategy found it.
+  const deduped: BlankField[] = [];
+  for (const b of allBlanks) {
+    const isDupe = deduped.some(
+      d =>
+        d.page === b.page &&
+        Math.abs(d.y - b.y) < 4 &&
+        d.x < b.x + b.width &&   // ranges overlap
+        b.x < d.x + d.width,
+    );
+    if (!isDupe) deduped.push(b);
+  }
+  return deduped;
 }