fix(13): switch to GPT-4o vision — render PDF pages as images for accurate field placement

- extractPdfText now renders each page to JPEG via @napi-rs/canvas + pdfjs-dist (108dpi)
- field-placement.ts sends rendered page images to GPT-4o with vision (detail: high)
- AI can now visually identify underlines, signature blocks, date fields, initials boxes
- System prompt focuses on visual cues (blank lines, boxes) not text pattern matching
- Handles multi-field lines: separate fields for signature blank and date blank on same line

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-21 17:40:47 -06:00
parent b5216a8542
commit e7bf5abb9f
4 changed files with 75 additions and 86 deletions
--- a/teressa-copeland-homes/package-lock.json
+++ b/teressa-copeland-homes/package-lock.json
@@ -11,6 +11,7 @@
        "@cantoo/pdf-lib": "^2.6.3",
        "@dnd-kit/core": "^6.3.1",
        "@dnd-kit/utilities": "^3.2.2",
+        "@napi-rs/canvas": "^0.1.97",
        "@react-email/components": "^1.0.10",
        "@react-email/render": "^2.0.4",
        "@vercel/blob": "^2.3.1",
@@ -3307,7 +3308,6 @@
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz",
      "integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==",
      "license": "MIT",
-      "optional": true,
      "workspaces": [
        "e2e/*"
      ],
--- a/teressa-copeland-homes/package.json
+++ b/teressa-copeland-homes/package.json
@@ -18,6 +18,7 @@
    "@cantoo/pdf-lib": "^2.6.3",
    "@dnd-kit/core": "^6.3.1",
    "@dnd-kit/utilities": "^3.2.2",
+    "@napi-rs/canvas": "^0.1.97",
    "@react-email/components": "^1.0.10",
    "@react-email/render": "^2.0.4",
    "@vercel/blob": "^2.3.1",
--- a/teressa-copeland-homes/src/lib/ai/extract-text.ts
+++ b/teressa-copeland-homes/src/lib/ai/extract-text.ts
@@ -4,6 +4,7 @@

 // @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
 import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
+import { createCanvas } from '@napi-rs/canvas';
 import { readFile } from 'node:fs/promises';
 import { join } from 'node:path';

@@ -12,73 +13,43 @@ import { join } from 'node:path';
 // Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
 GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;

-/** A single line of text, grouped by approximate Y position. */
-export interface TextLine {
-  yPct: number;    // % from page TOP (0 = top, 100 = bottom)
-  xPct: number;    // % from page LEFT of the first item on this line
-  text: string;    // all items on this line joined
+/** A rendered page image, ready to send to GPT-4o vision. */
+export interface PageImage {
+  page: number;       // 1-indexed
+  width: number;      // original PDF width in points (scale 1.0)
+  height: number;     // original PDF height in points (scale 1.0)
+  base64: string;     // JPEG base64 of the rendered page (no data: prefix)
 }

-/** Per-page structured data for AI consumption. */
-export interface PageText {
-  page: number;    // 1-indexed
-  width: number;   // page width in PDF points
-  height: number;  // page height in PDF points
-  lines: TextLine[];  // text grouped into lines, sorted top-to-bottom
-}
+// Legacy type alias kept for callers that still reference PageText
+export type PageText = PageImage;

-export async function extractPdfText(filePath: string): Promise<PageText[]> {
+const RENDER_SCALE = 1.5;  // 72dpi × 1.5 = 108dpi — good for vision without huge payloads
+
+export async function extractPdfText(filePath: string): Promise<PageImage[]> {
  const data = new Uint8Array(await readFile(filePath));
  const pdf = await getDocument({ data }).promise;
-  const pages: PageText[] = [];
+  const pages: PageImage[] = [];

  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const page = await pdf.getPage(pageNum);
-    const viewport = page.getViewport({ scale: 1.0 });
-    const textContent = await page.getTextContent();
+    const viewport = page.getViewport({ scale: RENDER_SCALE });

-    const W = viewport.width;
-    const H = viewport.height;
+    // Create an @napi-rs/canvas and render the PDF page into it
+    const canvas = createCanvas(Math.round(viewport.width), Math.round(viewport.height));
+    const ctx = canvas.getContext('2d');

-    // Collect raw items with positions
-    const rawItems: { text: string; x: number; yFromTop: number }[] = [];
-    for (const item of textContent.items) {
-      if (typeof item !== 'object' || item === null || !('str' in item)) continue;
-      const i = item as { str: string; transform: number[] };
-      if (!i.str.trim()) continue;
-      const x = i.transform[4];
-      const yFromTop = H - i.transform[5];  // PDF y is from bottom; flip to screen coords
-      rawItems.push({ text: i.str, x, yFromTop });
-    }
+    // @ts-ignore — @napi-rs/canvas context is compatible at runtime but types diverge
+    await page.render({ canvasContext: ctx, viewport }).promise;

-    // Group items into lines by rounding yFromTop to nearest 4pt bucket
-    const lineMap = new Map<number, { items: typeof rawItems; minX: number }>();
-    for (const item of rawItems) {
-      const bucket = Math.round(item.yFromTop / 4) * 4;
-      const existing = lineMap.get(bucket);
-      if (existing) {
-        existing.items.push(item);
-        existing.minX = Math.min(existing.minX, item.x);
-      } else {
-        lineMap.set(bucket, { items: [item], minX: item.x });
-      }
-    }
+    const jpegBuffer = canvas.toBuffer('image/jpeg' as never, 85);

-    // Sort lines top-to-bottom, join items left-to-right
-    const lines: TextLine[] = Array.from(lineMap.entries())
-      .sort(([a], [b]) => a - b)
-      .map(([yBucket, { items, minX }]) => {
-        const sorted = items.sort((a, b) => a.x - b.x);
-        return {
-          yPct: Math.round((yBucket / H) * 1000) / 10,  // 1 decimal place
-          xPct: Math.round((minX / W) * 1000) / 10,
-          text: sorted.map((i) => i.text).join(' '),
-        };
-      })
-      // Cap at 120 lines per page to stay within context limits
-      .slice(0, 120);
-
-    pages.push({ page: pageNum, width: W, height: H, lines });
+    pages.push({
+      page: pageNum,
+      width: page.getViewport({ scale: 1.0 }).width,
+      height: page.getViewport({ scale: 1.0 }).height,
+      base64: jpegBuffer.toString('base64'),
+    });
  }

  return pages;
--- a/teressa-copeland-homes/src/lib/ai/field-placement.ts
+++ b/teressa-copeland-homes/src/lib/ai/field-placement.ts
@@ -99,47 +99,64 @@ export async function classifyFieldsWithAI(
  const clientName = client?.name ?? 'Unknown';
  const propertyAddress = client?.propertyAddress ?? 'Unknown';

-  // Build structured page summary — each line includes yPct/xPct so the AI has spatial context
-  const pagesSummary = pageTexts.map((p) => {
-    const linesSummary = p.lines
-      .map((l) => `  y=${l.yPct}% x=${l.xPct}%: ${l.text}`)
-      .join('\n');
-    return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`;
-  }).join('\n\n');
+  // Build vision messages — one image_url block per page
+  type ContentBlock =
+    | { type: 'text'; text: string }
+    | { type: 'image_url'; image_url: { url: string; detail: 'high' } };
+
+  const imageBlocks: ContentBlock[] = pageTexts.map((p) => ({
+    type: 'image_url',
+    image_url: {
+      url: `data:image/jpeg;base64,${p.base64}`,
+      detail: 'high',
+    },
+  }));

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      {
        role: 'system',
-        content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left).
+        content: `You are a real estate document form field extractor. You will receive images of PDF pages. Your job is to identify every location that needs to be filled in.

-Your job: identify every location that requires a FIELD to be filled in.
+WHAT TO PLACE FIELDS ON:
+- Blank underlines: ____________
+- Labeled blanks: "Name: ______", "Address: ______", "Price: $______"
+- Signature lines with labels like "(Seller's Signature)", "(Buyer's Signature)", "(Agent)"
+- Date lines labeled "(Date)" or with a date underline
+- Initials boxes: "[ ]" or "_____ Initials" or small boxes at page bottoms/margins

-FIELD PLACEMENT RULES:
-1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes.
-2. NEVER place fields inside paragraph body text, headings, or descriptive content.
-3. Look for these patterns as indicators of form fields:
-   - Lines of underscores: "_______" or "___________"
-   - Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___"
-   - Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer"
-   - Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials"
-   - Date lines: "(Date)", "Date ___", "___ / ___ / ___"
-4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value.
-5. Match field types:
-   - "client-signature" → buyer/client signature lines
-   - "agent-signature" → agent/listing agent signature lines
-   - "initials" → initials boxes or short initial blanks
-   - "agent-initials" → agent-specific initials
-   - "date" → date fields
-   - "text" → any other fill-in-the-blank (names, addresses, prices, etc.)
-6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct.
-7. Do NOT place checkbox fields.
-8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`,
+WHAT NOT TO PLACE FIELDS ON:
+- Paragraph body text, instructions, legal boilerplate
+- Headings and section titles
+
+FIELD TYPES:
+- "client-signature" → buyer or seller/client signature lines
+- "agent-signature" → agent or listing agent signature lines
+- "initials" → buyer/seller initials boxes
+- "agent-initials" → agent initials boxes
+- "date" → any date field
+- "text" → all other blanks (names, addresses, prices, terms, etc.)
+
+POSITIONING:
+- xPct and yPct are percentages from the TOP-LEFT of that specific page image
+- Place the field AT the blank line, not above or below it
+- For a line like "Buyer's Signature __________ Date _______", place a client-signature at the signature blank's x/y and a date field at the date blank's x/y — they are separate fields on the same line
+- Do NOT place checkbox fields
+
+PREFILL:
+- For text fields: if the blank is clearly for client name ("${clientName}") or property address ("${propertyAddress}"), set prefillValue to that value
+- All other fields: prefillValue = ""`,
      },
      {
        role: 'user',
-        content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`,
+        content: [
+          {
+            type: 'text',
+            text: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nAnalyze every page below. Return ALL blanks and form fields you can see — one field per blank line/box. Pages are in order starting from page 1.`,
+          },
+          ...imageBlocks,
+        ] as ContentBlock[],
      },
    ],
    response_format: {