fix(13): switch to GPT-4o vision — render PDF pages as images for accurate field placement

- extractPdfText now renders each page to JPEG via @napi-rs/canvas + pdfjs-dist (108dpi)
- field-placement.ts sends rendered page images to GPT-4o with vision (detail: high)
- AI can now visually identify underlines, signature blocks, date fields, initials boxes
- System prompt focuses on visual cues (blank lines, boxes) not text pattern matching
- Handles multi-field lines: separate fields for signature blank and date blank on same line

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-21 17:40:47 -06:00
parent b5216a8542
commit e7bf5abb9f
4 changed files with 75 additions and 86 deletions
--- a/teressa-copeland-homes/package-lock.json
+++ b/teressa-copeland-homes/package-lock.json
@@ -11,6 +11,7 @@
        "@cantoo/pdf-lib": "^2.6.3",
        "@dnd-kit/core": "^6.3.1",
        "@dnd-kit/utilities": "^3.2.2",
        "@napi-rs/canvas": "^0.1.97",
        "@react-email/components": "^1.0.10",
        "@react-email/render": "^2.0.4",
        "@vercel/blob": "^2.3.1",
@@ -3307,7 +3308,6 @@
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz",
      "integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==",
      "license": "MIT",
      "optional": true,
      "workspaces": [
        "e2e/*"
      ],
--- a/teressa-copeland-homes/package.json
+++ b/teressa-copeland-homes/package.json
@@ -18,6 +18,7 @@
    "@cantoo/pdf-lib": "^2.6.3",
    "@dnd-kit/core": "^6.3.1",
    "@dnd-kit/utilities": "^3.2.2",
    "@napi-rs/canvas": "^0.1.97",
    "@react-email/components": "^1.0.10",
    "@react-email/render": "^2.0.4",
    "@vercel/blob": "^2.3.1",
--- a/teressa-copeland-homes/src/lib/ai/extract-text.ts
+++ b/teressa-copeland-homes/src/lib/ai/extract-text.ts
@@ -4,6 +4,7 @@
 // @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
 import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
 import { createCanvas } from '@napi-rs/canvas';
 import { readFile } from 'node:fs/promises';
 import { join } from 'node:path';
@@ -12,73 +13,43 @@ import { join } from 'node:path';
 // Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
 GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
-/** A single line of text, grouped by approximate Y position. */
+/** A rendered page image, ready to send to GPT-4o vision. */
-export interface TextLine {
+export interface PageImage {
  yPct: number;    // % from page TOP (0 = top, 100 = bottom)
  xPct: number;    // % from page LEFT of the first item on this line
  text: string;    // all items on this line joined
 }
 /** Per-page structured data for AI consumption. */
 export interface PageText {
  page: number;       // 1-indexed
-  width: number;   // page width in PDF points
+  width: number;      // original PDF width in points (scale 1.0)
-  height: number;  // page height in PDF points
+  height: number;     // original PDF height in points (scale 1.0)
-  lines: TextLine[];  // text grouped into lines, sorted top-to-bottom
+  base64: string;     // JPEG base64 of the rendered page (no data: prefix)
 }
-export async function extractPdfText(filePath: string): Promise<PageText[]> {
+// Legacy type alias kept for callers that still reference PageText
 export type PageText = PageImage;
 const RENDER_SCALE = 1.5;  // 72dpi × 1.5 = 108dpi — good for vision without huge payloads
 export async function extractPdfText(filePath: string): Promise<PageImage[]> {
  const data = new Uint8Array(await readFile(filePath));
  const pdf = await getDocument({ data }).promise;
-  const pages: PageText[] = [];
+  const pages: PageImage[] = [];
  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const page = await pdf.getPage(pageNum);
-    const viewport = page.getViewport({ scale: 1.0 });
+    const viewport = page.getViewport({ scale: RENDER_SCALE });
    const textContent = await page.getTextContent();
-    const W = viewport.width;
+    // Create an @napi-rs/canvas and render the PDF page into it
-    const H = viewport.height;
+    const canvas = createCanvas(Math.round(viewport.width), Math.round(viewport.height));
    const ctx = canvas.getContext('2d');
-    // Collect raw items with positions
+    // @ts-ignore — @napi-rs/canvas context is compatible at runtime but types diverge
-    const rawItems: { text: string; x: number; yFromTop: number }[] = [];
+    await page.render({ canvasContext: ctx, viewport }).promise;
    for (const item of textContent.items) {
      if (typeof item !== 'object' || item === null || !('str' in item)) continue;
      const i = item as { str: string; transform: number[] };
      if (!i.str.trim()) continue;
      const x = i.transform[4];
      const yFromTop = H - i.transform[5];  // PDF y is from bottom; flip to screen coords
      rawItems.push({ text: i.str, x, yFromTop });
    }
-    // Group items into lines by rounding yFromTop to nearest 4pt bucket
+    const jpegBuffer = canvas.toBuffer('image/jpeg' as never, 85);
    const lineMap = new Map<number, { items: typeof rawItems; minX: number }>();
    for (const item of rawItems) {
      const bucket = Math.round(item.yFromTop / 4) * 4;
      const existing = lineMap.get(bucket);
      if (existing) {
        existing.items.push(item);
        existing.minX = Math.min(existing.minX, item.x);
      } else {
        lineMap.set(bucket, { items: [item], minX: item.x });
      }
    }
-    // Sort lines top-to-bottom, join items left-to-right
+    pages.push({
-    const lines: TextLine[] = Array.from(lineMap.entries())
+      page: pageNum,
-      .sort(([a], [b]) => a - b)
+      width: page.getViewport({ scale: 1.0 }).width,
-      .map(([yBucket, { items, minX }]) => {
+      height: page.getViewport({ scale: 1.0 }).height,
-        const sorted = items.sort((a, b) => a.x - b.x);
+      base64: jpegBuffer.toString('base64'),
-        return {
+    });
          yPct: Math.round((yBucket / H) * 1000) / 10,  // 1 decimal place
          xPct: Math.round((minX / W) * 1000) / 10,
          text: sorted.map((i) => i.text).join(' '),
        };
      })
      // Cap at 120 lines per page to stay within context limits
      .slice(0, 120);
    pages.push({ page: pageNum, width: W, height: H, lines });
  }
  return pages;
--- a/teressa-copeland-homes/src/lib/ai/field-placement.ts
+++ b/teressa-copeland-homes/src/lib/ai/field-placement.ts
@@ -99,47 +99,64 @@ export async function classifyFieldsWithAI(
  const clientName = client?.name ?? 'Unknown';
  const propertyAddress = client?.propertyAddress ?? 'Unknown';
-  // Build structured page summary — each line includes yPct/xPct so the AI has spatial context
+  // Build vision messages — one image_url block per page
-  const pagesSummary = pageTexts.map((p) => {
+  type ContentBlock =
-    const linesSummary = p.lines
+    | { type: 'text'; text: string }
-      .map((l) => `  y=${l.yPct}% x=${l.xPct}%: ${l.text}`)
+    | { type: 'image_url'; image_url: { url: string; detail: 'high' } };
-      .join('\n');
+
-    return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`;
+  const imageBlocks: ContentBlock[] = pageTexts.map((p) => ({
-  }).join('\n\n');
+    type: 'image_url',
    image_url: {
      url: `data:image/jpeg;base64,${p.base64}`,
      detail: 'high',
    },
  }));
  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [
      {
        role: 'system',
-        content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left).
+        content: `You are a real estate document form field extractor. You will receive images of PDF pages. Your job is to identify every location that needs to be filled in.
-Your job: identify every location that requires a FIELD to be filled in.
+WHAT TO PLACE FIELDS ON:
 - Blank underlines: ____________
 - Labeled blanks: "Name: ______", "Address: ______", "Price: $______"
 - Signature lines with labels like "(Seller's Signature)", "(Buyer's Signature)", "(Agent)"
 - Date lines labeled "(Date)" or with a date underline
 - Initials boxes: "[ ]" or "_____ Initials" or small boxes at page bottoms/margins
-FIELD PLACEMENT RULES:
+WHAT NOT TO PLACE FIELDS ON:
-1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes.
+- Paragraph body text, instructions, legal boilerplate
-2. NEVER place fields inside paragraph body text, headings, or descriptive content.
+- Headings and section titles
-3. Look for these patterns as indicators of form fields:
+
-   - Lines of underscores: "_______" or "___________"
+FIELD TYPES:
-   - Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___"
+- "client-signature" → buyer or seller/client signature lines
-   - Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer"
+- "agent-signature" → agent or listing agent signature lines
-   - Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials"
+- "initials" → buyer/seller initials boxes
-   - Date lines: "(Date)", "Date ___", "___ / ___ / ___"
+- "agent-initials" → agent initials boxes
-4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value.
+- "date" → any date field
-5. Match field types:
+- "text" → all other blanks (names, addresses, prices, terms, etc.)
-   - "client-signature" → buyer/client signature lines
+
-   - "agent-signature" → agent/listing agent signature lines
+POSITIONING:
-   - "initials" → initials boxes or short initial blanks
+- xPct and yPct are percentages from the TOP-LEFT of that specific page image
-   - "agent-initials" → agent-specific initials
+- Place the field AT the blank line, not above or below it
-   - "date" → date fields
+- For a line like "Buyer's Signature __________ Date _______", place a client-signature at the signature blank's x/y and a date field at the date blank's x/y — they are separate fields on the same line
-   - "text" → any other fill-in-the-blank (names, addresses, prices, etc.)
+- Do NOT place checkbox fields
-6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct.
+
-7. Do NOT place checkbox fields.
+PREFILL:
-8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`,
+- For text fields: if the blank is clearly for client name ("${clientName}") or property address ("${propertyAddress}"), set prefillValue to that value
 - All other fields: prefillValue = ""`,
      },
      {
        role: 'user',
-        content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`,
+        content: [
          {
            type: 'text',
            text: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nAnalyze every page below. Return ALL blanks and form fields you can see — one field per blank line/box. Pages are in order starting from page 1.`,
          },
          ...imageBlocks,
        ] as ContentBlock[],
      },
    ],
    response_format: {