fix(13): switch to GPT-4o vision — render PDF pages as images for accurate field placement

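- extractPdfText now renders each page to a JPEG (quality 85) via @napi-rs/canvas + pdfjs-dist at 1.5× scale (72dpi × 1.5 = 108dpi) and returns PageImage[]; PageText is kept as a type alias for existing callers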
- field-placement.ts sends rendered page images to GPT-4o with vision (detail: high)
- AI can now visually identify underlines, signature blocks, date fields, initials boxes
- System prompt focuses on visual cues (blank lines, boxes) not text pattern matching
- Handles multi-field lines: when a signature blank and a date blank share the same line, the prompt asks for two separate fields, each placed at its own blank's x/y position

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Chandler Copeland
2026-03-21 17:40:47 -06:00
parent b5216a8542
commit e7bf5abb9f
4 changed files with 75 additions and 86 deletions

View File

@@ -11,6 +11,7 @@
"@cantoo/pdf-lib": "^2.6.3", "@cantoo/pdf-lib": "^2.6.3",
"@dnd-kit/core": "^6.3.1", "@dnd-kit/core": "^6.3.1",
"@dnd-kit/utilities": "^3.2.2", "@dnd-kit/utilities": "^3.2.2",
"@napi-rs/canvas": "^0.1.97",
"@react-email/components": "^1.0.10", "@react-email/components": "^1.0.10",
"@react-email/render": "^2.0.4", "@react-email/render": "^2.0.4",
"@vercel/blob": "^2.3.1", "@vercel/blob": "^2.3.1",
@@ -3307,7 +3308,6 @@
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz", "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz",
"integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==", "integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==",
"license": "MIT", "license": "MIT",
"optional": true,
"workspaces": [ "workspaces": [
"e2e/*" "e2e/*"
], ],

View File

@@ -18,6 +18,7 @@
"@cantoo/pdf-lib": "^2.6.3", "@cantoo/pdf-lib": "^2.6.3",
"@dnd-kit/core": "^6.3.1", "@dnd-kit/core": "^6.3.1",
"@dnd-kit/utilities": "^3.2.2", "@dnd-kit/utilities": "^3.2.2",
"@napi-rs/canvas": "^0.1.97",
"@react-email/components": "^1.0.10", "@react-email/components": "^1.0.10",
"@react-email/render": "^2.0.4", "@react-email/render": "^2.0.4",
"@vercel/blob": "^2.3.1", "@vercel/blob": "^2.3.1",

View File

@@ -4,6 +4,7 @@
// @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration // @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs'; import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
import { createCanvas } from '@napi-rs/canvas';
import { readFile } from 'node:fs/promises'; import { readFile } from 'node:fs/promises';
import { join } from 'node:path'; import { join } from 'node:path';
@@ -12,73 +13,43 @@ import { join } from 'node:path';
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs. // Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`; GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
/** A single line of text, grouped by approximate Y position. */ /** A rendered page image, ready to send to GPT-4o vision. */
export interface TextLine { export interface PageImage {
yPct: number; // % from page TOP (0 = top, 100 = bottom)
xPct: number; // % from page LEFT of the first item on this line
text: string; // all items on this line joined
}
/** Per-page structured data for AI consumption. */
export interface PageText {
page: number; // 1-indexed page: number; // 1-indexed
width: number; // page width in PDF points width: number; // original PDF width in points (scale 1.0)
height: number; // page height in PDF points height: number; // original PDF height in points (scale 1.0)
lines: TextLine[]; // text grouped into lines, sorted top-to-bottom base64: string; // JPEG base64 of the rendered page (no data: prefix)
} }
export async function extractPdfText(filePath: string): Promise<PageText[]> { // Legacy type alias kept for callers that still reference PageText
export type PageText = PageImage;
const RENDER_SCALE = 1.5; // 72dpi × 1.5 = 108dpi — good for vision without huge payloads
export async function extractPdfText(filePath: string): Promise<PageImage[]> {
const data = new Uint8Array(await readFile(filePath)); const data = new Uint8Array(await readFile(filePath));
const pdf = await getDocument({ data }).promise; const pdf = await getDocument({ data }).promise;
const pages: PageText[] = []; const pages: PageImage[] = [];
for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) { for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
const page = await pdf.getPage(pageNum); const page = await pdf.getPage(pageNum);
const viewport = page.getViewport({ scale: 1.0 }); const viewport = page.getViewport({ scale: RENDER_SCALE });
const textContent = await page.getTextContent();
const W = viewport.width; // Create an @napi-rs/canvas and render the PDF page into it
const H = viewport.height; const canvas = createCanvas(Math.round(viewport.width), Math.round(viewport.height));
const ctx = canvas.getContext('2d');
// Collect raw items with positions // @ts-ignore — @napi-rs/canvas context is compatible at runtime but types diverge
const rawItems: { text: string; x: number; yFromTop: number }[] = []; await page.render({ canvasContext: ctx, viewport }).promise;
for (const item of textContent.items) {
if (typeof item !== 'object' || item === null || !('str' in item)) continue;
const i = item as { str: string; transform: number[] };
if (!i.str.trim()) continue;
const x = i.transform[4];
const yFromTop = H - i.transform[5]; // PDF y is from bottom; flip to screen coords
rawItems.push({ text: i.str, x, yFromTop });
}
// Group items into lines by rounding yFromTop to nearest 4pt bucket const jpegBuffer = canvas.toBuffer('image/jpeg' as never, 85);
const lineMap = new Map<number, { items: typeof rawItems; minX: number }>();
for (const item of rawItems) {
const bucket = Math.round(item.yFromTop / 4) * 4;
const existing = lineMap.get(bucket);
if (existing) {
existing.items.push(item);
existing.minX = Math.min(existing.minX, item.x);
} else {
lineMap.set(bucket, { items: [item], minX: item.x });
}
}
// Sort lines top-to-bottom, join items left-to-right pages.push({
const lines: TextLine[] = Array.from(lineMap.entries()) page: pageNum,
.sort(([a], [b]) => a - b) width: page.getViewport({ scale: 1.0 }).width,
.map(([yBucket, { items, minX }]) => { height: page.getViewport({ scale: 1.0 }).height,
const sorted = items.sort((a, b) => a.x - b.x); base64: jpegBuffer.toString('base64'),
return { });
yPct: Math.round((yBucket / H) * 1000) / 10, // 1 decimal place
xPct: Math.round((minX / W) * 1000) / 10,
text: sorted.map((i) => i.text).join(' '),
};
})
// Cap at 120 lines per page to stay within context limits
.slice(0, 120);
pages.push({ page: pageNum, width: W, height: H, lines });
} }
return pages; return pages;

View File

@@ -99,47 +99,64 @@ export async function classifyFieldsWithAI(
const clientName = client?.name ?? 'Unknown'; const clientName = client?.name ?? 'Unknown';
const propertyAddress = client?.propertyAddress ?? 'Unknown'; const propertyAddress = client?.propertyAddress ?? 'Unknown';
// Build structured page summary — each line includes yPct/xPct so the AI has spatial context // Build vision messages — one image_url block per page
const pagesSummary = pageTexts.map((p) => { type ContentBlock =
const linesSummary = p.lines | { type: 'text'; text: string }
.map((l) => ` y=${l.yPct}% x=${l.xPct}%: ${l.text}`) | { type: 'image_url'; image_url: { url: string; detail: 'high' } };
.join('\n');
return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`; const imageBlocks: ContentBlock[] = pageTexts.map((p) => ({
}).join('\n\n'); type: 'image_url',
image_url: {
url: `data:image/jpeg;base64,${p.base64}`,
detail: 'high',
},
}));
const response = await openai.chat.completions.create({ const response = await openai.chat.completions.create({
model: 'gpt-4o', model: 'gpt-4o',
messages: [ messages: [
{ {
role: 'system', role: 'system',
content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left). content: `You are a real estate document form field extractor. You will receive images of PDF pages. Your job is to identify every location that needs to be filled in.
Your job: identify every location that requires a FIELD to be filled in. WHAT TO PLACE FIELDS ON:
- Blank underlines: ____________
- Labeled blanks: "Name: ______", "Address: ______", "Price: $______"
- Signature lines with labels like "(Seller's Signature)", "(Buyer's Signature)", "(Agent)"
- Date lines labeled "(Date)" or with a date underline
- Initials boxes: "[ ]" or "_____ Initials" or small boxes at page bottoms/margins
FIELD PLACEMENT RULES: WHAT NOT TO PLACE FIELDS ON:
1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes. - Paragraph body text, instructions, legal boilerplate
2. NEVER place fields inside paragraph body text, headings, or descriptive content. - Headings and section titles
3. Look for these patterns as indicators of form fields:
- Lines of underscores: "_______" or "___________" FIELD TYPES:
- Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___" - "client-signature" → buyer or seller/client signature lines
- Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer" - "agent-signature" → agent or listing agent signature lines
- Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials" - "initials" → buyer/seller initials boxes
- Date lines: "(Date)", "Date ___", "___ / ___ / ___" - "agent-initials" → agent initials boxes
4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value. - "date" → any date field
5. Match field types: - "text" → all other blanks (names, addresses, prices, terms, etc.)
- "client-signature" → buyer/client signature lines
- "agent-signature" → agent/listing agent signature lines POSITIONING:
- "initials" → initials boxes or short initial blanks - xPct and yPct are percentages from the TOP-LEFT of that specific page image
- "agent-initials" → agent-specific initials - Place the field AT the blank line, not above or below it
- "date" → date fields - For a line like "Buyer's Signature __________ Date _______", place a client-signature at the signature blank's x/y and a date field at the date blank's x/y — they are separate fields on the same line
- "text" → any other fill-in-the-blank (names, addresses, prices, etc.) - Do NOT place checkbox fields
6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct.
7. Do NOT place checkbox fields. PREFILL:
8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`, - For text fields: if the blank is clearly for client name ("${clientName}") or property address ("${propertyAddress}"), set prefillValue to that value
- All other fields: prefillValue = ""`,
}, },
{ {
role: 'user', role: 'user',
content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`, content: [
{
type: 'text',
text: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nAnalyze every page below. Return ALL blanks and form fields you can see — one field per blank line/box. Pages are in order starting from page 1.`,
},
...imageBlocks,
] as ContentBlock[],
}, },
], ],
response_format: { response_format: {