fix(13): switch to GPT-4o vision — render PDF pages as images for accurate field placement

- extractPdfText now renders each page to JPEG via @napi-rs/canvas + pdfjs-dist (108dpi)
- field-placement.ts sends rendered page images to GPT-4o with vision (detail: high)
- AI can now visually identify underlines, signature blocks, date fields, initials boxes
- System prompt focuses on visual cues (blank lines, boxes) not text pattern matching
- Handles multi-field lines: separate fields for signature blank and date blank on same line

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Chandler Copeland
2026-03-21 17:40:47 -06:00
parent b5216a8542
commit e7bf5abb9f
4 changed files with 75 additions and 86 deletions

View File

@@ -11,6 +11,7 @@
"@cantoo/pdf-lib": "^2.6.3",
"@dnd-kit/core": "^6.3.1",
"@dnd-kit/utilities": "^3.2.2",
"@napi-rs/canvas": "^0.1.97",
"@react-email/components": "^1.0.10",
"@react-email/render": "^2.0.4",
"@vercel/blob": "^2.3.1",
@@ -3307,7 +3308,6 @@
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz",
"integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==",
"license": "MIT",
"optional": true,
"workspaces": [
"e2e/*"
],

View File

@@ -18,6 +18,7 @@
"@cantoo/pdf-lib": "^2.6.3",
"@dnd-kit/core": "^6.3.1",
"@dnd-kit/utilities": "^3.2.2",
"@napi-rs/canvas": "^0.1.97",
"@react-email/components": "^1.0.10",
"@react-email/render": "^2.0.4",
"@vercel/blob": "^2.3.1",

View File

@@ -4,6 +4,7 @@
// @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
import { createCanvas } from '@napi-rs/canvas';
import { readFile } from 'node:fs/promises';
import { join } from 'node:path';
@@ -12,73 +13,43 @@ import { join } from 'node:path';
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
/**
 * A single line of text, grouped by approximate Y position.
 *
 * Lines are built by bucketing text items into 4pt vertical bands and
 * joining each band's items left-to-right.
 */
export interface TextLine {
yPct: number; // % from page TOP (0 = top, 100 = bottom), rounded to 1 decimal place
xPct: number; // % from page LEFT of the leftmost item on this line, 1 decimal place
text: string; // all items on this line joined left-to-right with single spaces
}
/** Per-page structured data for AI consumption. */
export interface PageText {
/** A rendered page image, ready to send to GPT-4o vision. */
export interface PageImage {
page: number; // 1-indexed
width: number; // page width in PDF points
height: number; // page height in PDF points
lines: TextLine[]; // text grouped into lines, sorted top-to-bottom
width: number; // original PDF width in points (scale 1.0)
height: number; // original PDF height in points (scale 1.0)
base64: string; // JPEG base64 of the rendered page (no data: prefix)
}
export async function extractPdfText(filePath: string): Promise<PageText[]> {
/**
 * @deprecated Legacy alias kept for callers that still reference PageText;
 * new code should use PageImage directly.
 */
export type PageText = PageImage;
// Rasterization scale for page rendering: 72dpi × 1.5 = 108dpi — good for
// vision without huge base64 payloads.
const RENDER_SCALE = 1.5;
export async function extractPdfText(filePath: string): Promise<PageImage[]> {
const data = new Uint8Array(await readFile(filePath));
const pdf = await getDocument({ data }).promise;
const pages: PageText[] = [];
const pages: PageImage[] = [];
for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
const page = await pdf.getPage(pageNum);
const viewport = page.getViewport({ scale: 1.0 });
const textContent = await page.getTextContent();
const viewport = page.getViewport({ scale: RENDER_SCALE });
const W = viewport.width;
const H = viewport.height;
// Create an @napi-rs/canvas and render the PDF page into it
const canvas = createCanvas(Math.round(viewport.width), Math.round(viewport.height));
const ctx = canvas.getContext('2d');
// Collect raw items with positions
const rawItems: { text: string; x: number; yFromTop: number }[] = [];
for (const item of textContent.items) {
if (typeof item !== 'object' || item === null || !('str' in item)) continue;
const i = item as { str: string; transform: number[] };
if (!i.str.trim()) continue;
const x = i.transform[4];
const yFromTop = H - i.transform[5]; // PDF y is from bottom; flip to screen coords
rawItems.push({ text: i.str, x, yFromTop });
}
// @ts-ignore — @napi-rs/canvas context is compatible at runtime but types diverge
await page.render({ canvasContext: ctx, viewport }).promise;
// Group items into lines by rounding yFromTop to nearest 4pt bucket
const lineMap = new Map<number, { items: typeof rawItems; minX: number }>();
for (const item of rawItems) {
const bucket = Math.round(item.yFromTop / 4) * 4;
const existing = lineMap.get(bucket);
if (existing) {
existing.items.push(item);
existing.minX = Math.min(existing.minX, item.x);
} else {
lineMap.set(bucket, { items: [item], minX: item.x });
}
}
const jpegBuffer = canvas.toBuffer('image/jpeg' as never, 85);
// Sort lines top-to-bottom, join items left-to-right
const lines: TextLine[] = Array.from(lineMap.entries())
.sort(([a], [b]) => a - b)
.map(([yBucket, { items, minX }]) => {
const sorted = items.sort((a, b) => a.x - b.x);
return {
yPct: Math.round((yBucket / H) * 1000) / 10, // 1 decimal place
xPct: Math.round((minX / W) * 1000) / 10,
text: sorted.map((i) => i.text).join(' '),
};
})
// Cap at 120 lines per page to stay within context limits
.slice(0, 120);
pages.push({ page: pageNum, width: W, height: H, lines });
pages.push({
page: pageNum,
width: page.getViewport({ scale: 1.0 }).width,
height: page.getViewport({ scale: 1.0 }).height,
base64: jpegBuffer.toString('base64'),
});
}
return pages;

View File

@@ -99,47 +99,64 @@ export async function classifyFieldsWithAI(
const clientName = client?.name ?? 'Unknown';
const propertyAddress = client?.propertyAddress ?? 'Unknown';
// Build structured page summary — each line includes yPct/xPct so the AI has spatial context
const pagesSummary = pageTexts.map((p) => {
const linesSummary = p.lines
.map((l) => ` y=${l.yPct}% x=${l.xPct}%: ${l.text}`)
.join('\n');
return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`;
}).join('\n\n');
// Build vision messages — one image_url block per page
type ContentBlock =
| { type: 'text'; text: string }
| { type: 'image_url'; image_url: { url: string; detail: 'high' } };
const imageBlocks: ContentBlock[] = pageTexts.map((p) => ({
type: 'image_url',
image_url: {
url: `data:image/jpeg;base64,${p.base64}`,
detail: 'high',
},
}));
const response = await openai.chat.completions.create({
model: 'gpt-4o',
messages: [
{
role: 'system',
content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left).
content: `You are a real estate document form field extractor. You will receive images of PDF pages. Your job is to identify every location that needs to be filled in.
Your job: identify every location that requires a FIELD to be filled in.
WHAT TO PLACE FIELDS ON:
- Blank underlines: ____________
- Labeled blanks: "Name: ______", "Address: ______", "Price: $______"
- Signature lines with labels like "(Seller's Signature)", "(Buyer's Signature)", "(Agent)"
- Date lines labeled "(Date)" or with a date underline
- Initials boxes: "[ ]" or "_____ Initials" or small boxes at page bottoms/margins
FIELD PLACEMENT RULES:
1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes.
2. NEVER place fields inside paragraph body text, headings, or descriptive content.
3. Look for these patterns as indicators of form fields:
- Lines of underscores: "_______" or "___________"
- Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___"
- Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer"
- Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials"
- Date lines: "(Date)", "Date ___", "___ / ___ / ___"
4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value.
5. Match field types:
- "client-signature" → buyer/client signature lines
- "agent-signature" → agent/listing agent signature lines
- "initials" → initials boxes or short initial blanks
- "agent-initials" → agent-specific initials
- "date" → date fields
- "text" → any other fill-in-the-blank (names, addresses, prices, etc.)
6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct.
7. Do NOT place checkbox fields.
8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`,
WHAT NOT TO PLACE FIELDS ON:
- Paragraph body text, instructions, legal boilerplate
- Headings and section titles
FIELD TYPES:
- "client-signature" → buyer or seller/client signature lines
- "agent-signature" → agent or listing agent signature lines
- "initials" → buyer/seller initials boxes
- "agent-initials" → agent initials boxes
- "date" → any date field
- "text" → all other blanks (names, addresses, prices, terms, etc.)
POSITIONING:
- xPct and yPct are percentages from the TOP-LEFT of that specific page image
- Place the field AT the blank line, not above or below it
- For a line like "Buyer's Signature __________ Date _______", place a client-signature at the signature blank's x/y and a date field at the date blank's x/y — they are separate fields on the same line
- Do NOT place checkbox fields
PREFILL:
- For text fields: if the blank is clearly for client name ("${clientName}") or property address ("${propertyAddress}"), set prefillValue to that value
- All other fields: prefillValue = ""`,
},
{
role: 'user',
content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`,
content: [
{
type: 'text',
text: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nAnalyze every page below. Return ALL blanks and form fields you can see — one field per blank line/box. Pages are in order starting from page 1.`,
},
...imageBlocks,
] as ContentBlock[],
},
],
response_format: {