fix(13): switch to GPT-4o vision — render PDF pages as images for accurate field placement
- extractPdfText now renders each page to JPEG via @napi-rs/canvas + pdfjs-dist (108dpi)
- field-placement.ts sends rendered page images to GPT-4o with vision (detail: high)
- AI can now visually identify underlines, signature blocks, date fields, initials boxes
- System prompt focuses on visual cues (blank lines, boxes), not text pattern matching
- Handles multi-field lines: separate fields for signature blank and date blank on same line

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2
teressa-copeland-homes/package-lock.json
generated
2
teressa-copeland-homes/package-lock.json
generated
@@ -11,6 +11,7 @@
|
|||||||
"@cantoo/pdf-lib": "^2.6.3",
|
"@cantoo/pdf-lib": "^2.6.3",
|
||||||
"@dnd-kit/core": "^6.3.1",
|
"@dnd-kit/core": "^6.3.1",
|
||||||
"@dnd-kit/utilities": "^3.2.2",
|
"@dnd-kit/utilities": "^3.2.2",
|
||||||
|
"@napi-rs/canvas": "^0.1.97",
|
||||||
"@react-email/components": "^1.0.10",
|
"@react-email/components": "^1.0.10",
|
||||||
"@react-email/render": "^2.0.4",
|
"@react-email/render": "^2.0.4",
|
||||||
"@vercel/blob": "^2.3.1",
|
"@vercel/blob": "^2.3.1",
|
||||||
@@ -3307,7 +3308,6 @@
|
|||||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz",
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz",
|
||||||
"integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==",
|
"integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"optional": true,
|
|
||||||
"workspaces": [
|
"workspaces": [
|
||||||
"e2e/*"
|
"e2e/*"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
"@cantoo/pdf-lib": "^2.6.3",
|
"@cantoo/pdf-lib": "^2.6.3",
|
||||||
"@dnd-kit/core": "^6.3.1",
|
"@dnd-kit/core": "^6.3.1",
|
||||||
"@dnd-kit/utilities": "^3.2.2",
|
"@dnd-kit/utilities": "^3.2.2",
|
||||||
|
"@napi-rs/canvas": "^0.1.97",
|
||||||
"@react-email/components": "^1.0.10",
|
"@react-email/components": "^1.0.10",
|
||||||
"@react-email/render": "^2.0.4",
|
"@react-email/render": "^2.0.4",
|
||||||
"@vercel/blob": "^2.3.1",
|
"@vercel/blob": "^2.3.1",
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
// @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
|
// @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
|
||||||
import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||||
|
import { createCanvas } from '@napi-rs/canvas';
|
||||||
import { readFile } from 'node:fs/promises';
|
import { readFile } from 'node:fs/promises';
|
||||||
import { join } from 'node:path';
|
import { join } from 'node:path';
|
||||||
|
|
||||||
@@ -12,73 +13,43 @@ import { join } from 'node:path';
|
|||||||
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
|
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
|
||||||
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
|
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
|
||||||
|
|
||||||
/** A single line of text, grouped by approximate Y position. */
|
/** A rendered page image, ready to send to GPT-4o vision. */
|
||||||
export interface TextLine {
|
export interface PageImage {
|
||||||
yPct: number; // % from page TOP (0 = top, 100 = bottom)
|
page: number; // 1-indexed
|
||||||
xPct: number; // % from page LEFT of the first item on this line
|
width: number; // original PDF width in points (scale 1.0)
|
||||||
text: string; // all items on this line joined
|
height: number; // original PDF height in points (scale 1.0)
|
||||||
|
base64: string; // JPEG base64 of the rendered page (no data: prefix)
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Per-page structured data for AI consumption. */
|
// Legacy type alias kept for callers that still reference PageText
|
||||||
export interface PageText {
|
export type PageText = PageImage;
|
||||||
page: number; // 1-indexed
|
|
||||||
width: number; // page width in PDF points
|
|
||||||
height: number; // page height in PDF points
|
|
||||||
lines: TextLine[]; // text grouped into lines, sorted top-to-bottom
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function extractPdfText(filePath: string): Promise<PageText[]> {
|
const RENDER_SCALE = 1.5; // 72dpi × 1.5 = 108dpi — good for vision without huge payloads
|
||||||
|
|
||||||
|
export async function extractPdfText(filePath: string): Promise<PageImage[]> {
|
||||||
const data = new Uint8Array(await readFile(filePath));
|
const data = new Uint8Array(await readFile(filePath));
|
||||||
const pdf = await getDocument({ data }).promise;
|
const pdf = await getDocument({ data }).promise;
|
||||||
const pages: PageText[] = [];
|
const pages: PageImage[] = [];
|
||||||
|
|
||||||
for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
|
for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
|
||||||
const page = await pdf.getPage(pageNum);
|
const page = await pdf.getPage(pageNum);
|
||||||
const viewport = page.getViewport({ scale: 1.0 });
|
const viewport = page.getViewport({ scale: RENDER_SCALE });
|
||||||
const textContent = await page.getTextContent();
|
|
||||||
|
|
||||||
const W = viewport.width;
|
// Create an @napi-rs/canvas and render the PDF page into it
|
||||||
const H = viewport.height;
|
const canvas = createCanvas(Math.round(viewport.width), Math.round(viewport.height));
|
||||||
|
const ctx = canvas.getContext('2d');
|
||||||
|
|
||||||
// Collect raw items with positions
|
// @ts-ignore — @napi-rs/canvas context is compatible at runtime but types diverge
|
||||||
const rawItems: { text: string; x: number; yFromTop: number }[] = [];
|
await page.render({ canvasContext: ctx, viewport }).promise;
|
||||||
for (const item of textContent.items) {
|
|
||||||
if (typeof item !== 'object' || item === null || !('str' in item)) continue;
|
|
||||||
const i = item as { str: string; transform: number[] };
|
|
||||||
if (!i.str.trim()) continue;
|
|
||||||
const x = i.transform[4];
|
|
||||||
const yFromTop = H - i.transform[5]; // PDF y is from bottom; flip to screen coords
|
|
||||||
rawItems.push({ text: i.str, x, yFromTop });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Group items into lines by rounding yFromTop to nearest 4pt bucket
|
const jpegBuffer = canvas.toBuffer('image/jpeg' as never, 85);
|
||||||
const lineMap = new Map<number, { items: typeof rawItems; minX: number }>();
|
|
||||||
for (const item of rawItems) {
|
|
||||||
const bucket = Math.round(item.yFromTop / 4) * 4;
|
|
||||||
const existing = lineMap.get(bucket);
|
|
||||||
if (existing) {
|
|
||||||
existing.items.push(item);
|
|
||||||
existing.minX = Math.min(existing.minX, item.x);
|
|
||||||
} else {
|
|
||||||
lineMap.set(bucket, { items: [item], minX: item.x });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort lines top-to-bottom, join items left-to-right
|
pages.push({
|
||||||
const lines: TextLine[] = Array.from(lineMap.entries())
|
page: pageNum,
|
||||||
.sort(([a], [b]) => a - b)
|
width: page.getViewport({ scale: 1.0 }).width,
|
||||||
.map(([yBucket, { items, minX }]) => {
|
height: page.getViewport({ scale: 1.0 }).height,
|
||||||
const sorted = items.sort((a, b) => a.x - b.x);
|
base64: jpegBuffer.toString('base64'),
|
||||||
return {
|
});
|
||||||
yPct: Math.round((yBucket / H) * 1000) / 10, // 1 decimal place
|
|
||||||
xPct: Math.round((minX / W) * 1000) / 10,
|
|
||||||
text: sorted.map((i) => i.text).join(' '),
|
|
||||||
};
|
|
||||||
})
|
|
||||||
// Cap at 120 lines per page to stay within context limits
|
|
||||||
.slice(0, 120);
|
|
||||||
|
|
||||||
pages.push({ page: pageNum, width: W, height: H, lines });
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return pages;
|
return pages;
|
||||||
|
|||||||
@@ -99,47 +99,64 @@ export async function classifyFieldsWithAI(
|
|||||||
const clientName = client?.name ?? 'Unknown';
|
const clientName = client?.name ?? 'Unknown';
|
||||||
const propertyAddress = client?.propertyAddress ?? 'Unknown';
|
const propertyAddress = client?.propertyAddress ?? 'Unknown';
|
||||||
|
|
||||||
// Build structured page summary — each line includes yPct/xPct so the AI has spatial context
|
// Build vision messages — one image_url block per page
|
||||||
const pagesSummary = pageTexts.map((p) => {
|
type ContentBlock =
|
||||||
const linesSummary = p.lines
|
| { type: 'text'; text: string }
|
||||||
.map((l) => ` y=${l.yPct}% x=${l.xPct}%: ${l.text}`)
|
| { type: 'image_url'; image_url: { url: string; detail: 'high' } };
|
||||||
.join('\n');
|
|
||||||
return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`;
|
const imageBlocks: ContentBlock[] = pageTexts.map((p) => ({
|
||||||
}).join('\n\n');
|
type: 'image_url',
|
||||||
|
image_url: {
|
||||||
|
url: `data:image/jpeg;base64,${p.base64}`,
|
||||||
|
detail: 'high',
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
const response = await openai.chat.completions.create({
|
const response = await openai.chat.completions.create({
|
||||||
model: 'gpt-4o',
|
model: 'gpt-4o',
|
||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: 'system',
|
role: 'system',
|
||||||
content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left).
|
content: `You are a real estate document form field extractor. You will receive images of PDF pages. Your job is to identify every location that needs to be filled in.
|
||||||
|
|
||||||
Your job: identify every location that requires a FIELD to be filled in.
|
WHAT TO PLACE FIELDS ON:
|
||||||
|
- Blank underlines: ____________
|
||||||
|
- Labeled blanks: "Name: ______", "Address: ______", "Price: $______"
|
||||||
|
- Signature lines with labels like "(Seller's Signature)", "(Buyer's Signature)", "(Agent)"
|
||||||
|
- Date lines labeled "(Date)" or with a date underline
|
||||||
|
- Initials boxes: "[ ]" or "_____ Initials" or small boxes at page bottoms/margins
|
||||||
|
|
||||||
FIELD PLACEMENT RULES:
|
WHAT NOT TO PLACE FIELDS ON:
|
||||||
1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes.
|
- Paragraph body text, instructions, legal boilerplate
|
||||||
2. NEVER place fields inside paragraph body text, headings, or descriptive content.
|
- Headings and section titles
|
||||||
3. Look for these patterns as indicators of form fields:
|
|
||||||
- Lines of underscores: "_______" or "___________"
|
FIELD TYPES:
|
||||||
- Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___"
|
- "client-signature" → buyer or seller/client signature lines
|
||||||
- Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer"
|
- "agent-signature" → agent or listing agent signature lines
|
||||||
- Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials"
|
- "initials" → buyer/seller initials boxes
|
||||||
- Date lines: "(Date)", "Date ___", "___ / ___ / ___"
|
- "agent-initials" → agent initials boxes
|
||||||
4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value.
|
- "date" → any date field
|
||||||
5. Match field types:
|
- "text" → all other blanks (names, addresses, prices, terms, etc.)
|
||||||
- "client-signature" → buyer/client signature lines
|
|
||||||
- "agent-signature" → agent/listing agent signature lines
|
POSITIONING:
|
||||||
- "initials" → initials boxes or short initial blanks
|
- xPct and yPct are percentages from the TOP-LEFT of that specific page image
|
||||||
- "agent-initials" → agent-specific initials
|
- Place the field AT the blank line, not above or below it
|
||||||
- "date" → date fields
|
- For a line like "Buyer's Signature __________ Date _______", place a client-signature at the signature blank's x/y and a date field at the date blank's x/y — they are separate fields on the same line
|
||||||
- "text" → any other fill-in-the-blank (names, addresses, prices, etc.)
|
- Do NOT place checkbox fields
|
||||||
6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct.
|
|
||||||
7. Do NOT place checkbox fields.
|
PREFILL:
|
||||||
8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`,
|
- For text fields: if the blank is clearly for client name ("${clientName}") or property address ("${propertyAddress}"), set prefillValue to that value
|
||||||
|
- All other fields: prefillValue = ""`,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
role: 'user',
|
role: 'user',
|
||||||
content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`,
|
content: [
|
||||||
|
{
|
||||||
|
type: 'text',
|
||||||
|
text: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nAnalyze every page below. Return ALL blanks and form fields you can see — one field per blank line/box. Pages are in order starting from page 1.`,
|
||||||
|
},
|
||||||
|
...imageBlocks,
|
||||||
|
] as ContentBlock[],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
response_format: {
|
response_format: {
|
||||||
|
|||||||
Reference in New Issue
Block a user