fix(13): switch to GPT-4o vision — render PDF pages as images for accurate field placement
- extractPdfText now renders each page to JPEG via @napi-rs/canvas + pdfjs-dist (108dpi)
- field-placement.ts sends rendered page images to GPT-4o with vision (detail: high)
- AI can now visually identify underlines, signature blocks, date fields, initials boxes
- System prompt focuses on visual cues (blank lines, boxes) not text pattern matching
- Handles multi-field lines: separate fields for signature blank and date blank on same line

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2
teressa-copeland-homes/package-lock.json
generated
2
teressa-copeland-homes/package-lock.json
generated
@@ -11,6 +11,7 @@
|
||||
"@cantoo/pdf-lib": "^2.6.3",
|
||||
"@dnd-kit/core": "^6.3.1",
|
||||
"@dnd-kit/utilities": "^3.2.2",
|
||||
"@napi-rs/canvas": "^0.1.97",
|
||||
"@react-email/components": "^1.0.10",
|
||||
"@react-email/render": "^2.0.4",
|
||||
"@vercel/blob": "^2.3.1",
|
||||
@@ -3307,7 +3308,6 @@
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz",
|
||||
"integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"workspaces": [
|
||||
"e2e/*"
|
||||
],
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
"@cantoo/pdf-lib": "^2.6.3",
|
||||
"@dnd-kit/core": "^6.3.1",
|
||||
"@dnd-kit/utilities": "^3.2.2",
|
||||
"@napi-rs/canvas": "^0.1.97",
|
||||
"@react-email/components": "^1.0.10",
|
||||
"@react-email/render": "^2.0.4",
|
||||
"@vercel/blob": "^2.3.1",
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
// @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
|
||||
import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
import { createCanvas } from '@napi-rs/canvas';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
|
||||
@@ -12,73 +13,43 @@ import { join } from 'node:path';
|
||||
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
|
||||
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
|
||||
|
||||
/** A single line of text, grouped by approximate Y position. */
|
||||
export interface TextLine {
|
||||
yPct: number; // % from page TOP (0 = top, 100 = bottom)
|
||||
xPct: number; // % from page LEFT of the first item on this line
|
||||
text: string; // all items on this line joined
|
||||
/** A rendered page image, ready to send to GPT-4o vision. */
|
||||
export interface PageImage {
|
||||
page: number; // 1-indexed
|
||||
width: number; // original PDF width in points (scale 1.0)
|
||||
height: number; // original PDF height in points (scale 1.0)
|
||||
base64: string; // JPEG base64 of the rendered page (no data: prefix)
|
||||
}
|
||||
|
||||
/** Per-page structured data for AI consumption. */
|
||||
export interface PageText {
|
||||
page: number; // 1-indexed
|
||||
width: number; // page width in PDF points
|
||||
height: number; // page height in PDF points
|
||||
lines: TextLine[]; // text grouped into lines, sorted top-to-bottom
|
||||
}
|
||||
// Legacy type alias kept for callers that still reference PageText
|
||||
export type PageText = PageImage;
|
||||
|
||||
export async function extractPdfText(filePath: string): Promise<PageText[]> {
|
||||
const RENDER_SCALE = 1.5; // 72dpi × 1.5 = 108dpi — good for vision without huge payloads
|
||||
|
||||
export async function extractPdfText(filePath: string): Promise<PageImage[]> {
|
||||
const data = new Uint8Array(await readFile(filePath));
|
||||
const pdf = await getDocument({ data }).promise;
|
||||
const pages: PageText[] = [];
|
||||
const pages: PageImage[] = [];
|
||||
|
||||
for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
|
||||
const page = await pdf.getPage(pageNum);
|
||||
const viewport = page.getViewport({ scale: 1.0 });
|
||||
const textContent = await page.getTextContent();
|
||||
const viewport = page.getViewport({ scale: RENDER_SCALE });
|
||||
|
||||
const W = viewport.width;
|
||||
const H = viewport.height;
|
||||
// Create an @napi-rs/canvas and render the PDF page into it
|
||||
const canvas = createCanvas(Math.round(viewport.width), Math.round(viewport.height));
|
||||
const ctx = canvas.getContext('2d');
|
||||
|
||||
// Collect raw items with positions
|
||||
const rawItems: { text: string; x: number; yFromTop: number }[] = [];
|
||||
for (const item of textContent.items) {
|
||||
if (typeof item !== 'object' || item === null || !('str' in item)) continue;
|
||||
const i = item as { str: string; transform: number[] };
|
||||
if (!i.str.trim()) continue;
|
||||
const x = i.transform[4];
|
||||
const yFromTop = H - i.transform[5]; // PDF y is from bottom; flip to screen coords
|
||||
rawItems.push({ text: i.str, x, yFromTop });
|
||||
}
|
||||
// @ts-ignore — @napi-rs/canvas context is compatible at runtime but types diverge
|
||||
await page.render({ canvasContext: ctx, viewport }).promise;
|
||||
|
||||
// Group items into lines by rounding yFromTop to nearest 4pt bucket
|
||||
const lineMap = new Map<number, { items: typeof rawItems; minX: number }>();
|
||||
for (const item of rawItems) {
|
||||
const bucket = Math.round(item.yFromTop / 4) * 4;
|
||||
const existing = lineMap.get(bucket);
|
||||
if (existing) {
|
||||
existing.items.push(item);
|
||||
existing.minX = Math.min(existing.minX, item.x);
|
||||
} else {
|
||||
lineMap.set(bucket, { items: [item], minX: item.x });
|
||||
}
|
||||
}
|
||||
const jpegBuffer = canvas.toBuffer('image/jpeg' as never, 85);
|
||||
|
||||
// Sort lines top-to-bottom, join items left-to-right
|
||||
const lines: TextLine[] = Array.from(lineMap.entries())
|
||||
.sort(([a], [b]) => a - b)
|
||||
.map(([yBucket, { items, minX }]) => {
|
||||
const sorted = items.sort((a, b) => a.x - b.x);
|
||||
return {
|
||||
yPct: Math.round((yBucket / H) * 1000) / 10, // 1 decimal place
|
||||
xPct: Math.round((minX / W) * 1000) / 10,
|
||||
text: sorted.map((i) => i.text).join(' '),
|
||||
};
|
||||
})
|
||||
// Cap at 120 lines per page to stay within context limits
|
||||
.slice(0, 120);
|
||||
|
||||
pages.push({ page: pageNum, width: W, height: H, lines });
|
||||
pages.push({
|
||||
page: pageNum,
|
||||
width: page.getViewport({ scale: 1.0 }).width,
|
||||
height: page.getViewport({ scale: 1.0 }).height,
|
||||
base64: jpegBuffer.toString('base64'),
|
||||
});
|
||||
}
|
||||
|
||||
return pages;
|
||||
|
||||
@@ -99,47 +99,64 @@ export async function classifyFieldsWithAI(
|
||||
const clientName = client?.name ?? 'Unknown';
|
||||
const propertyAddress = client?.propertyAddress ?? 'Unknown';
|
||||
|
||||
// Build structured page summary — each line includes yPct/xPct so the AI has spatial context
|
||||
const pagesSummary = pageTexts.map((p) => {
|
||||
const linesSummary = p.lines
|
||||
.map((l) => ` y=${l.yPct}% x=${l.xPct}%: ${l.text}`)
|
||||
.join('\n');
|
||||
return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`;
|
||||
}).join('\n\n');
|
||||
// Build vision messages — one image_url block per page
|
||||
type ContentBlock =
|
||||
| { type: 'text'; text: string }
|
||||
| { type: 'image_url'; image_url: { url: string; detail: 'high' } };
|
||||
|
||||
const imageBlocks: ContentBlock[] = pageTexts.map((p) => ({
|
||||
type: 'image_url',
|
||||
image_url: {
|
||||
url: `data:image/jpeg;base64,${p.base64}`,
|
||||
detail: 'high',
|
||||
},
|
||||
}));
|
||||
|
||||
const response = await openai.chat.completions.create({
|
||||
model: 'gpt-4o',
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left).
|
||||
content: `You are a real estate document form field extractor. You will receive images of PDF pages. Your job is to identify every location that needs to be filled in.
|
||||
|
||||
Your job: identify every location that requires a FIELD to be filled in.
|
||||
WHAT TO PLACE FIELDS ON:
|
||||
- Blank underlines: ____________
|
||||
- Labeled blanks: "Name: ______", "Address: ______", "Price: $______"
|
||||
- Signature lines with labels like "(Seller's Signature)", "(Buyer's Signature)", "(Agent)"
|
||||
- Date lines labeled "(Date)" or with a date underline
|
||||
- Initials boxes: "[ ]" or "_____ Initials" or small boxes at page bottoms/margins
|
||||
|
||||
FIELD PLACEMENT RULES:
|
||||
1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes.
|
||||
2. NEVER place fields inside paragraph body text, headings, or descriptive content.
|
||||
3. Look for these patterns as indicators of form fields:
|
||||
- Lines of underscores: "_______" or "___________"
|
||||
- Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___"
|
||||
- Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer"
|
||||
- Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials"
|
||||
- Date lines: "(Date)", "Date ___", "___ / ___ / ___"
|
||||
4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value.
|
||||
5. Match field types:
|
||||
- "client-signature" → buyer/client signature lines
|
||||
- "agent-signature" → agent/listing agent signature lines
|
||||
- "initials" → initials boxes or short initial blanks
|
||||
- "agent-initials" → agent-specific initials
|
||||
- "date" → date fields
|
||||
- "text" → any other fill-in-the-blank (names, addresses, prices, etc.)
|
||||
6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct.
|
||||
7. Do NOT place checkbox fields.
|
||||
8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`,
|
||||
WHAT NOT TO PLACE FIELDS ON:
|
||||
- Paragraph body text, instructions, legal boilerplate
|
||||
- Headings and section titles
|
||||
|
||||
FIELD TYPES:
|
||||
- "client-signature" → buyer or seller/client signature lines
|
||||
- "agent-signature" → agent or listing agent signature lines
|
||||
- "initials" → buyer/seller initials boxes
|
||||
- "agent-initials" → agent initials boxes
|
||||
- "date" → any date field
|
||||
- "text" → all other blanks (names, addresses, prices, terms, etc.)
|
||||
|
||||
POSITIONING:
|
||||
- xPct and yPct are percentages from the TOP-LEFT of that specific page image
|
||||
- Place the field AT the blank line, not above or below it
|
||||
- For a line like "Buyer's Signature __________ Date _______", place a client-signature at the signature blank's x/y and a date field at the date blank's x/y — they are separate fields on the same line
|
||||
- Do NOT place checkbox fields
|
||||
|
||||
PREFILL:
|
||||
- For text fields: if the blank is clearly for client name ("${clientName}") or property address ("${propertyAddress}"), set prefillValue to that value
|
||||
- All other fields: prefillValue = ""`,
|
||||
},
|
||||
{
|
||||
role: 'user',
|
||||
content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`,
|
||||
content: [
|
||||
{
|
||||
type: 'text',
|
||||
text: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nAnalyze every page below. Return ALL blanks and form fields you can see — one field per blank line/box. Pages are in order starting from page 1.`,
|
||||
},
|
||||
...imageBlocks,
|
||||
] as ContentBlock[],
|
||||
},
|
||||
],
|
||||
response_format: {
|
||||
|
||||
Reference in New Issue
Block a user