fix(13): extract text with line positions for accurate AI field placement
- extractPdfText now returns, for each page, a `lines: TextLine[]` array with yPct/xPct per line instead of a flat text blob
- AI can now see the spatial layout (where blank lines/underscores actually are vs. body text)
- Rewrote system prompt: explicit rules about blank lines/underscores/signature blocks, place ALL blanks even without a prefill value, match field type to label pattern

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,11 +12,19 @@ import { join } from 'node:path';
|
|||||||
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
|
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
|
||||||
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
|
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
|
||||||
|
|
||||||
|
/** A single line of text, grouped by approximate Y position. */
|
||||||
|
export interface TextLine {
|
||||||
|
yPct: number; // % from page TOP (0 = top, 100 = bottom)
|
||||||
|
xPct: number; // % from page LEFT of the first item on this line
|
||||||
|
text: string; // all items on this line joined
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Per-page structured data for AI consumption. */
|
||||||
export interface PageText {
|
export interface PageText {
|
||||||
page: number; // 1-indexed
|
page: number; // 1-indexed
|
||||||
text: string; // all text items joined with spaces, capped at 2000 chars
|
width: number; // page width in PDF points
|
||||||
width: number; // page width in PDF points (72 DPI)
|
height: number; // page height in PDF points
|
||||||
height: number; // page height in PDF points (72 DPI)
|
lines: TextLine[]; // text grouped into lines, sorted top-to-bottom
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function extractPdfText(filePath: string): Promise<PageText[]> {
|
export async function extractPdfText(filePath: string): Promise<PageText[]> {
|
||||||
@@ -28,18 +36,50 @@ export async function extractPdfText(filePath: string): Promise<PageText[]> {
|
|||||||
const page = await pdf.getPage(pageNum);
|
const page = await pdf.getPage(pageNum);
|
||||||
const viewport = page.getViewport({ scale: 1.0 });
|
const viewport = page.getViewport({ scale: 1.0 });
|
||||||
const textContent = await page.getTextContent();
|
const textContent = await page.getTextContent();
|
||||||
const rawText = textContent.items
|
|
||||||
.filter((item: unknown) => typeof item === 'object' && item !== null && 'str' in item)
|
const W = viewport.width;
|
||||||
.map((item: unknown) => (item as { str: string }).str)
|
const H = viewport.height;
|
||||||
.join(' ');
|
|
||||||
// Cap text per page at 2000 chars to stay within GPT-4o-mini context limits
|
// Collect raw items with positions
|
||||||
const text = rawText.slice(0, 2000);
|
const rawItems: { text: string; x: number; yFromTop: number }[] = [];
|
||||||
pages.push({
|
for (const item of textContent.items) {
|
||||||
page: pageNum,
|
if (typeof item !== 'object' || item === null || !('str' in item)) continue;
|
||||||
width: viewport.width,
|
const i = item as { str: string; transform: number[] };
|
||||||
height: viewport.height,
|
if (!i.str.trim()) continue;
|
||||||
text,
|
const x = i.transform[4];
|
||||||
});
|
const yFromTop = H - i.transform[5]; // PDF y is from bottom; flip to screen coords
|
||||||
|
rawItems.push({ text: i.str, x, yFromTop });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Group items into lines by rounding yFromTop to nearest 4pt bucket
|
||||||
|
const lineMap = new Map<number, { items: typeof rawItems; minX: number }>();
|
||||||
|
for (const item of rawItems) {
|
||||||
|
const bucket = Math.round(item.yFromTop / 4) * 4;
|
||||||
|
const existing = lineMap.get(bucket);
|
||||||
|
if (existing) {
|
||||||
|
existing.items.push(item);
|
||||||
|
existing.minX = Math.min(existing.minX, item.x);
|
||||||
|
} else {
|
||||||
|
lineMap.set(bucket, { items: [item], minX: item.x });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort lines top-to-bottom, join items left-to-right
|
||||||
|
const lines: TextLine[] = Array.from(lineMap.entries())
|
||||||
|
.sort(([a], [b]) => a - b)
|
||||||
|
.map(([yBucket, { items, minX }]) => {
|
||||||
|
const sorted = items.sort((a, b) => a.x - b.x);
|
||||||
|
return {
|
||||||
|
yPct: Math.round((yBucket / H) * 1000) / 10, // 1 decimal place
|
||||||
|
xPct: Math.round((minX / W) * 1000) / 10,
|
||||||
|
text: sorted.map((i) => i.text).join(' '),
|
||||||
|
};
|
||||||
|
})
|
||||||
|
// Cap at 120 lines per page to stay within context limits
|
||||||
|
.slice(0, 120);
|
||||||
|
|
||||||
|
pages.push({ page: pageNum, width: W, height: H, lines });
|
||||||
|
}
|
||||||
|
|
||||||
return pages;
|
return pages;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -99,28 +99,47 @@ export async function classifyFieldsWithAI(
|
|||||||
const clientName = client?.name ?? 'Unknown';
|
const clientName = client?.name ?? 'Unknown';
|
||||||
const propertyAddress = client?.propertyAddress ?? 'Unknown';
|
const propertyAddress = client?.propertyAddress ?? 'Unknown';
|
||||||
|
|
||||||
// Build pages summary — text already capped at 2000 chars per page in extractPdfText
|
// Build structured page summary — each line includes yPct/xPct so the AI has spatial context
|
||||||
const pagesSummary = pageTexts
|
const pagesSummary = pageTexts.map((p) => {
|
||||||
.map((p) => `Page ${p.page} (${p.width}x${p.height}pt):\n${p.text}`)
|
const linesSummary = p.lines
|
||||||
.join('\n\n');
|
.map((l) => ` y=${l.yPct}% x=${l.xPct}%: ${l.text}`)
|
||||||
|
.join('\n');
|
||||||
|
return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`;
|
||||||
|
}).join('\n\n');
|
||||||
|
|
||||||
const response = await openai.chat.completions.create({
|
const response = await openai.chat.completions.create({
|
||||||
model: 'gpt-4o',
|
model: 'gpt-4o',
|
||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: 'system',
|
role: 'system',
|
||||||
content: `You are a real estate document form field extractor.
|
content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left).
|
||||||
Given extracted text from a PDF page (with context about page number and dimensions),
|
|
||||||
identify where signature, initials, text, and date fields should be placed.
|
Your job: identify every location that requires a FIELD to be filled in.
|
||||||
Return fields as percentage positions (0-100) from the TOP-LEFT of the page.
|
|
||||||
Use these field types: text (for typed values), initials, date, client-signature, agent-signature, agent-initials.
|
FIELD PLACEMENT RULES:
|
||||||
Only place fields where the document clearly requires them — prefer fewer, high-confidence placements.
|
1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes.
|
||||||
For text fields that match the client name or property address, set prefillValue to the known value. Otherwise use empty string.
|
2. NEVER place fields inside paragraph body text, headings, or descriptive content.
|
||||||
Do NOT place checkbox fields.`,
|
3. Look for these patterns as indicators of form fields:
|
||||||
|
- Lines of underscores: "_______" or "___________"
|
||||||
|
- Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___"
|
||||||
|
- Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer"
|
||||||
|
- Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials"
|
||||||
|
- Date lines: "(Date)", "Date ___", "___ / ___ / ___"
|
||||||
|
4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value.
|
||||||
|
5. Match field types:
|
||||||
|
- "client-signature" → buyer/client signature lines
|
||||||
|
- "agent-signature" → agent/listing agent signature lines
|
||||||
|
- "initials" → initials boxes or short initial blanks
|
||||||
|
- "agent-initials" → agent-specific initials
|
||||||
|
- "date" → date fields
|
||||||
|
- "text" → any other fill-in-the-blank (names, addresses, prices, etc.)
|
||||||
|
6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct.
|
||||||
|
7. Do NOT place checkbox fields.
|
||||||
|
8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
role: 'user',
|
role: 'user',
|
||||||
content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nPDF pages:\n${pagesSummary}`,
|
content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
response_format: {
|
response_format: {
|
||||||
|
|||||||
Reference in New Issue
Block a user