fix(13): extract text with line positions for accurate AI field placement

- extractPdfText still returns PageText[], but each page now carries `lines: TextLine[]` (with yPct/xPct per line) instead of a flat `text` blob
- AI can now see spatial layout (where blank lines/underscores actually are vs body text)
- Rewrote system prompt: explicit rules about blank lines/underscores/signature blocks,
  place ALL blanks even without prefill value, match field type to label pattern

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Chandler Copeland
2026-03-21 17:35:02 -06:00
parent c67d56dc48
commit b5216a8542
2 changed files with 87 additions and 28 deletions

View File

@@ -12,11 +12,19 @@ import { join } from 'node:path';
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
/**
 * One visual line of page text: every text item whose vertical position falls
 * into the same approximate Y bucket, merged into a single entry.
 */
export interface TextLine {
  /** Vertical position as a percentage of page height, measured from the TOP (0 = top, 100 = bottom). */
  yPct: number;
  /** Horizontal position as a percentage of page width, measured from the LEFT, of the first (leftmost) item on the line. */
  xPct: number;
  /** Text of all items on the line, joined into one string. */
  text: string;
}
/** Per-page structured data for AI consumption. */
export interface PageText {
page: number; // 1-indexed
text: string; // all text items joined with spaces, capped at 2000 chars
width: number; // page width in PDF points (72 DPI)
height: number; // page height in PDF points (72 DPI)
width: number; // page width in PDF points
height: number; // page height in PDF points
lines: TextLine[]; // text grouped into lines, sorted top-to-bottom
}
export async function extractPdfText(filePath: string): Promise<PageText[]> {
@@ -28,18 +36,50 @@ export async function extractPdfText(filePath: string): Promise<PageText[]> {
const page = await pdf.getPage(pageNum);
const viewport = page.getViewport({ scale: 1.0 });
const textContent = await page.getTextContent();
const rawText = textContent.items
.filter((item: unknown) => typeof item === 'object' && item !== null && 'str' in item)
.map((item: unknown) => (item as { str: string }).str)
.join(' ');
// Cap text per page at 2000 chars to stay within GPT-4o-mini context limits
const text = rawText.slice(0, 2000);
pages.push({
page: pageNum,
width: viewport.width,
height: viewport.height,
text,
});
const W = viewport.width;
const H = viewport.height;
// Collect raw items with positions
const rawItems: { text: string; x: number; yFromTop: number }[] = [];
for (const item of textContent.items) {
if (typeof item !== 'object' || item === null || !('str' in item)) continue;
const i = item as { str: string; transform: number[] };
if (!i.str.trim()) continue;
const x = i.transform[4];
const yFromTop = H - i.transform[5]; // PDF y is from bottom; flip to screen coords
rawItems.push({ text: i.str, x, yFromTop });
}
// Group items into lines by rounding yFromTop to nearest 4pt bucket
const lineMap = new Map<number, { items: typeof rawItems; minX: number }>();
for (const item of rawItems) {
const bucket = Math.round(item.yFromTop / 4) * 4;
const existing = lineMap.get(bucket);
if (existing) {
existing.items.push(item);
existing.minX = Math.min(existing.minX, item.x);
} else {
lineMap.set(bucket, { items: [item], minX: item.x });
}
}
// Sort lines top-to-bottom, join items left-to-right
const lines: TextLine[] = Array.from(lineMap.entries())
.sort(([a], [b]) => a - b)
.map(([yBucket, { items, minX }]) => {
const sorted = items.sort((a, b) => a.x - b.x);
return {
yPct: Math.round((yBucket / H) * 1000) / 10, // 1 decimal place
xPct: Math.round((minX / W) * 1000) / 10,
text: sorted.map((i) => i.text).join(' '),
};
})
// Cap at 120 lines per page to stay within context limits
.slice(0, 120);
pages.push({ page: pageNum, width: W, height: H, lines });
}
return pages;
}

View File

@@ -99,28 +99,47 @@ export async function classifyFieldsWithAI(
const clientName = client?.name ?? 'Unknown';
const propertyAddress = client?.propertyAddress ?? 'Unknown';
// Build pages summary — text already capped at 2000 chars per page in extractPdfText
const pagesSummary = pageTexts
.map((p) => `Page ${p.page} (${p.width}x${p.height}pt):\n${p.text}`)
.join('\n\n');
// Build structured page summary — each line includes yPct/xPct so the AI has spatial context
const pagesSummary = pageTexts.map((p) => {
const linesSummary = p.lines
.map((l) => ` y=${l.yPct}% x=${l.xPct}%: ${l.text}`)
.join('\n');
return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`;
}).join('\n\n');
const response = await openai.chat.completions.create({
model: 'gpt-4o',
messages: [
{
role: 'system',
content: `You are a real estate document form field extractor.
Given extracted text from a PDF page (with context about page number and dimensions),
identify where signature, initials, text, and date fields should be placed.
Return fields as percentage positions (0-100) from the TOP-LEFT of the page.
Use these field types: text (for typed values), initials, date, client-signature, agent-signature, agent-initials.
Only place fields where the document clearly requires them — prefer fewer, high-confidence placements.
For text fields that match the client name or property address, set prefillValue to the known value. Otherwise use empty string.
Do NOT place checkbox fields.`,
content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left).
Your job: identify every location that requires a FIELD to be filled in.
FIELD PLACEMENT RULES:
1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes.
2. NEVER place fields inside paragraph body text, headings, or descriptive content.
3. Look for these patterns as indicators of form fields:
- Lines of underscores: "_______" or "___________"
- Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___"
- Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer"
- Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials"
- Date lines: "(Date)", "Date ___", "___ / ___ / ___"
4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value.
5. Match field types:
- "client-signature" → buyer/client signature lines
- "agent-signature" → agent/listing agent signature lines
- "initials" → initials boxes or short initial blanks
- "agent-initials" → agent-specific initials
- "date" → date fields
- "text" → any other fill-in-the-blank (names, addresses, prices, etc.)
6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct.
7. Do NOT place checkbox fields.
8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`,
},
{
role: 'user',
content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nPDF pages:\n${pagesSummary}`,
content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`,
},
],
response_format: {