fix(13): extract text with line positions for accurate AI field placement

- extractPdfText now returns TextLine[] with yPct/xPct per line instead of flat text blob
- AI can now see spatial layout (where blank lines/underscores actually are vs body text)
- Rewrote system prompt: explicit rules about blank lines/underscores/signature blocks,
  place ALL blanks even without prefill value, match field type to label pattern

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Chandler Copeland
2026-03-21 17:35:02 -06:00
parent c67d56dc48
commit b5216a8542
2 changed files with 87 additions and 28 deletions

View File

@@ -12,11 +12,19 @@ import { join } from 'node:path';
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs. // Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`; GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
/**
 * A single line of text, grouped by approximate Y position.
 *
 * Text items whose vertical position falls into the same 4pt bucket are
 * merged into one line. Both coordinates are percentages of the page
 * dimensions, rounded to one decimal place, so they are approximate layout
 * hints rather than exact glyph positions.
 */
export interface TextLine {
// % from page TOP (0 = top, 100 = bottom); computed from the line's 4pt Y
// bucket after flipping PDF bottom-origin Y to top-origin.
yPct: number;
// % from page LEFT (0 = left edge) of the leftmost item on this line.
xPct: number;
// All items on this line, ordered left-to-right and joined with single spaces.
text: string;
}
/** Per-page structured data for AI consumption. */
export interface PageText { export interface PageText {
page: number; // 1-indexed page: number; // 1-indexed
text: string; // all text items joined with spaces, capped at 2000 chars width: number; // page width in PDF points
width: number; // page width in PDF points (72 DPI) height: number; // page height in PDF points
height: number; // page height in PDF points (72 DPI) lines: TextLine[]; // text grouped into lines, sorted top-to-bottom
} }
export async function extractPdfText(filePath: string): Promise<PageText[]> { export async function extractPdfText(filePath: string): Promise<PageText[]> {
@@ -28,18 +36,50 @@ export async function extractPdfText(filePath: string): Promise<PageText[]> {
const page = await pdf.getPage(pageNum); const page = await pdf.getPage(pageNum);
const viewport = page.getViewport({ scale: 1.0 }); const viewport = page.getViewport({ scale: 1.0 });
const textContent = await page.getTextContent(); const textContent = await page.getTextContent();
const rawText = textContent.items
.filter((item: unknown) => typeof item === 'object' && item !== null && 'str' in item) const W = viewport.width;
.map((item: unknown) => (item as { str: string }).str) const H = viewport.height;
.join(' ');
// Cap text per page at 2000 chars to stay within GPT-4o-mini context limits // Collect raw items with positions
const text = rawText.slice(0, 2000); const rawItems: { text: string; x: number; yFromTop: number }[] = [];
pages.push({ for (const item of textContent.items) {
page: pageNum, if (typeof item !== 'object' || item === null || !('str' in item)) continue;
width: viewport.width, const i = item as { str: string; transform: number[] };
height: viewport.height, if (!i.str.trim()) continue;
text, const x = i.transform[4];
}); const yFromTop = H - i.transform[5]; // PDF y is from bottom; flip to screen coords
rawItems.push({ text: i.str, x, yFromTop });
} }
// Group items into lines by rounding yFromTop to nearest 4pt bucket
const lineMap = new Map<number, { items: typeof rawItems; minX: number }>();
for (const item of rawItems) {
const bucket = Math.round(item.yFromTop / 4) * 4;
const existing = lineMap.get(bucket);
if (existing) {
existing.items.push(item);
existing.minX = Math.min(existing.minX, item.x);
} else {
lineMap.set(bucket, { items: [item], minX: item.x });
}
}
// Sort lines top-to-bottom, join items left-to-right
const lines: TextLine[] = Array.from(lineMap.entries())
.sort(([a], [b]) => a - b)
.map(([yBucket, { items, minX }]) => {
const sorted = items.sort((a, b) => a.x - b.x);
return {
yPct: Math.round((yBucket / H) * 1000) / 10, // 1 decimal place
xPct: Math.round((minX / W) * 1000) / 10,
text: sorted.map((i) => i.text).join(' '),
};
})
// Cap at 120 lines per page to stay within context limits
.slice(0, 120);
pages.push({ page: pageNum, width: W, height: H, lines });
}
return pages; return pages;
} }

View File

@@ -99,28 +99,47 @@ export async function classifyFieldsWithAI(
const clientName = client?.name ?? 'Unknown'; const clientName = client?.name ?? 'Unknown';
const propertyAddress = client?.propertyAddress ?? 'Unknown'; const propertyAddress = client?.propertyAddress ?? 'Unknown';
// Build pages summary — text already capped at 2000 chars per page in extractPdfText // Build structured page summary — each line includes yPct/xPct so the AI has spatial context
const pagesSummary = pageTexts const pagesSummary = pageTexts.map((p) => {
.map((p) => `Page ${p.page} (${p.width}x${p.height}pt):\n${p.text}`) const linesSummary = p.lines
.join('\n\n'); .map((l) => ` y=${l.yPct}% x=${l.xPct}%: ${l.text}`)
.join('\n');
return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`;
}).join('\n\n');
const response = await openai.chat.completions.create({ const response = await openai.chat.completions.create({
model: 'gpt-4o', model: 'gpt-4o',
messages: [ messages: [
{ {
role: 'system', role: 'system',
content: `You are a real estate document form field extractor. content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left).
Given extracted text from a PDF page (with context about page number and dimensions),
identify where signature, initials, text, and date fields should be placed. Your job: identify every location that requires a FIELD to be filled in.
Return fields as percentage positions (0-100) from the TOP-LEFT of the page.
Use these field types: text (for typed values), initials, date, client-signature, agent-signature, agent-initials. FIELD PLACEMENT RULES:
Only place fields where the document clearly requires them — prefer fewer, high-confidence placements. 1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes.
For text fields that match the client name or property address, set prefillValue to the known value. Otherwise use empty string. 2. NEVER place fields inside paragraph body text, headings, or descriptive content.
Do NOT place checkbox fields.`, 3. Look for these patterns as indicators of form fields:
- Lines of underscores: "_______" or "___________"
- Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___"
- Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer"
- Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials"
- Date lines: "(Date)", "Date ___", "___ / ___ / ___"
4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value.
5. Match field types:
- "client-signature" → buyer/client signature lines
- "agent-signature" → agent/listing agent signature lines
- "initials" → initials boxes or short initial blanks
- "agent-initials" → agent-specific initials
- "date" → date fields
- "text" → any other fill-in-the-blank (names, addresses, prices, etc.)
6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct.
7. Do NOT place checkbox fields.
8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`,
}, },
{ {
role: 'user', role: 'user',
content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nPDF pages:\n${pagesSummary}`, content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`,
}, },
], ],
response_format: { response_format: {