diff --git a/teressa-copeland-homes/src/lib/ai/extract-text.ts b/teressa-copeland-homes/src/lib/ai/extract-text.ts index 6d94eb8..bb2e225 100644 --- a/teressa-copeland-homes/src/lib/ai/extract-text.ts +++ b/teressa-copeland-homes/src/lib/ai/extract-text.ts @@ -12,11 +12,19 @@ import { join } from 'node:path'; // Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs. GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`; +/** A single line of text, grouped by approximate Y position. */ +export interface TextLine { + yPct: number; // % from page TOP (0 = top, 100 = bottom) + xPct: number; // % from page LEFT of the first item on this line + text: string; // all items on this line joined +} + +/** Per-page structured data for AI consumption. */ export interface PageText { page: number; // 1-indexed - text: string; // all text items joined with spaces, capped at 2000 chars - width: number; // page width in PDF points (72 DPI) - height: number; // page height in PDF points (72 DPI) + width: number; // page width in PDF points + height: number; // page height in PDF points + lines: TextLine[]; // text grouped into lines, sorted top-to-bottom } export async function extractPdfText(filePath: string): Promise { @@ -28,18 +36,50 @@ export async function extractPdfText(filePath: string): Promise { const page = await pdf.getPage(pageNum); const viewport = page.getViewport({ scale: 1.0 }); const textContent = await page.getTextContent(); - const rawText = textContent.items - .filter((item: unknown) => typeof item === 'object' && item !== null && 'str' in item) - .map((item: unknown) => (item as { str: string }).str) - .join(' '); - // Cap text per page at 2000 chars to stay within GPT-4o-mini context limits - const text = rawText.slice(0, 2000); - pages.push({ - page: pageNum, - width: viewport.width, - height: viewport.height, - text, - }); + + const W = viewport.width; + const H = viewport.height; + + // Collect raw items with positions + const rawItems: { text: string; x: number; yFromTop: number }[] = []; + for (const item of textContent.items) { + if (typeof item !== 'object' || item === null || !('str' in item)) continue; + const i = item as { str: string; transform: number[] }; + if (!i.str.trim()) continue; + const x = i.transform[4]; + const yFromTop = H - i.transform[5]; // PDF y is from bottom; flip to screen coords + rawItems.push({ text: i.str, x, yFromTop }); + } + + // Group items into lines by rounding yFromTop to nearest 4pt bucket + const lineMap = new Map(); + for (const item of rawItems) { + const bucket = Math.round(item.yFromTop / 4) * 4; + const existing = lineMap.get(bucket); + if (existing) { + existing.items.push(item); + existing.minX = Math.min(existing.minX, item.x); + } else { + lineMap.set(bucket, { items: [item], minX: item.x }); + } + } + + // Sort lines top-to-bottom, join items left-to-right + const lines: TextLine[] = Array.from(lineMap.entries()) + .sort(([a], [b]) => a - b) + .map(([yBucket, { items, minX }]) => { + const sorted = items.sort((a, b) => a.x - b.x); + return { + yPct: Math.round((yBucket / H) * 1000) / 10, // 1 decimal place + xPct: Math.round((minX / W) * 1000) / 10, + text: sorted.map((i) => i.text).join(' '), + }; + }) + // Cap at 120 lines per page to stay within context limits + .slice(0, 120); + + pages.push({ page: pageNum, width: W, height: H, lines }); } + return pages; } diff --git a/teressa-copeland-homes/src/lib/ai/field-placement.ts b/teressa-copeland-homes/src/lib/ai/field-placement.ts index b9dc808..729a97f 100644 --- a/teressa-copeland-homes/src/lib/ai/field-placement.ts +++ b/teressa-copeland-homes/src/lib/ai/field-placement.ts @@ -99,28 +99,47 @@ export async function classifyFieldsWithAI( const clientName = client?.name ?? 'Unknown'; const propertyAddress = client?.propertyAddress ?? 'Unknown'; - // Build pages summary — text already capped at 2000 chars per page in extractPdfText - const pagesSummary = pageTexts - .map((p) => `Page ${p.page} (${p.width}x${p.height}pt):\n${p.text}`) - .join('\n\n'); + // Build structured page summary — each line includes yPct/xPct so the AI has spatial context + const pagesSummary = pageTexts.map((p) => { + const linesSummary = p.lines + .map((l) => ` y=${l.yPct}% x=${l.xPct}%: ${l.text}`) + .join('\n'); + return `=== Page ${p.page} (${p.width}x${p.height}pt) ===\n${linesSummary}`; + }).join('\n\n'); const response = await openai.chat.completions.create({ model: 'gpt-4o', messages: [ { role: 'system', - content: `You are a real estate document form field extractor. -Given extracted text from a PDF page (with context about page number and dimensions), -identify where signature, initials, text, and date fields should be placed. -Return fields as percentage positions (0-100) from the TOP-LEFT of the page. -Use these field types: text (for typed values), initials, date, client-signature, agent-signature, agent-initials. -Only place fields where the document clearly requires them — prefer fewer, high-confidence placements. -For text fields that match the client name or property address, set prefillValue to the known value. Otherwise use empty string. -Do NOT place checkbox fields.`, + content: `You are a real estate document form field extractor. You receive structured text from PDF pages where each line includes its Y position (% from top) and X position (% from left). + +Your job: identify every location that requires a FIELD to be filled in. + +FIELD PLACEMENT RULES: +1. Only place fields at actual form field locations — blank lines (___), labeled input areas, signature blocks, date lines, and initials boxes. +2. NEVER place fields inside paragraph body text, headings, or descriptive content. +3. Look for these patterns as indicators of form fields: + - Lines of underscores: "_______" or "___________" + - Labels followed by blank space: "Date: ___", "Name: ___", "Address: ___" + - Signature lines labeled: "(Seller's Signature)", "(Buyer's Signature)", "(Agent Signature)", "Seller", "Buyer" + - Initials indicators: "Initials", "[ ]", "(Initials)", "_________ Initials" + - Date lines: "(Date)", "Date ___", "___ / ___ / ___" +4. For EVERY such blank or label you find, add a field — even if you have nothing to prefill. Leave prefillValue as "" if you don't know the value. +5. Match field types: + - "client-signature" → buyer/client signature lines + - "agent-signature" → agent/listing agent signature lines + - "initials" → initials boxes or short initial blanks + - "agent-initials" → agent-specific initials + - "date" → date fields + - "text" → any other fill-in-the-blank (names, addresses, prices, etc.) +6. Place the field AT the blank/label's yPct. Use the xPct from that line for xPct. +7. Do NOT place checkbox fields. +8. For text fields where the value matches the client name or property address, set prefillValue. Otherwise use "".`, }, { role: 'user', - content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nPDF pages:\n${pagesSummary}`, + content: `Client name: ${clientName}\nProperty address: ${propertyAddress}\n\nDocument pages (each line shows position and text):\n\n${pagesSummary}`, }, ], response_format: {