fix(docker): polyfill DOMMatrix/ImageData/Path2D for pdfjs-dist in linux/amd64 container via NODE_OPTIONS --require
This commit is contained in:
@@ -30,6 +30,9 @@ COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static
|
||||
# Copy seeds/forms for form library import feature (runtime dependency)
|
||||
COPY --from=builder --chown=nextjs:nodejs /app/seeds ./seeds
|
||||
|
||||
# Copy polyfill script (required via NODE_OPTIONS before server starts)
|
||||
COPY --from=builder --chown=nextjs:nodejs /app/scripts/polyfill-dom.cjs ./scripts/polyfill-dom.cjs
|
||||
|
||||
# Create uploads directory (will be mounted as volume)
|
||||
RUN mkdir -p uploads && chown nextjs:nodejs uploads
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ services:
|
||||
env_file:
|
||||
- .env.production
|
||||
environment:
|
||||
- NODE_OPTIONS=--dns-result-order=ipv4first
|
||||
- NODE_OPTIONS=--dns-result-order=ipv4first --require /app/scripts/polyfill-dom.cjs
|
||||
dns:
|
||||
- 8.8.8.8
|
||||
- 1.1.1.1
|
||||
|
||||
11
teressa-copeland-homes/scripts/polyfill-dom.cjs
Normal file
11
teressa-copeland-homes/scripts/polyfill-dom.cjs
Normal file
@@ -0,0 +1,11 @@
|
||||
// Polyfills for browser globals required by pdfjs-dist in Node.js.
|
||||
// Required via NODE_OPTIONS=--require so it runs before any module evaluation.
|
||||
// Install an empty stand-in class for each browser global listed below, but
// only where the runtime has not already defined one. The stubs accept and
// ignore any constructor arguments and expose no methods.
for (const name of ['DOMMatrix', 'ImageData', 'Path2D']) {
  if (typeof Reflect.get(globalThis, name) === 'undefined') {
    // Defining the anonymous class under a computed key gives it a matching
    // `name` (e.g. 'DOMMatrix'). Reflect.set behaves like a plain assignment
    // to globalThis[name].
    Reflect.set(globalThis, name, { [name]: class {} }[name]);
  }
}
|
||||
23
teressa-copeland-homes/src/instrumentation.ts
Normal file
23
teressa-copeland-homes/src/instrumentation.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
/**
 * Next.js instrumentation hook — runs once before any route handler.
 *
 * Polyfills browser globals required by pdfjs-dist at module load time.
 * @napi-rs/canvas normally provides these but may be absent in some server
 * environments (e.g. linux/amd64 Docker image on ARM build host).
 * Text extraction never actually calls these — pdfjs needs the classes defined.
 *
 * Only acts when `NEXT_RUNTIME` is `'nodejs'`. For each of `DOMMatrix`,
 * `ImageData` and `Path2D`, an empty stub class is installed on `globalThis`
 * if (and only if) the global is currently undefined; existing definitions
 * are never replaced.
 */
export async function register(): Promise<void> {
  if (process.env.NEXT_RUNTIME !== 'nodejs') return;

  for (const name of ['DOMMatrix', 'ImageData', 'Path2D']) {
    if (typeof Reflect.get(globalThis, name) !== 'undefined') continue;
    // Minimal stub: the computed key gives the anonymous class a matching
    // `name`; Reflect.set avoids fighting the DOM lib typings for these globals.
    Reflect.set(globalThis, name, { [name]: class {} }[name]);
  }
}
|
||||
@@ -1,66 +1,310 @@
|
||||
// server-only — never import from client components
|
||||
// This module uses pdfjs-dist legacy build in Node.js fake-worker mode (no browser worker).
|
||||
// The client components (PdfViewer.tsx, PreviewModal.tsx) set workerSrc independently.
|
||||
// Extracts blank field positions from the PDF text layer by finding underscore sequences.
|
||||
// No image rendering needed — coordinates come directly from pdfjs in PDF user-space.
|
||||
|
||||
// @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
|
||||
import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
import { createCanvas } from '@napi-rs/canvas';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
|
||||
// pdfjs-dist 5.x fake-worker mode: must point workerSrc to the actual worker file so
|
||||
// _setupFakeWorkerGlobal can `await import(this.workerSrc)` in Node.js.
|
||||
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
|
||||
GlobalWorkerOptions.workerSrc = `file://${join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')}`;
|
||||
|
||||
/** A rendered page image, ready to send to GPT-4o vision. */
|
||||
export interface PageImage {
|
||||
page: number; // 1-indexed
|
||||
width: number; // original PDF width in points (scale 1.0)
|
||||
height: number; // original PDF height in points (scale 1.0)
|
||||
base64: string; // JPEG base64 of the rendered page (no data: prefix)
|
||||
/**
 * A blank field detected from the PDF text layer.
 * Coordinates are in PDF user-space (bottom-left origin, points).
 */
export interface BlankField {
  /** 1-indexed page number the blank was found on. */
  page: number;
  /** X coordinate where the blank starts. */
  x: number;
  /** Y coordinate of the text line the blank belongs to. */
  y: number;
  /** Horizontal extent of the blank (measured or estimated, depending on the strategy). */
  width: number;
  /** Absolute value of the text item's `transform[0]`, or 10 when that is 0. */
  fontSize: number;
  /** Label text preceding the blank on the same line (at most the last 80 characters). */
  contextBefore: string;
  /** Label text following the blank on the same line (at most the first 80 characters). */
  contextAfter: string;
  /** Text of the nearest line above that contains label text (at most 100 characters), or ''. */
  contextAbove: string;
  /** Text of the nearest line below that contains label text (at most 100 characters), or ''. */
  contextBelow: string;
  /** 1-indexed position of this blank among all pure-underscore blanks on its line (Strategy 1 only). */
  rowIndex?: number;
  /** Total pure-underscore blanks on this line (Strategy 1 only). */
  rowTotal?: number;
}
|
||||
|
||||
// Legacy type alias kept for callers that still reference PageText
|
||||
export type PageText = PageImage;
|
||||
const MIN_UNDERSCORES = 2;
|
||||
|
||||
const RENDER_SCALE = 1.5; // 72dpi × 1.5 = 108dpi — good for vision without huge payloads
|
||||
// Two consecutive underscore items are considered the SAME blank when their gap is ≤ this.
|
||||
// Font-split items (same blank rendered as multiple runs) have gap ~0-2pt.
|
||||
// Column separators in signature blocks are typically 5-30pt — kept separate.
|
||||
const MAX_MERGE_GAP_PT = 3;
|
||||
|
||||
export async function extractPdfText(filePath: string): Promise<PageImage[]> {
|
||||
/** The subset of a pdfjs text-content item that this module reads. */
interface RawTextItem {
  /** Text of the item. */
  str: string;
  /** Transform array; this module reads index 0 (used as font size), 4 (x) and 5 (y). */
  transform: number[];
  /** Width of the item; added to `transform[4]` to obtain the item's right edge. */
  width: number;
}
|
||||
|
||||
/**
 * Tells whether a text item is a blank drawn purely with underscores.
 *
 * @param item Text item to inspect.
 * @returns `true` when the item's string contains at least `MIN_UNDERSCORES`
 *   underscores and, after removing underscores and spaces, at most one other
 *   character remains (so a single stray character is tolerated).
 */
function isPureUnderscoreItem(item: RawTextItem): boolean {
  const underscoreCount = item.str.match(/_/g)?.length ?? 0;
  if (underscoreCount < MIN_UNDERSCORES) return false;

  const residue = item.str.replace(/[_ ]/g, '');
  return residue.length <= 1;
}
|
||||
|
||||
/**
 * Groups text items into lines by their y coordinate (`transform[5]`).
 *
 * Items are visited from the largest y to the smallest. An item joins the
 * most recent group when its y is within `LINE_Y_TOLERANCE` of the y of the
 * item that started that group; otherwise it starts a new group. Within each
 * group the items are then ordered by x (`transform[4]`), ascending.
 *
 * @param items Text items of one page; the input array is not mutated.
 * @returns Groups ordered from largest y to smallest, each carrying the y of
 *   its first item and its items ordered left to right.
 */
function groupIntoLines(items: RawTextItem[]): Array<{ y: number; items: RawTextItem[] }> {
  // Maximum y distance from a group's anchor item for another item to share the line.
  const LINE_Y_TOLERANCE = 5;

  const byDescendingY = [...items].sort((a, b) => b.transform[5] - a.transform[5]);
  const lines: Array<{ y: number; items: RawTextItem[] }> = [];

  for (const item of byDescendingY) {
    const y = item.transform[5];
    const current = lines[lines.length - 1];
    if (current && Math.abs(current.y - y) <= LINE_Y_TOLERANCE) {
      current.items.push(item);
    } else {
      lines.push({ y, items: [item] });
    }
  }

  for (const line of lines) {
    line.items.sort((a, b) => a.transform[4] - b.transform[4]);
  }

  return lines;
}
|
||||
|
||||
/**
 * Build context string from line items, excluding pure-underscore items
 * (they're adjacent blanks, not label text) and stripping stray bracket chars.
 *
 * @param items Items of one line, in reading order.
 * @returns The remaining item texts, each trimmed, joined by single spaces;
 *   `''` when no item carries label text.
 */
function lineContext(items: RawTextItem[]): string {
  const labels: string[] = [];

  for (const { str } of items) {
    // Strip [ ] bracket chars, then surrounding whitespace.
    const cleaned = str.replace(/[\[\]]/g, '').trim();
    // Keep the item only if something other than underscores/spaces is left.
    if (cleaned.replace(/[_ ]/g, '').length > 0) labels.push(cleaned);
  }

  return labels.join(' ').trim();
}
|
||||
|
||||
/**
 * Extract all blank fields from a PDF using four detection strategies.
 *
 * Strategy 1 — pure underscore runs with geometric merging:
 *   Consecutive underscore items with a gap ≤ MAX_MERGE_GAP_PT are one blank.
 *   Large gaps (column spacing) keep blanks separate.
 *
 * Strategy 2 — embedded underscore runs:
 *   Items like "Date ___________" containing 5+ underscores mixed with label text.
 *   Position estimated by character ratio.
 *
 * Strategy 3 — single-item bracket blanks:
 *   "[   ]" (3+ internal spaces) in one text item.
 *   Width estimated from the bracket span within the item, not the full item width.
 *
 * Strategy 4 — multi-item bracket blanks:
 *   "Seller's Initials [" … spaces … "]" split across consecutive items.
 *   Width measured as the gap between the bracket items.
 *
 * Only the text layer is read; no page is rendered.
 *
 * @param filePath Path of the PDF file to read.
 * @returns The detected blanks in detection order (page by page, line by line,
 *   strategy by strategy), with overlapping detections on the same line
 *   collapsed to the first one found.
 */
export async function extractBlanks(filePath: string): Promise<BlankField[]> {
  const data = new Uint8Array(await readFile(filePath));
  const pdf = await getDocument({ data }).promise;
  const allBlanks: BlankField[] = [];

  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const page = await pdf.getPage(pageNum);
    const textContent = await page.getTextContent();
    const items = (textContent.items as RawTextItem[]).filter(i => i.str.length > 0);
    const lines = groupIntoLines(items);

    for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
      const { y: lineY, items: lineItems } = lines[lineIdx];

      // Skip lines that are purely underscores (blank rows) so that stacked signature
      // rows still find the "(Seller's Signature) (Address/Phone) (Date)" label line.
      const contextAbove = (() => {
        for (let li = lineIdx - 1; li >= 0; li--) {
          const ctx = lineContext(lines[li].items);
          if (ctx) return ctx.slice(0, 100);
        }
        return '';
      })();
      const contextBelow = (() => {
        for (let li = lineIdx + 1; li < lines.length; li++) {
          const ctx = lineContext(lines[li].items);
          if (ctx) return ctx.slice(0, 100);
        }
        return '';
      })();

      // ── Strategy 1: pure underscore runs (geometric merge) ───────────────────
      // Two-pass approach: first collect all run boundaries on this line, then emit
      // each blank with its row position (rowIndex / rowTotal) so the AI can
      // reliably identify first / middle / last blanks on multi-blank rows.
      {
        // Pass 1: collect run boundaries
        const runs: Array<{ startIdx: number; endIdx: number }> = [];
        let runStart = -1;
        let lastUnderscoreIdx = -1;

        for (let i = 0; i < lineItems.length; i++) {
          if (!isPureUnderscoreItem(lineItems[i])) continue;

          if (runStart < 0) {
            runStart = i;
            lastUnderscoreIdx = i;
          } else {
            const prev = lineItems[lastUnderscoreIdx];
            const gap = lineItems[i].transform[4] - (prev.transform[4] + prev.width);
            if (gap <= MAX_MERGE_GAP_PT) {
              lastUnderscoreIdx = i; // extend run
            } else {
              runs.push({ startIdx: runStart, endIdx: lastUnderscoreIdx });
              runStart = i;
              lastUnderscoreIdx = i;
            }
          }
        }
        if (runStart >= 0) runs.push({ startIdx: runStart, endIdx: lastUnderscoreIdx });

        // Pass 2: emit blanks with row position metadata
        const rowTotal = runs.length;
        for (let r = 0; r < rowTotal; r++) {
          const { startIdx, endIdx } = runs[r];
          const first = lineItems[startIdx];
          const last = lineItems[endIdx];
          allBlanks.push({
            page: pageNum,
            x: first.transform[4],
            y: lineY,
            width: (last.transform[4] + last.width) - first.transform[4],
            fontSize: Math.abs(first.transform[0]) || 10,
            contextBefore: lineContext(lineItems.slice(0, startIdx)).slice(-80),
            contextAfter: lineContext(lineItems.slice(endIdx + 1)).slice(0, 80),
            contextAbove,
            contextBelow,
            rowIndex: r + 1,
            rowTotal,
          });
        }
      }

      // ── Strategy 2: embedded underscore runs ─────────────────────────────────
      for (let i = 0; i < lineItems.length; i++) {
        const item = lineItems[i];
        if (isPureUnderscoreItem(item)) continue;

        const match = item.str.match(/_{5,}/);
        if (!match || match.index === undefined) continue;

        // Position and width are apportioned by character count within the item.
        const runIdx = match.index;
        const runLen = match[0].length;
        const totalLen = item.str.length;
        const xEst = item.transform[4] + (runIdx / totalLen) * item.width;
        const widthEst = Math.max((runLen / totalLen) * item.width, 30);

        const prefixLabel = item.str.slice(0, runIdx).trim();
        const suffixLabel = item.str.slice(runIdx + runLen).trim();

        allBlanks.push({
          page: pageNum,
          x: xEst,
          y: lineY,
          width: widthEst,
          fontSize: Math.abs(item.transform[0]) || 10,
          contextBefore: [...lineContext(lineItems.slice(0, i)).split(' '), prefixLabel]
            .filter(Boolean).join(' ').trim().slice(-80),
          contextAfter: [suffixLabel, lineContext(lineItems.slice(i + 1))]
            .filter(Boolean).join(' ').trim().slice(0, 80),
          contextAbove,
          contextBelow,
        });
      }

      // ── Strategy 3: single-item bracket blanks ───────────────────────────────
      // Estimate x/width from the bracket span within the item string, not item.width,
      // so a footer like "Seller's Initials [   ] Date ___" gets the correct narrow width.
      for (let i = 0; i < lineItems.length; i++) {
        const item = lineItems[i];
        const match = item.str.match(/\[(\s{3,})\]/);
        if (!match || match.index === undefined) continue;

        const bracketStart = match.index + 1; // char index of first space inside [
        const bracketEnd = match.index + match[0].length - 1; // char index of ]
        const totalLen = item.str.length;
        const xEst = item.transform[4] + (bracketStart / totalLen) * item.width;
        const widthEst = Math.max(((bracketEnd - bracketStart) / totalLen) * item.width, 20);

        const prefixLabel = item.str.slice(0, match.index).trim();
        const suffixLabel = item.str.slice(match.index + match[0].length).trim();

        allBlanks.push({
          page: pageNum,
          x: xEst,
          y: lineY,
          width: widthEst,
          fontSize: Math.abs(item.transform[0]) || 10,
          contextBefore: [...lineContext(lineItems.slice(0, i)).split(' '), prefixLabel]
            .filter(Boolean).join(' ').trim().slice(-80),
          contextAfter: [suffixLabel, lineContext(lineItems.slice(i + 1))]
            .filter(Boolean).join(' ').trim().slice(0, 80),
          contextAbove,
          contextBelow,
        });
      }

      // ── Strategy 4: multi-item bracket blanks ────────────────────────────────
      // "Seller's Initials [" … whitespace items … "]"
      {
        // Index of the item ending in "[" that is still waiting for its "]"; -1 when none.
        let openIdx = -1;

        for (let i = 0; i < lineItems.length; i++) {
          const item = lineItems[i];

          if (openIdx < 0) {
            if (item.str.trimEnd().endsWith('[')) openIdx = i;
          } else {
            const isWhitespace = item.str.trim().length === 0;
            const closesHere = item.str.trimStart().startsWith(']');

            if (closesHere) {
              const openItem = lineItems[openIdx];
              const closeItem = lineItems[i];
              const blankX = openItem.transform[4] + openItem.width;
              const blankWidth = closeItem.transform[4] - blankX;

              if (blankWidth > 5) {
                const prefixBracket = openItem.str.replace(/\[$/, '').trim();
                const suffixBracket = closeItem.str.replace(/^\]/, '').trim();
                allBlanks.push({
                  page: pageNum,
                  x: blankX,
                  y: lineY,
                  width: blankWidth,
                  fontSize: Math.abs(openItem.transform[0]) || 10,
                  contextBefore: [...lineContext(lineItems.slice(0, openIdx)).split(' '), prefixBracket]
                    .filter(Boolean).join(' ').trim().slice(-80),
                  contextAfter: [suffixBracket, lineContext(lineItems.slice(i + 1))]
                    .filter(Boolean).join(' ').trim().slice(0, 80),
                  contextAbove,
                  contextBelow,
                });
              }
              openIdx = -1;
            } else if (!isWhitespace) {
              // Any non-whitespace item between "[" and "]" cancels the pending bracket.
              openIdx = -1;
            }
          }
        }
      }
    }
  }

  // Deduplicate: Strategy 3 (single-item bracket) and Strategy 4 (multi-item bracket)
  // can both fire on the same blank. Use x-range overlap to catch them regardless of
  // how far apart their estimated x positions are.
  // The first detection wins; the check applies to every blank, whichever strategy produced it.
  const deduped: BlankField[] = [];
  for (const b of allBlanks) {
    const isDupe = deduped.some(
      d =>
        d.page === b.page &&
        Math.abs(d.y - b.y) < 4 &&
        d.x < b.x + b.width && // ranges overlap
        b.x < d.x + d.width,
    );
    if (!isDupe) deduped.push(b);
  }
  return deduped;
}
|
||||
|
||||
Reference in New Issue
Block a user