fix(docker): polyfill DOMMatrix/ImageData/Path2D for pdfjs-dist in linux/amd64 container via NODE_OPTIONS --require

This commit is contained in:
Chandler Copeland
2026-04-03 18:02:39 -06:00
parent f15e538f5c
commit 2d2a43a3c9
5 changed files with 322 additions and 41 deletions

View File

@@ -0,0 +1,23 @@
/**
 * Next.js instrumentation hook — runs once before any route handler.
 *
 * Under the Node.js runtime this installs empty placeholder classes for
 * `DOMMatrix`, `ImageData` and `Path2D` on `globalThis` whenever they are not
 * already defined. pdfjs-dist needs these browser globals to exist at module
 * load time; @napi-rs/canvas normally provides them but may be absent in some
 * server environments (e.g. linux/amd64 Docker image on ARM build host).
 * Text extraction never actually calls them, so minimal stubs are enough.
 */
export async function register() {
  if (process.env.NEXT_RUNTIME !== 'nodejs') {
    return;
  }

  // Minimal stand-ins, keyed by the global name they are installed under.
  // Key order is the order in which the globals are checked.
  const fallbacks = {
    DOMMatrix: class DOMMatrix { constructor() { return this; } },
    ImageData: class ImageData { constructor() { return this; } },
    Path2D: class Path2D { constructor() { return this; } },
  };

  for (const [name, stub] of Object.entries(fallbacks)) {
    // Leave any implementation that is already present untouched.
    if (typeof Reflect.get(globalThis, name) === 'undefined') {
      Reflect.set(globalThis, name, stub);
    }
  }
}

View File

@@ -1,66 +1,310 @@
// server-only — never import from client components
// This module uses pdfjs-dist legacy build in Node.js fake-worker mode (no browser worker).
// The client components (PdfViewer.tsx, PreviewModal.tsx) set workerSrc independently.
// Extracts blank field positions from the PDF text layer by finding underscore sequences.
// No image rendering needed — coordinates come directly from pdfjs in PDF user-space.
// @ts-ignore — legacy .mjs build; types re-exported from main pdfjs-dist declaration
import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist/legacy/build/pdf.mjs';
import { createCanvas } from '@napi-rs/canvas';
import { readFile } from 'node:fs/promises';
import { join } from 'node:path';
import { pathToFileURL } from 'node:url';
// pdfjs-dist 5.x fake-worker mode: must point workerSrc to the actual worker file so
// _setupFakeWorkerGlobal can `await import(this.workerSrc)` in Node.js.
// Empty string is falsy → PDFWorker.workerSrc getter throws before the import runs.
// The URL is built with pathToFileURL rather than by concatenating `file://` with the
// raw path, so characters such as spaces, `#` and `%` are percent-encoded and
// platform-specific separators/drive letters produce a well-formed file: URL.
GlobalWorkerOptions.workerSrc = pathToFileURL(
  join(process.cwd(), 'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs'),
).href;
/** A rendered page image, ready to send to GPT-4o vision. */
export interface PageImage {
page: number; // 1-indexed
width: number; // original PDF width in points (scale 1.0)
height: number; // original PDF height in points (scale 1.0)
base64: string; // JPEG base64 of the rendered page (no data: prefix)
/**
* A blank field detected from the PDF text layer.
* Coordinates are in PDF user-space (bottom-left origin, points).
*/
export interface BlankField {
/** 1-indexed page number the blank was found on. */
page: number;
/** Left edge of the blank (measured for pure-underscore runs, estimated for embedded/bracket blanks). */
x: number;
/** y of the text line the blank belongs to (the line group's y, not a per-item value). */
y: number;
/** Horizontal extent of the blank, in the same units as x. */
width: number;
/** Font size taken from the absolute value of the text item's transform[0]; falls back to 10 when that is 0. */
fontSize: number;
/** Label text on the same line before the blank (at most the last 80 characters). */
contextBefore: string;
/** Label text on the same line after the blank (at most the first 80 characters). */
contextAfter: string;
/** Label text of the nearest preceding line that has any (at most 100 characters); '' if none. */
contextAbove: string;
/** Label text of the nearest following line that has any (at most 100 characters); '' if none. */
contextBelow: string;
/** 1-indexed position of this blank among all pure-underscore blanks on its line (Strategy 1 only). */
rowIndex?: number;
/** Total pure-underscore blanks on this line (Strategy 1 only). */
rowTotal?: number;
}
// Legacy type alias kept for callers that still reference PageText
export type PageText = PageImage;
// Minimum number of "_" characters a text item must contain before
// isPureUnderscoreItem will treat it as a blank.
const MIN_UNDERSCORES = 2;
const RENDER_SCALE = 1.5; // 72dpi × 1.5 = 108dpi — good for vision without huge payloads
// Two consecutive underscore items are considered the SAME blank when their gap is ≤ this.
// Font-split items (same blank rendered as multiple runs) have gap ~0-2pt.
// Column separators in signature blocks are typically 5-30pt — kept separate.
const MAX_MERGE_GAP_PT = 3;
export async function extractPdfText(filePath: string): Promise<PageImage[]> {
/**
* The subset of a pdfjs text-content item that this module reads
* (`textContent.items` is cast to this shape).
*/
interface RawTextItem {
/** Text of the item. */
str: string;
/** Transform array; this module reads index 4 as x, index 5 as y and |index 0| as the font size. */
transform: number[];
/** Horizontal extent of the item, added to x to obtain its right edge. */
width: number;
}
/**
 * Tell whether a text item is a blank made up (almost) entirely of underscores.
 *
 * The item qualifies when it contains at least MIN_UNDERSCORES underscore
 * characters and, once underscores and spaces are removed, at most one other
 * character is left over.
 */
function isPureUnderscoreItem(item: RawTextItem): boolean {
  const text = item.str;
  // Splitting on "_" yields one more piece than there are underscores.
  const underscoreCount = text.split('_').length - 1;
  const leftover = text.replace(/[_ ]/g, '');
  return underscoreCount >= MIN_UNDERSCORES && leftover.length <= 1;
}
/**
 * Cluster text items into lines by their y coordinate (transform[5]).
 *
 * Items are visited from the largest y to the smallest. An item joins the
 * current line when its y is within 5 units of that line's y — the y of the
 * first item placed in the line — otherwise it starts a new line. Each line's
 * items are then ordered left to right by x (transform[4]). The input array
 * is not modified.
 *
 * @param items Text items of a single page.
 * @returns Lines ordered from largest y to smallest.
 */
function groupIntoLines(items: RawTextItem[]): Array<{ y: number; items: RawTextItem[] }> {
  const byDescendingY = items.slice().sort((p, q) => q.transform[5] - p.transform[5]);
  const lines: Array<{ y: number; items: RawTextItem[] }> = [];
  let current: { y: number; items: RawTextItem[] } | undefined;

  for (let n = 0; n < byDescendingY.length; n++) {
    const entry = byDescendingY[n];
    const entryY = entry.transform[5];
    if (current !== undefined && Math.abs(current.y - entryY) <= 5) {
      current.items.push(entry);
      continue;
    }
    current = { y: entryY, items: [entry] };
    lines.push(current);
  }

  lines.forEach((line) => {
    line.items.sort((p, q) => p.transform[4] - q.transform[4]);
  });
  return lines;
}
/**
 * Produce the readable label text of a line as one space-separated string.
 *
 * Square-bracket characters are removed from every item, and items that then
 * consist only of underscores and spaces are skipped: those are neighbouring
 * blanks, not label text.
 */
function lineContext(items: RawTextItem[]): string {
  const labels: string[] = [];
  for (const { str } of items) {
    const text = str.replace(/[\[\]]/g, '').trim(); // strip [ ] bracket chars
    const hasLabelChars = text.replace(/[_ ]/g, '').length > 0;
    if (hasLabelChars) {
      labels.push(text);
    }
  }
  return labels.join(' ').trim();
}
/**
* Extract all blank fields from a PDF using four detection strategies.
*
* Strategy 1 — pure underscore runs with geometric merging:
* Consecutive underscore items with a gap ≤ MAX_MERGE_GAP_PT are one blank.
* Large gaps (column spacing) keep blanks separate.
*
* Strategy 2 — embedded underscore runs:
* Items like "Date ___________" containing 5+ underscores mixed with label text.
* Position estimated by character ratio.
*
* Strategy 3 — single-item bracket blanks:
* "[ ]" (3+ internal spaces) in one text item.
* Width estimated from the bracket span within the item, not the full item width.
*
* Strategy 4 — multi-item bracket blanks:
* "Seller's Initials [" … spaces … "]" split across consecutive items.
* Width measured as the gap between the bracket items.
*
* Each page's text items are grouped into lines with groupIntoLines, and every
* strategy runs once per line. The final section drops blanks whose x-range
* overlaps an already-kept blank on the same page and line.
*
* @param filePath Path of the PDF file; it is read from disk with readFile.
* @returns Declared as the list of detected blanks.
* NOTE(review): as written, the body reaches `return pages;` before the
* de-duplication section — confirm which return is intended.
*/
export async function extractBlanks(filePath: string): Promise<BlankField[]> {
const data = new Uint8Array(await readFile(filePath));
const pdf = await getDocument({ data }).promise;
// NOTE(review): `pages` is only written by the page-image code further down
// (canvas render, PAGE label stamp, JPEG encode). That code looks like a leftover
// of an image-rendering implementation mixed into the blank detection — confirm
// whether it still belongs in this function.
const pages: PageImage[] = [];
const allBlanks: BlankField[] = [];
for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
const page = await pdf.getPage(pageNum);
const viewport = page.getViewport({ scale: RENDER_SCALE });
const textContent = await page.getTextContent();
// Ignore empty-string items, then cluster the rest into lines.
const items = (textContent.items as RawTextItem[]).filter(i => i.str.length > 0);
const lines = groupIntoLines(items);
// Create an @napi-rs/canvas and render the PDF page into it
const canvas = createCanvas(Math.round(viewport.width), Math.round(viewport.height));
const ctx = canvas.getContext('2d');
for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
const { y: lineY, items: lineItems } = lines[lineIdx];
// NOTE(review): this render call sits inside the per-line loop, so it runs once
// for every text line of the page — confirm.
// @ts-ignore — @napi-rs/canvas context is compatible at runtime but types diverge
await page.render({ canvasContext: ctx, viewport }).promise;
// Skip lines that are purely underscores (blank rows) so that stacked signature
// rows still find the "(Seller's Signature) (Address/Phone) (Date)" label line.
// (`ctx` inside these two helpers is a local context string that shadows the
// canvas context declared above.)
const contextAbove = (() => {
for (let li = lineIdx - 1; li >= 0; li--) {
const ctx = lineContext(lines[li].items);
if (ctx) return ctx.slice(0, 100);
}
return '';
})();
// Same search in the other direction: nearest following line with label text.
const contextBelow = (() => {
for (let li = lineIdx + 1; li < lines.length; li++) {
const ctx = lineContext(lines[li].items);
if (ctx) return ctx.slice(0, 100);
}
return '';
})();
// Stamp a visible page number in the top-left corner so GPT-4o can correlate
// each image to the correct page number when multiple images are sent in one prompt.
const label = `PAGE ${pageNum}`;
const fontSize = Math.round(viewport.height * 0.025);
ctx.fillStyle = 'rgba(220,30,30,0.85)';
ctx.fillRect(0, 0, fontSize * (label.length * 0.65), fontSize * 1.5);
ctx.fillStyle = '#ffffff';
ctx.font = `bold ${fontSize}px sans-serif`;
ctx.fillText(label, 4, fontSize * 1.15);
// ── Strategy 1: pure underscore runs (geometric merge) ───────────────────
// Two-pass approach: first collect all run boundaries on this line, then emit
// each blank with its row position (rowIndex / rowTotal) so the AI can
// reliably identify first / middle / last blanks on multi-blank rows.
{
// Pass 1: collect run boundaries
const runs: Array<{ startIdx: number; endIdx: number }> = [];
let runStart = -1;
let lastUnderscoreIdx = -1;
const jpegBuffer = canvas.toBuffer('image/jpeg' as never, 85);
for (let i = 0; i < lineItems.length; i++) {
if (!isPureUnderscoreItem(lineItems[i])) continue;
// NOTE(review): this push executes for every pure-underscore item on the line,
// not once per page — confirm.
pages.push({
page: pageNum,
width: page.getViewport({ scale: 1.0 }).width,
height: page.getViewport({ scale: 1.0 }).height,
base64: jpegBuffer.toString('base64'),
});
if (runStart < 0) {
runStart = i;
lastUnderscoreIdx = i;
} else {
// Gap between the previous underscore item's right edge and this item's left edge.
const prev = lineItems[lastUnderscoreIdx];
const gap = lineItems[i].transform[4] - (prev.transform[4] + prev.width);
if (gap <= MAX_MERGE_GAP_PT) {
lastUnderscoreIdx = i; // extend run
} else {
runs.push({ startIdx: runStart, endIdx: lastUnderscoreIdx });
runStart = i;
lastUnderscoreIdx = i;
}
}
}
// Close the run that was still open when the line ended.
if (runStart >= 0) runs.push({ startIdx: runStart, endIdx: lastUnderscoreIdx });
// Pass 2: emit blanks with row position metadata
const rowTotal = runs.length;
for (let r = 0; r < rowTotal; r++) {
const { startIdx, endIdx } = runs[r];
const first = lineItems[startIdx];
const last = lineItems[endIdx];
allBlanks.push({
page: pageNum,
x: first.transform[4],
y: lineY,
// From the first item's left edge to the last item's right edge.
width: (last.transform[4] + last.width) - first.transform[4],
// 10 is the fallback when transform[0] is 0 (or NaN).
fontSize: Math.abs(first.transform[0]) || 10,
contextBefore: lineContext(lineItems.slice(0, startIdx)).slice(-80),
contextAfter: lineContext(lineItems.slice(endIdx + 1)).slice(0, 80),
contextAbove,
contextBelow,
rowIndex: r + 1,
rowTotal,
});
}
}
// ── Strategy 2: embedded underscore runs ─────────────────────────────────
for (let i = 0; i < lineItems.length; i++) {
const item = lineItems[i];
// Pure-underscore items were already handled by Strategy 1.
if (isPureUnderscoreItem(item)) continue;
// Only the first run of 5+ underscores in the item is considered.
const match = item.str.match(/_{5,}/);
if (!match || match.index === undefined) continue;
const runIdx = match.index;
const runLen = match[0].length;
const totalLen = item.str.length;
// Position and width are proportional to character offsets within the item;
// the width is never reported below 30.
const xEst = item.transform[4] + (runIdx / totalLen) * item.width;
const widthEst = Math.max((runLen / totalLen) * item.width, 30);
const prefixLabel = item.str.slice(0, runIdx).trim();
const suffixLabel = item.str.slice(runIdx + runLen).trim();
allBlanks.push({
page: pageNum,
x: xEst,
y: lineY,
width: widthEst,
fontSize: Math.abs(item.transform[0]) || 10,
contextBefore: [...lineContext(lineItems.slice(0, i)).split(' '), prefixLabel]
.filter(Boolean).join(' ').trim().slice(-80),
contextAfter: [suffixLabel, lineContext(lineItems.slice(i + 1))]
.filter(Boolean).join(' ').trim().slice(0, 80),
contextAbove,
contextBelow,
});
}
// ── Strategy 3: single-item bracket blanks ───────────────────────────────
// Estimate x/width from the bracket span within the item string, not item.width,
// so a footer like "Seller's Initials [ ] Date ___" gets the correct narrow width.
for (let i = 0; i < lineItems.length; i++) {
const item = lineItems[i];
const match = item.str.match(/\[(\s{3,})\]/);
if (!match || match.index === undefined) continue;
const bracketStart = match.index + 1; // char index of first space inside [
const bracketEnd = match.index + match[0].length - 1; // char index of ]
const totalLen = item.str.length;
const xEst = item.transform[4] + (bracketStart / totalLen) * item.width;
// The estimated width is never reported below 20.
const widthEst = Math.max(((bracketEnd - bracketStart) / totalLen) * item.width, 20);
const prefixLabel = item.str.slice(0, match.index).trim();
const suffixLabel = item.str.slice(match.index + match[0].length).trim();
allBlanks.push({
page: pageNum,
x: xEst,
y: lineY,
width: widthEst,
fontSize: Math.abs(item.transform[0]) || 10,
contextBefore: [...lineContext(lineItems.slice(0, i)).split(' '), prefixLabel]
.filter(Boolean).join(' ').trim().slice(-80),
contextAfter: [suffixLabel, lineContext(lineItems.slice(i + 1))]
.filter(Boolean).join(' ').trim().slice(0, 80),
contextAbove,
contextBelow,
});
}
// ── Strategy 4: multi-item bracket blanks ────────────────────────────────
// "Seller's Initials [" … whitespace items … "]"
{
// Index of the item that ended with "[" and is still waiting for its "]"; -1 when none.
let openIdx = -1;
for (let i = 0; i < lineItems.length; i++) {
const item = lineItems[i];
if (openIdx < 0) {
if (item.str.trimEnd().endsWith('[')) openIdx = i;
} else {
const isWhitespace = item.str.trim().length === 0;
const closesHere = item.str.trimStart().startsWith(']');
if (closesHere) {
const openItem = lineItems[openIdx];
const closeItem = lineItems[i];
// The blank spans from the right edge of the "[" item to the left edge of the "]" item.
const blankX = openItem.transform[4] + openItem.width;
const blankWidth = closeItem.transform[4] - blankX;
// Gaps of 5 or less are ignored.
if (blankWidth > 5) {
const prefixBracket = openItem.str.replace(/\[$/, '').trim();
const suffixBracket = closeItem.str.replace(/^\]/, '').trim();
allBlanks.push({
page: pageNum,
x: blankX,
y: lineY,
width: blankWidth,
fontSize: Math.abs(openItem.transform[0]) || 10,
contextBefore: [...lineContext(lineItems.slice(0, openIdx)).split(' '), prefixBracket]
.filter(Boolean).join(' ').trim().slice(-80),
contextAfter: [suffixBracket, lineContext(lineItems.slice(i + 1))]
.filter(Boolean).join(' ').trim().slice(0, 80),
contextAbove,
contextBelow,
});
}
openIdx = -1;
} else if (!isWhitespace) {
// Any non-whitespace item between "[" and "]" cancels the pending bracket.
openIdx = -1;
}
}
}
}
}
}
// NOTE(review): this return makes the de-duplication below unreachable and hands
// back the PageImage list although the signature declares BlankField[] — confirm
// whether this line should exist.
return pages;
// Deduplicate: Strategy 3 (single-item bracket) and Strategy 4 (multi-item bracket)
// can both fire on the same blank. Use x-range overlap to catch them regardless of
// how far apart their estimated x positions are.
// The first blank encountered wins; later ones on the same page, within 4 units of
// y and with an overlapping x-range, are dropped.
const deduped: BlankField[] = [];
for (const b of allBlanks) {
const isDupe = deduped.some(
d =>
d.page === b.page &&
Math.abs(d.y - b.y) < 4 &&
d.x < b.x + b.width && // ranges overlap
b.x < d.x + d.width,
);
if (!isDupe) deduped.push(b);
}
return deduped;
}