import { execSync, execFileSync } from 'node:child_process';
import { readFileSync, mkdirSync, readdirSync } from 'node:fs';
import { basename, extname, join } from 'node:path';
import { createRedactor, deploy } from '@okrapdf/edge-kit';
import type { PageInput } from '@okrapdf/edge-kit';
const PDF_PATH = './tax-form.pdf';
const OUT_DIR = './docling-output';
// ── Step 1: Parse locally with Docling ──────────────────────────
// PDF bytes never leave your machine
mkdirSync(OUT_DIR, { recursive: true });
// Arguments are passed as an array (no shell involved), so paths containing
// spaces, quotes or `$` reach the Docling CLI verbatim.
execFileSync('docling', ['--to', 'json', PDF_PATH, '--output', OUT_DIR]);
// Find the exported JSON without shelling out to `ls`: a shell glob is not
// portable and yields several newline-separated paths when OUT_DIR still
// holds JSON files from earlier runs, which cannot be read as one file.
const jsonCandidates = readdirSync(OUT_DIR)
  .filter((name) => name.endsWith('.json') && !name.startsWith('.'))
  .sort();
// Prefer the file named after the input PDF's stem (the name Docling is
// expected to use); otherwise accept the only JSON file present.
const expectedJsonName = `${basename(PDF_PATH, extname(PDF_PATH))}.json`;
const jsonName = jsonCandidates.includes(expectedJsonName)
  ? expectedJsonName
  : jsonCandidates.length === 1
    ? jsonCandidates[0]
    : undefined;
if (jsonName === undefined) {
  throw new Error(
    jsonCandidates.length === 0
      ? `Docling produced no JSON output in ${OUT_DIR}`
      : `Cannot tell which JSON file in ${OUT_DIR} belongs to ${PDF_PATH}: ${jsonCandidates.join(', ')}`,
  );
}
const jsonFile = join(OUT_DIR, jsonName);
// The parsed document is untyped; later steps read `texts[].prov[].bbox` from it.
const doc = JSON.parse(readFileSync(jsonFile, 'utf-8'));
// ── Step 2: Convert Docling JSON → PageInput[] ─────────────────
// Group text items by page, preserve bounding boxes
const pageMap = new Map<
  number,
  { texts: string[]; items: NonNullable<PageInput['items']> }
>();
// Tolerate a document that carries no `texts` array at all
for (const text of doc.texts ?? []) {
  // Only the first provenance entry decides which page the item lands on
  const prov = text.prov?.[0];
  if (!prov) continue;
  // Get-or-create the per-page accumulator
  let entry = pageMap.get(prov.page_no);
  if (!entry) {
    entry = { texts: [], items: [] };
    pageMap.set(prov.page_no, entry);
  }
  entry.texts.push(text.text);
  // Items without a bounding box still contribute to the page text above
  if (prov.bbox) {
    entry.items.push({
      text: text.text,
      bbox: {
        x: prov.bbox.l,
        // NOTE(review): `y` is passed through in Docling's own coordinate
        // origin — confirm which origin PageInput expects.
        y: prov.bbox.t,
        w: prov.bbox.r - prov.bbox.l,
        // `t - b` is positive for bottom-left-origin boxes but negative for
        // top-left-origin ones; take the magnitude so height is never negative.
        h: Math.abs(prov.bbox.t - prov.bbox.b),
      },
    });
  }
}
// Emit pages in ascending page-number order, one text line per Docling item
const pages: PageInput[] = [...pageMap.entries()]
  .sort(([a], [b]) => a - b)
  .map(([pageNum, { texts, items }]) => ({
    pageNum,
    text: texts.join('\n'),
    items,
  }));
// ── Step 3: Configure PII detection ─────────────────────────────
const pii = {
  patterns: ['SSN', 'EMAIL', 'PHONE_US', 'TAX_ID_US'],
  includeNames: true,
  includeAddresses: true,
};
// One redaction config shared by the local preview and the deployment, so
// what is audited in Step 4 is exactly what gets deployed in Step 5.
const redactConfig = {
  pii,
  publicFieldAllowlist: ['Form 1099-R', 'Instructions for Recipient'],
};
// ── Step 4: Preview redaction locally (optional) ────────────────
// Useful for auditing what will be redacted before deploying
const redact = createRedactor(redactConfig);
const result = redact(pages);
console.log(result.stats);
// { totalMatches: 8, pagesAffected: 1, byRule: { SSN: 1, PHONE_US: 3, EMAIL: 2, PERSON_NAME: 2 } }
// Inspect what each role sees
console.log(result.view('admin', 1)); // full text: "SSN: 123-45-6789"
console.log(result.view('viewer', 1)); // redacted: "SSN: [SSN_6978]"
console.log(result.view('public', 1)); // restricted: allowlisted sections only
// ── Step 5: Deploy to edge ──────────────────────────────────────
// Full text stored at edge, redaction applied at serve-time per role
// Fail with a clear message instead of handing `undefined` to deploy()
const apiKey = process.env.OKRA_API_KEY;
if (!apiKey) {
  throw new Error('OKRA_API_KEY is not set; it is required to deploy');
}
const deployed = await deploy({
  pages,
  meta: { title: 'Tax Form 1099-R', filename: '1099-r.pdf' },
  redact: redactConfig,
  apiKey,
});
console.log(deployed.urls.admin); // full text — internal use only
console.log(deployed.urls.viewer); // PII redacted — safe for external sharing
console.log(deployed.urls.public); // allowlisted sections — public embedding