From 34ab1d23fa8fc4016c4ec3cc2bea10e536c86bb2 Mon Sep 17 00:00:00 2001 From: Theodore Li Date: Fri, 19 Jun 2026 18:09:13 -0700 Subject: [PATCH 1/9] fix(logs): run PII redaction over HTTP and fix Presidio provisioning - resolve the guardrails venv via candidate paths and fail fast instead of silently falling back to system python3 (the misleading "Presidio not installed" that broke redaction and the guardrails block in deployed runtimes) - install the en_core_web_lg spaCy model in setup.sh and app.Dockerfile - route log redaction through an internal /api/guardrails/mask-batch endpoint so Presidio always runs in the app container, including async executions that persist inside the trigger.dev runtime --- .../api/guardrails/mask-batch/route.test.ts | 64 +++++++++++++++++++ .../app/api/guardrails/mask-batch/route.ts | 31 +++++++++ apps/sim/lib/api/contracts/hotspots.ts | 28 ++++++++ apps/sim/lib/guardrails/mask-client.ts | 44 +++++++++++++ apps/sim/lib/guardrails/setup.sh | 5 ++ apps/sim/lib/guardrails/validate_pii.ts | 41 ++++++++---- .../lib/logs/execution/pii-redaction.test.ts | 4 +- apps/sim/lib/logs/execution/pii-redaction.ts | 9 ++- docker/app.Dockerfile | 5 +- scripts/check-api-validation-contracts.ts | 4 +- 10 files changed, 213 insertions(+), 22 deletions(-) create mode 100644 apps/sim/app/api/guardrails/mask-batch/route.test.ts create mode 100644 apps/sim/app/api/guardrails/mask-batch/route.ts create mode 100644 apps/sim/lib/guardrails/mask-client.ts diff --git a/apps/sim/app/api/guardrails/mask-batch/route.test.ts b/apps/sim/app/api/guardrails/mask-batch/route.test.ts new file mode 100644 index 00000000000..cbb5b12265f --- /dev/null +++ b/apps/sim/app/api/guardrails/mask-batch/route.test.ts @@ -0,0 +1,64 @@ +/** + * @vitest-environment node + */ +import { createMockRequest } from '@sim/testing' +import { beforeEach, describe, expect, it, vi } from 'vitest' + +const { mockCheckInternalAuth, mockMaskPIIBatch } = vi.hoisted(() => ({ + mockCheckInternalAuth: vi.fn(), + mockMaskPIIBatch: vi.fn(), +})) + +vi.mock('@/lib/auth/hybrid', () => ({ + checkInternalAuth: mockCheckInternalAuth, +})) + +vi.mock('@/lib/guardrails/validate_pii', () => ({ + maskPIIBatch: mockMaskPIIBatch, +})) + +import { POST } from '@/app/api/guardrails/mask-batch/route' + +describe('POST /api/guardrails/mask-batch', () => { + beforeEach(() => { + vi.clearAllMocks() + mockCheckInternalAuth.mockResolvedValue({ success: true }) + mockMaskPIIBatch.mockImplementation(async (texts: string[]) => texts.map((t) => `M(${t})`)) + }) + + it('returns 401 without internal auth', async () => { + mockCheckInternalAuth.mockResolvedValue({ + success: false, + error: 'Internal authentication required', + }) + + const res = await POST( + createMockRequest('POST', { texts: ['a@b.com'], entityTypes: ['EMAIL_ADDRESS'] }) + ) + + expect(res.status).toBe(401) + expect(mockMaskPIIBatch).not.toHaveBeenCalled() + }) + + it('masks the batch in-process and preserves order', async () => { + const res = await POST( + createMockRequest('POST', { + texts: ['a@b.com', 'hello'], + entityTypes: ['EMAIL_ADDRESS'], + language: 'en', + }) + ) + + expect(res.status).toBe(200) + const json = await res.json() + expect(json.masked).toEqual(['M(a@b.com)', 'M(hello)']) + expect(mockMaskPIIBatch).toHaveBeenCalledWith(['a@b.com', 'hello'], ['EMAIL_ADDRESS'], 'en') + }) + + it('rejects an invalid body with 400', async () => { + const res = await POST(createMockRequest('POST', { texts: 'not-an-array', entityTypes: [] })) + + expect(res.status).toBe(400) + expect(mockMaskPIIBatch).not.toHaveBeenCalled() + }) +}) diff --git a/apps/sim/app/api/guardrails/mask-batch/route.ts b/apps/sim/app/api/guardrails/mask-batch/route.ts new file mode 100644 index 00000000000..490c2056bc3 --- /dev/null +++ b/apps/sim/app/api/guardrails/mask-batch/route.ts @@ -0,0 +1,31 @@ +import { createLogger } from '@sim/logger' +import { type NextRequest, NextResponse } from 'next/server' +import { guardrailsMaskBatchContract } from '@/lib/api/contracts' +import { parseRequest } from '@/lib/api/server' +import { checkInternalAuth } from '@/lib/auth/hybrid' +import { withRouteHandler } from '@/lib/core/utils/with-route-handler' +import { maskPIIBatch } from '@/lib/guardrails/validate_pii' + +const logger = createLogger('GuardrailsMaskBatchAPI') + +/** + * Internal batch PII masking. The log-redaction persist path runs in both the + * Next.js server and the trigger.dev runtime, but Presidio (Python venv) lives + * only in the app container — so redaction calls this endpoint server-to-server + * (internal JWT) to keep Presidio centralized here. + */ +export const POST = withRouteHandler(async (request: NextRequest) => { + const auth = await checkInternalAuth(request, { requireWorkflowId: false }) + if (!auth.success) { + return NextResponse.json({ error: 'Unauthorized' }, { status: 401 }) + } + + const parsed = await parseRequest(guardrailsMaskBatchContract, request, {}) + if (!parsed.success) return parsed.response + + const { texts, entityTypes, language } = parsed.data.body + + const masked = await maskPIIBatch(texts, entityTypes, language) + logger.info('Masked PII batch', { count: texts.length }) + return NextResponse.json({ masked }) +}) diff --git a/apps/sim/lib/api/contracts/hotspots.ts b/apps/sim/lib/api/contracts/hotspots.ts index 6c280898c39..897c99fad56 100644 --- a/apps/sim/lib/api/contracts/hotspots.ts +++ b/apps/sim/lib/api/contracts/hotspots.ts @@ -45,6 +45,34 @@ export const guardrailsValidateContract = defineRouteContract({ }, }) +const guardrailsMaskBatchBodySchema = z.object({ + texts: z.array(z.string()).max(100_000), + entityTypes: z.array(z.string().min(1, 'Entity type cannot be empty')).max(200), + language: z.string().min(1).max(20).optional(), +}) + +const guardrailsMaskBatchResponseSchema = z.object({ + masked: z.array(z.string()), +}) + +/** + * Internal batch PII masking. Called server-to-server (internal JWT) from the + * log-redaction persist path so Presidio always runs in the app container, + * including for async executions that persist inside the trigger.dev runtime. + */ +export const guardrailsMaskBatchContract = defineRouteContract({ + method: 'POST', + path: '/api/guardrails/mask-batch', + body: guardrailsMaskBatchBodySchema, + response: { + mode: 'json', + schema: guardrailsMaskBatchResponseSchema, + }, +}) + +export type GuardrailsMaskBatchBody = z.input +export type GuardrailsMaskBatchResult = z.output + const chatMessageSchema = z.object({ role: z.enum(['user', 'assistant', 'system']), content: z.string(), diff --git a/apps/sim/lib/guardrails/mask-client.ts b/apps/sim/lib/guardrails/mask-client.ts new file mode 100644 index 00000000000..88bd490d94e --- /dev/null +++ b/apps/sim/lib/guardrails/mask-client.ts @@ -0,0 +1,44 @@ +import type { GuardrailsMaskBatchResult } from '@/lib/api/contracts' +import { generateInternalToken } from '@/lib/auth/internal' +import { getInternalApiBaseUrl } from '@/lib/core/utils/urls' + +/** + * Mask PII across many strings via the internal app-container endpoint. + * + * Presidio (a Python venv) only exists in the app container, but the + * log-redaction persist path also runs inside the trigger.dev runtime — so + * redaction always routes through HTTP, the same way the guardrails tool does. + * Order is preserved: the returned array matches `texts` length. + * + * Rejects on any non-2xx or shape mismatch so the caller can apply its own + * fail-safe (scrubbing rather than leaking). + */ +export async function maskPIIBatchViaHttp( + texts: string[], + entityTypes: string[], + language?: string +): Promise { + const token = await generateInternalToken() + const url = `${getInternalApiBaseUrl()}/api/guardrails/mask-batch` + + // boundary-raw-fetch: internal server-to-server call to the app container (internal JWT auth, configurable base URL) + const response = await fetch(url, { + method: 'POST', + headers: { + 'content-type': 'application/json', + authorization: `Bearer ${token}`, + }, + body: JSON.stringify({ texts, entityTypes, language }), + }) + + if (!response.ok) { + const detail = await response.text().catch(() => '') + throw new Error(`PII mask-batch request failed (${response.status}): ${detail.slice(0, 200)}`) + } + + const data = (await response.json()) as GuardrailsMaskBatchResult + if (!Array.isArray(data.masked) || data.masked.length !== texts.length) { + throw new Error('PII mask-batch returned an unexpected result') + } + return data.masked +} diff --git a/apps/sim/lib/guardrails/setup.sh b/apps/sim/lib/guardrails/setup.sh index 233e9a51a27..20eba4247ee 100755 --- a/apps/sim/lib/guardrails/setup.sh +++ b/apps/sim/lib/guardrails/setup.sh @@ -30,6 +30,11 @@ source "$VENV_DIR/bin/activate" pip install --upgrade pip pip install -r "$SCRIPT_DIR/requirements.txt" +# Presidio's default AnalyzerEngine loads the en_core_web_lg spaCy model; it is +# not a pip dependency, so download the version compatible with the installed spaCy. +echo "Downloading spaCy model (en_core_web_lg)..." +python -m spacy download en_core_web_lg + echo "" echo "✅ Setup complete! Guardrails validators are ready to use." echo "" diff --git a/apps/sim/lib/guardrails/validate_pii.ts b/apps/sim/lib/guardrails/validate_pii.ts index ba6886bb92d..3e1ec90edb3 100644 --- a/apps/sim/lib/guardrails/validate_pii.ts +++ b/apps/sim/lib/guardrails/validate_pii.ts @@ -13,6 +13,33 @@ const DEFAULT_TIMEOUT = 30000 // 30 seconds */ const PII_CHUNK_MAX_BYTES = 256 * 1024 +/** + * Resolve the guardrails Presidio interpreter + script path. + * + * `process.cwd()` is not stable across runtimes — the Next standalone container + * launches from the monorepo root while local dev and some paths run from + * `apps/sim` — so probe both layouts (mirrors the candidate-path resolution in + * `lib/execution/isolated-vm.ts`). Requires the bundled venv: throws if it is + * absent rather than silently falling back to the system `python3`, which has no + * Presidio and reports a misleading "not installed". + */ +function resolveGuardrailsPython(): { pythonCmd: string; scriptPath: string } { + const candidateDirs = [ + path.join(process.cwd(), 'apps', 'sim', 'lib', 'guardrails'), + path.join(process.cwd(), 'lib', 'guardrails'), + ] + for (const dir of candidateDirs) { + const venvPython = path.join(dir, 'venv', 'bin', 'python3') + if (fs.existsSync(venvPython)) { + return { pythonCmd: venvPython, scriptPath: path.join(dir, 'validate_pii.py') } + } + } + const probed = candidateDirs.map((d) => path.join(d, 'venv', 'bin', 'python3')).join(', ') + throw new Error( + `Guardrails Presidio venv not found (looked in ${probed}). Provision it with apps/sim/lib/guardrails/setup.sh locally, or verify the image build installs it.` + ) +} + export interface PIIValidationInput { text: string entityTypes: string[] // e.g., ["PERSON", "EMAIL_ADDRESS", "CREDIT_CARD"] @@ -136,10 +163,7 @@ export async function maskPIIBatch( */ function runPythonScript(payload: Record): Promise { return new Promise((resolve, reject) => { - const guardrailsDir = path.join(process.cwd(), 'lib/guardrails') - const scriptPath = path.join(guardrailsDir, 'validate_pii.py') - const venvPython = path.join(guardrailsDir, 'venv/bin/python3') - const pythonCmd = fs.existsSync(venvPython) ? venvPython : 'python3' + const { pythonCmd, scriptPath } = resolveGuardrailsPython() const python = spawn(pythonCmd, [scriptPath]) let stdout = '' @@ -208,14 +232,7 @@ async function executePythonPIIDetection( requestId: string ): Promise { return new Promise((resolve, reject) => { - // Use path relative to project root - // In Next.js, process.cwd() returns the project root - const guardrailsDir = path.join(process.cwd(), 'lib/guardrails') - const scriptPath = path.join(guardrailsDir, 'validate_pii.py') - const venvPython = path.join(guardrailsDir, 'venv/bin/python3') - - // Use venv Python if it exists, otherwise fall back to system python3 - const pythonCmd = fs.existsSync(venvPython) ? venvPython : 'python3' + const { pythonCmd, scriptPath } = resolveGuardrailsPython() const python = spawn(pythonCmd, [scriptPath]) diff --git a/apps/sim/lib/logs/execution/pii-redaction.test.ts b/apps/sim/lib/logs/execution/pii-redaction.test.ts index dccbc59cc38..5a2da7a5996 100644 --- a/apps/sim/lib/logs/execution/pii-redaction.test.ts +++ b/apps/sim/lib/logs/execution/pii-redaction.test.ts @@ -7,8 +7,8 @@ const { mockMaskPIIBatch } = vi.hoisted(() => ({ mockMaskPIIBatch: vi.fn(), })) -vi.mock('@/lib/guardrails/validate_pii', () => ({ - maskPIIBatch: mockMaskPIIBatch, +vi.mock('@/lib/guardrails/mask-client', () => ({ + maskPIIBatchViaHttp: mockMaskPIIBatch, })) import { REDACTION_FAILED_MARKER, redactPIIFromExecution } from '@/lib/logs/execution/pii-redaction' diff --git a/apps/sim/lib/logs/execution/pii-redaction.ts b/apps/sim/lib/logs/execution/pii-redaction.ts index 7b4794fd483..8cd0fac5326 100644 --- a/apps/sim/lib/logs/execution/pii-redaction.ts +++ b/apps/sim/lib/logs/execution/pii-redaction.ts @@ -1,5 +1,6 @@ import { createLogger } from '@sim/logger' import { getErrorMessage } from '@sim/utils/errors' +import { maskPIIBatchViaHttp } from '@/lib/guardrails/mask-client' const logger = createLogger('PiiRedaction') @@ -158,11 +159,9 @@ export async function redactPIIFromExecution( masked = collected.map(() => REDACTION_FAILED_MARKER) } else { try { - // Lazy import keeps the Python-spawning guardrails module (child_process + - // a `lib/guardrails` dir reference) out of the static middleware/RSC graph; - // it's only loaded at runtime on the Node log-persist path. - const { maskPIIBatch } = await import('@/lib/guardrails/validate_pii') - masked = await maskPIIBatch(collected, entityTypes, language) + // Presidio runs only in the app container; the persist path also runs in + // the trigger.dev runtime, so masking always goes over HTTP to the app. + masked = await maskPIIBatchViaHttp(collected, entityTypes, language) } catch (error) { logger.error('PII masking failed; scrubbing text to avoid leaking PII', { error: getErrorMessage(error), diff --git a/docker/app.Dockerfile b/docker/app.Dockerfile index 67eb5f02c77..2323ebb1df4 100644 --- a/docker/app.Dockerfile +++ b/docker/app.Dockerfile @@ -118,11 +118,14 @@ COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/execution/sandbox/bu COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/guardrails/requirements.txt ./apps/sim/lib/guardrails/requirements.txt COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/guardrails/validate_pii.py ./apps/sim/lib/guardrails/validate_pii.py -# Install Python dependencies with pip cache mount for faster rebuilds +# Install Python dependencies with pip cache mount for faster rebuilds. +# Presidio's default AnalyzerEngine loads en_core_web_lg, which is not a pip +# dependency — download the spaCy model into the venv after installing Presidio. RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m venv ./apps/sim/lib/guardrails/venv && \ ./apps/sim/lib/guardrails/venv/bin/pip install --upgrade pip && \ ./apps/sim/lib/guardrails/venv/bin/pip install -r ./apps/sim/lib/guardrails/requirements.txt && \ + ./apps/sim/lib/guardrails/venv/bin/python -m spacy download en_core_web_lg && \ chown -R nextjs:nodejs /app/apps/sim/lib/guardrails # Create .next/cache directory with correct ownership diff --git a/scripts/check-api-validation-contracts.ts b/scripts/check-api-validation-contracts.ts index 09744c629ba..17f0a25fa29 100644 --- a/scripts/check-api-validation-contracts.ts +++ b/scripts/check-api-validation-contracts.ts @@ -9,8 +9,8 @@ const QUERY_HOOKS_DIR = path.join(ROOT, 'apps/sim/hooks/queries') const SELECTOR_HOOKS_DIR = path.join(ROOT, 'apps/sim/hooks/selectors') const BASELINE = { - totalRoutes: 859, - zodRoutes: 859, + totalRoutes: 860, + zodRoutes: 860, nonZodRoutes: 0, } as const From d68546a494e8a753a152fac7c1fa4666f9d835c6 Mon Sep 17 00:00:00 2001 From: Theodore Li Date: Fri, 19 Jun 2026 18:31:21 -0700 Subject: [PATCH 2/9] fix(guardrails): chunk + time-bound internal PII mask requests - chunk maskPIIBatchViaHttp by count (2000) and bytes (256KB) so large executions split across requests and never hit the contract's 100k cap - add AbortSignal.timeout(45s) per request so a slow/unreachable app container aborts and the caller scrubs, instead of hanging the trigger.dev job - catch maskPIIBatch failures in the route: log and return a structured 500 (broken venv fails loudly server-side; caller still scrubs, no leak) - add mask-client tests (order across chunks, count split, non-2xx, empty) --- .../app/api/guardrails/mask-batch/route.ts | 20 +++++- apps/sim/lib/guardrails/mask-client.test.ts | 68 +++++++++++++++++++ apps/sim/lib/guardrails/mask-client.ts | 61 +++++++++++++++-- 3 files changed, 142 insertions(+), 7 deletions(-) create mode 100644 apps/sim/lib/guardrails/mask-client.test.ts diff --git a/apps/sim/app/api/guardrails/mask-batch/route.ts b/apps/sim/app/api/guardrails/mask-batch/route.ts index 490c2056bc3..43979c611c9 100644 --- a/apps/sim/app/api/guardrails/mask-batch/route.ts +++ b/apps/sim/app/api/guardrails/mask-batch/route.ts @@ -1,4 +1,5 @@ import { createLogger } from '@sim/logger' +import { getErrorMessage } from '@sim/utils/errors' import { type NextRequest, NextResponse } from 'next/server' import { guardrailsMaskBatchContract } from '@/lib/api/contracts' import { parseRequest } from '@/lib/api/server' @@ -25,7 +26,20 @@ export const POST = withRouteHandler(async (request: NextRequest) => { const { texts, entityTypes, language } = parsed.data.body - const masked = await maskPIIBatch(texts, entityTypes, language) - logger.info('Masked PII batch', { count: texts.length }) - return NextResponse.json({ masked }) + try { + const masked = await maskPIIBatch(texts, entityTypes, language) + logger.info('Masked PII batch', { count: texts.length }) + return NextResponse.json({ masked }) + } catch (error) { + // A broken/absent venv makes maskPIIBatch throw; fail loudly here (the + // caller scrubs to REDACTION_FAILED, so PII is never leaked). + logger.error('PII batch masking failed', { + error: getErrorMessage(error), + count: texts.length, + }) + return NextResponse.json( + { error: getErrorMessage(error, 'PII masking failed') }, + { status: 500 } + ) + } }) diff --git a/apps/sim/lib/guardrails/mask-client.test.ts b/apps/sim/lib/guardrails/mask-client.test.ts new file mode 100644 index 00000000000..d1c4ad5b843 --- /dev/null +++ b/apps/sim/lib/guardrails/mask-client.test.ts @@ -0,0 +1,68 @@ +/** + * @vitest-environment node + */ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' + +const { mockToken, mockBaseUrl } = vi.hoisted(() => ({ + mockToken: vi.fn(), + mockBaseUrl: vi.fn(), +})) + +vi.mock('@/lib/auth/internal', () => ({ generateInternalToken: mockToken })) +vi.mock('@/lib/core/utils/urls', () => ({ getInternalApiBaseUrl: mockBaseUrl })) + +import { maskPIIBatchViaHttp } from '@/lib/guardrails/mask-client' + +describe('maskPIIBatchViaHttp', () => { + let fetchMock: ReturnType + + beforeEach(() => { + vi.clearAllMocks() + mockToken.mockResolvedValue('tok') + mockBaseUrl.mockReturnValue('http://app.internal:3000') + fetchMock = vi.fn(async (_url: string, init: { body: string }) => { + const { texts } = JSON.parse(init.body) as { texts: string[] } + return new Response(JSON.stringify({ masked: texts.map((t) => `M(${t})`) }), { + status: 200, + headers: { 'content-type': 'application/json' }, + }) + }) + vi.stubGlobal('fetch', fetchMock) + }) + + afterEach(() => { + vi.unstubAllGlobals() + }) + + it('masks a small batch in a single request, with an abort timeout', async () => { + const out = await maskPIIBatchViaHttp(['a', 'b', 'c'], ['EMAIL_ADDRESS']) + + expect(out).toEqual(['M(a)', 'M(b)', 'M(c)']) + expect(fetchMock).toHaveBeenCalledTimes(1) + expect(fetchMock.mock.calls[0][1].signal).toBeInstanceOf(AbortSignal) + }) + + it('splits by count into multiple requests, preserving global order', async () => { + const texts = Array.from({ length: 5000 }, (_, i) => `t${i}`) + + const out = await maskPIIBatchViaHttp(texts, []) + + expect(out).toHaveLength(5000) + expect(out[0]).toBe('M(t0)') + expect(out[4999]).toBe('M(t4999)') + expect(fetchMock).toHaveBeenCalledTimes(3) // 2000-per-request cap + }) + + it('throws on a non-2xx response so the caller can scrub', async () => { + fetchMock.mockResolvedValueOnce(new Response('boom', { status: 500 })) + + await expect(maskPIIBatchViaHttp(['a'], [])).rejects.toThrow(/mask-batch request failed/) + }) + + it('returns [] without any request for empty input', async () => { + const out = await maskPIIBatchViaHttp([], []) + + expect(out).toEqual([]) + expect(fetchMock).not.toHaveBeenCalled() + }) +}) diff --git a/apps/sim/lib/guardrails/mask-client.ts b/apps/sim/lib/guardrails/mask-client.ts index 88bd490d94e..4b24b497ab3 100644 --- a/apps/sim/lib/guardrails/mask-client.ts +++ b/apps/sim/lib/guardrails/mask-client.ts @@ -2,25 +2,77 @@ import type { GuardrailsMaskBatchResult } from '@/lib/api/contracts' import { generateInternalToken } from '@/lib/auth/internal' import { getInternalApiBaseUrl } from '@/lib/core/utils/urls' +/** + * Per-request limits. A chunk is flushed when it hits either bound, keeping each + * request small enough for one short Presidio pass under a tight timeout and far + * below the contract's 100k-entry cap — so large executions split across + * requests instead of failing validation. + */ +const REQUEST_MAX_BYTES = 256 * 1024 +const REQUEST_MAX_COUNT = 2_000 +/** Slightly above the 30s Python subprocess timeout so a hung app container aborts gracefully. */ +const REQUEST_TIMEOUT_MS = 45_000 + /** * Mask PII across many strings via the internal app-container endpoint. * * Presidio (a Python venv) only exists in the app container, but the * log-redaction persist path also runs inside the trigger.dev runtime — so * redaction always routes through HTTP, the same way the guardrails tool does. - * Order is preserved: the returned array matches `texts` length. + * Strings are grouped into byte/count-budgeted chunks; order is preserved, so + * the returned array matches `texts` length. * - * Rejects on any non-2xx or shape mismatch so the caller can apply its own - * fail-safe (scrubbing rather than leaking). + * Rejects on any non-2xx, timeout, or shape mismatch so the caller can apply + * its own fail-safe (scrubbing rather than leaking). */ export async function maskPIIBatchViaHttp( texts: string[], entityTypes: string[], language?: string ): Promise { + if (texts.length === 0) return [] + const token = await generateInternalToken() const url = `${getInternalApiBaseUrl()}/api/guardrails/mask-batch` + const masked: string[] = [] + let batch: string[] = [] + let batchBytes = 0 + + const flush = async () => { + if (batch.length === 0) return + const out = await postChunk(url, token, batch, entityTypes, language) + if (out.length !== batch.length) { + throw new Error('PII mask-batch returned an unexpected result') + } + for (const item of out) masked.push(item) + batch = [] + batchBytes = 0 + } + + for (const text of texts) { + const bytes = Buffer.byteLength(text, 'utf8') + if ( + batch.length > 0 && + (batch.length >= REQUEST_MAX_COUNT || batchBytes + bytes > REQUEST_MAX_BYTES) + ) { + await flush() + } + batch.push(text) + batchBytes += bytes + } + await flush() + + return masked +} + +async function postChunk( + url: string, + token: string, + texts: string[], + entityTypes: string[], + language: string | undefined +): Promise { // boundary-raw-fetch: internal server-to-server call to the app container (internal JWT auth, configurable base URL) const response = await fetch(url, { method: 'POST', @@ -29,6 +81,7 @@ export async function maskPIIBatchViaHttp( authorization: `Bearer ${token}`, }, body: JSON.stringify({ texts, entityTypes, language }), + signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS), }) if (!response.ok) { @@ -37,7 +90,7 @@ export async function maskPIIBatchViaHttp( } const data = (await response.json()) as GuardrailsMaskBatchResult - if (!Array.isArray(data.masked) || data.masked.length !== texts.length) { + if (!Array.isArray(data.masked)) { throw new Error('PII mask-batch returned an unexpected result') } return data.masked From d0e573b1d394bce6edccd0cbb1227f3ff2381b3f Mon Sep 17 00:00:00 2001 From: Theodore Li Date: Fri, 19 Jun 2026 18:37:44 -0700 Subject: [PATCH 3/9] fix(guardrails): mint internal token per mask request A single token (5min TTL) could expire mid-batch when a large execution fans out into many sequential chunk requests; mint one per request instead. --- apps/sim/lib/guardrails/mask-client.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/apps/sim/lib/guardrails/mask-client.ts b/apps/sim/lib/guardrails/mask-client.ts index 4b24b497ab3..8e94495334b 100644 --- a/apps/sim/lib/guardrails/mask-client.ts +++ b/apps/sim/lib/guardrails/mask-client.ts @@ -32,7 +32,6 @@ export async function maskPIIBatchViaHttp( ): Promise { if (texts.length === 0) return [] - const token = await generateInternalToken() const url = `${getInternalApiBaseUrl()}/api/guardrails/mask-batch` const masked: string[] = [] @@ -41,7 +40,7 @@ export async function maskPIIBatchViaHttp( const flush = async () => { if (batch.length === 0) return - const out = await postChunk(url, token, batch, entityTypes, language) + const out = await postChunk(url, batch, entityTypes, language) if (out.length !== batch.length) { throw new Error('PII mask-batch returned an unexpected result') } @@ -68,11 +67,14 @@ export async function maskPIIBatchViaHttp( async function postChunk( url: string, - token: string, texts: string[], entityTypes: string[], language: string | undefined ): Promise { + // Mint per request: a single token (5min TTL) can expire mid-batch when a + // large execution fans out into many sequential chunk requests. + const token = await generateInternalToken() + // boundary-raw-fetch: internal server-to-server call to the app container (internal JWT auth, configurable base URL) const response = await fetch(url, { method: 'POST', From 2df826f22a00c9cbc945732e205a744717a53388 Mon Sep 17 00:00:00 2001 From: Theodore Li Date: Mon, 22 Jun 2026 17:16:21 -0700 Subject: [PATCH 4/9] feat(guardrails): run PII via Presidio sidecars + TS recognizer registry - replace the per-call python3 subprocess (cold spaCy load every call) with two long-lived Presidio sidecars (analyzer + anonymizer) reached over HTTP; the app image no longer carries Python/Presidio/venv - add PRESIDIO_ANALYZER_URL / PRESIDIO_ANONYMIZER_URL - move VIN out of Python into a TS recognizer (check-digit validated) behind a CUSTOM_RECOGNIZERS registry so new custom detectors are one entry; masking is handled uniformly by the anonymizer - drive the guardrails block's PII type picker from the shared pii-entities catalog (adds VIN, fixes drift) so block + Data Retention never diverge - delete validate_pii.py, requirements.txt, setup.sh and the Dockerfile venv step --- apps/sim/blocks/blocks/guardrails.ts | 69 +--- apps/sim/lib/core/config/env.ts | 2 + apps/sim/lib/guardrails/.gitignore | 13 - apps/sim/lib/guardrails/README.md | 33 +- apps/sim/lib/guardrails/recognizers.ts | 27 ++ apps/sim/lib/guardrails/requirements.txt | 4 - apps/sim/lib/guardrails/setup.sh | 42 -- apps/sim/lib/guardrails/validate_pii.py | 260 ------------ apps/sim/lib/guardrails/validate_pii.test.ts | 122 ++++++ apps/sim/lib/guardrails/validate_pii.ts | 402 +++++++------------ apps/sim/lib/guardrails/vin.test.ts | 52 +++ apps/sim/lib/guardrails/vin.ts | 88 ++++ docker/app.Dockerfile | 15 +- 13 files changed, 455 insertions(+), 674 deletions(-) delete mode 100644 apps/sim/lib/guardrails/.gitignore create mode 100644 apps/sim/lib/guardrails/recognizers.ts delete mode 100644 apps/sim/lib/guardrails/requirements.txt delete mode 100755 apps/sim/lib/guardrails/setup.sh delete mode 100644 apps/sim/lib/guardrails/validate_pii.py create mode 100644 apps/sim/lib/guardrails/validate_pii.test.ts create mode 100644 apps/sim/lib/guardrails/vin.test.ts create mode 100644 apps/sim/lib/guardrails/vin.ts diff --git a/apps/sim/blocks/blocks/guardrails.ts b/apps/sim/blocks/blocks/guardrails.ts index 42fefcda81e..dd35d39d5fc 100644 --- a/apps/sim/blocks/blocks/guardrails.ts +++ b/apps/sim/blocks/blocks/guardrails.ts @@ -1,4 +1,5 @@ import { ShieldCheckIcon } from '@/components/icons' +import { PII_ENTITY_GROUPS } from '@/lib/guardrails/pii-entities' import type { BlockConfig } from '@/blocks/types' import { getModelOptions, @@ -170,65 +171,15 @@ Return ONLY the regex pattern - no explanations, no quotes, no forward slashes, title: 'PII Types to Detect', type: 'grouped-checkbox-list', maxHeight: 400, - options: [ - // Common PII types - { label: 'Person name', id: 'PERSON', group: 'Common' }, - { label: 'Email address', id: 'EMAIL_ADDRESS', group: 'Common' }, - { label: 'Phone number', id: 'PHONE_NUMBER', group: 'Common' }, - { label: 'Location', id: 'LOCATION', group: 'Common' }, - { label: 'Date or time', id: 'DATE_TIME', group: 'Common' }, - { label: 'IP address', id: 'IP_ADDRESS', group: 'Common' }, - { label: 'URL', id: 'URL', group: 'Common' }, - { label: 'Credit card number', id: 'CREDIT_CARD', group: 'Common' }, - { label: 'International bank account number (IBAN)', id: 'IBAN_CODE', group: 'Common' }, - { label: 'Cryptocurrency wallet address', id: 'CRYPTO', group: 'Common' }, - { label: 'Medical license number', id: 'MEDICAL_LICENSE', group: 'Common' }, - { label: 'Nationality / religion / political group', id: 'NRP', group: 'Common' }, - - // USA - { label: 'US bank account number', id: 'US_BANK_NUMBER', group: 'USA' }, - { label: 'US driver license number', id: 'US_DRIVER_LICENSE', group: 'USA' }, - { - label: 'US individual taxpayer identification number (ITIN)', - id: 'US_ITIN', - group: 'USA', - }, - { label: 'US passport number', id: 'US_PASSPORT', group: 'USA' }, - { label: 'US Social Security number', id: 'US_SSN', group: 'USA' }, - - // UK - { label: 'UK National Insurance number', id: 'UK_NINO', group: 'UK' }, - { label: 'UK NHS number', id: 'UK_NHS', group: 'UK' }, - - // Spain - { label: 'Spanish NIF number', id: 'ES_NIF', group: 'Spain' }, - { label: 'Spanish NIE number', id: 'ES_NIE', group: 'Spain' }, - - // Italy - { label: 'Italian fiscal code', id: 'IT_FISCAL_CODE', group: 'Italy' }, - { label: 'Italian driver license', id: 'IT_DRIVER_LICENSE', group: 'Italy' }, - { label: 'Italian identity card', id: 'IT_IDENTITY_CARD', group: 'Italy' }, - { label: 'Italian passport', id: 'IT_PASSPORT', group: 'Italy' }, - - // Poland - { label: 'Polish PESEL', id: 'PL_PESEL', group: 'Poland' }, - - // Singapore - { label: 'Singapore NRIC/FIN', id: 'SG_NRIC_FIN', group: 'Singapore' }, - - // Australia - { label: 'Australian business number (ABN)', id: 'AU_ABN', group: 'Australia' }, - { label: 'Australian company number (ACN)', id: 'AU_ACN', group: 'Australia' }, - { label: 'Australian tax file number (TFN)', id: 'AU_TFN', group: 'Australia' }, - { label: 'Australian Medicare number', id: 'AU_MEDICARE', group: 'Australia' }, - - // India - { label: 'Indian Aadhaar', id: 'IN_AADHAAR', group: 'India' }, - { label: 'Indian PAN', id: 'IN_PAN', group: 'India' }, - { label: 'Indian vehicle registration', id: 'IN_VEHICLE_REGISTRATION', group: 'India' }, - { label: 'Indian voter number', id: 'IN_VOTER', group: 'India' }, - { label: 'Indian passport', id: 'IN_PASSPORT', group: 'India' }, - ], + // Driven by the shared catalog (includes VIN and custom recognizers) so the + // block and the Data Retention settings never drift. + options: PII_ENTITY_GROUPS.flatMap((group) => + group.entities.map((entity) => ({ + label: entity.label, + id: entity.value, + group: group.label, + })) + ), condition: { field: 'validationType', value: ['pii'], diff --git a/apps/sim/lib/core/config/env.ts b/apps/sim/lib/core/config/env.ts index 09c2e4fe51c..8e72f55cd08 100644 --- a/apps/sim/lib/core/config/env.ts +++ b/apps/sim/lib/core/config/env.ts @@ -311,6 +311,8 @@ export const env = createEnv({ PORT: z.number().optional(), // Main application port INTERNAL_API_BASE_URL: z.string().optional(), // Optional internal base URL for server-side self-calls; must include protocol if set (e.g., http://sim-app.namespace.svc.cluster.local:3000) ALLOWED_ORIGINS: z.string().optional(), // CORS allowed origins + PRESIDIO_ANALYZER_URL: z.string().optional(), // Presidio analyzer sidecar base URL for PII detection (default http://localhost:5002) + PRESIDIO_ANONYMIZER_URL: z.string().optional(), // Presidio anonymizer sidecar base URL for PII masking (default http://localhost:5001) // OAuth Integration Credentials - All optional, enables third-party integrations GOOGLE_CLIENT_ID: z.string().optional(), // Google OAuth client ID for Google services diff --git a/apps/sim/lib/guardrails/.gitignore b/apps/sim/lib/guardrails/.gitignore deleted file mode 100644 index 3485e9bdf6c..00000000000 --- a/apps/sim/lib/guardrails/.gitignore +++ /dev/null @@ -1,13 +0,0 @@ -# Python virtual environment -venv/ - -# Python cache -__pycache__/ -*.pyc -*.pyo -*.pyd -.Python - -# Presidio cache -.presidio/ - diff --git a/apps/sim/lib/guardrails/README.md b/apps/sim/lib/guardrails/README.md index 6ce7802d223..1fea46ae027 100644 --- a/apps/sim/lib/guardrails/README.md +++ b/apps/sim/lib/guardrails/README.md @@ -19,22 +19,26 @@ For **hallucination detection**, you'll need: - A knowledge base with documents - An LLM provider API key (or use hosted models) -### Python Validators (PII Detection) +### PII Detection (Presidio sidecars) -For **PII detection**, you need to set up a Python virtual environment and install Microsoft Presidio: +PII detection runs against two long-lived **Microsoft Presidio sidecar containers** reached over +HTTP — the analyzer (NLP detection) and the anonymizer (masking). In deployment they run alongside the +app container in the same ECS task; locally, run the official images: ```bash -cd apps/sim/lib/guardrails -./setup.sh +docker run -d -p 5002:3000 mcr.microsoft.com/presidio-analyzer:latest +docker run -d -p 5001:3000 mcr.microsoft.com/presidio-anonymizer:latest ``` -This will: -1. Create a Python virtual environment in `apps/sim/lib/guardrails/venv` -2. Install required dependencies: - - `presidio-analyzer` - PII detection engine - - `presidio-anonymizer` - PII masking/anonymization +Point the app at them (defaults shown): -The TypeScript wrapper will automatically use the virtual environment's Python interpreter. +```bash +PRESIDIO_ANALYZER_URL=http://localhost:5002 +PRESIDIO_ANONYMIZER_URL=http://localhost:5001 +``` + +VIN recognition (check-digit validated) is implemented in TypeScript (`vin.ts`) and never sent to the +sidecars. No Python or local venv is required. ## Usage @@ -93,10 +97,9 @@ See [Presidio documentation](https://microsoft.github.io/presidio/supported_enti - `validate_json.ts` - JSON validation (TypeScript) - `validate_regex.ts` - Regex validation (TypeScript) - `validate_hallucination.ts` - Hallucination detection with RAG + LLM scoring (TypeScript) -- `validate_pii.ts` - PII detection TypeScript wrapper (TypeScript) -- `validate_pii.py` - PII detection using Microsoft Presidio (Python) +- `validate_pii.ts` - PII detection client: calls the Presidio analyzer/anonymizer sidecars (TypeScript) +- `vin.ts` - Check-digit-validated VIN recognizer (TypeScript) +- `pii-entities.ts` - Client-safe PII entity catalog +- `mask-client.ts` - Internal HTTP client for batch PII masking from the log-redaction persist path - `validate.test.ts` - Test suite for JSON and regex validators -- `validate_hallucination.py` - Legacy Python hallucination detector (deprecated) -- `requirements.txt` - Python dependencies for PII detection (and legacy hallucination) -- `setup.sh` - Legacy installation script (deprecated) diff --git a/apps/sim/lib/guardrails/recognizers.ts b/apps/sim/lib/guardrails/recognizers.ts new file mode 100644 index 00000000000..0b644b89ce4 --- /dev/null +++ b/apps/sim/lib/guardrails/recognizers.ts @@ -0,0 +1,27 @@ +import { findVins } from '@/lib/guardrails/vin' + +/** + * A custom PII recognizer for entities Microsoft Presidio doesn't ship. + * + * A recognizer only does **detection** — it returns character spans. Masking is + * handled uniformly by the anonymizer sidecar, which replaces every span by its + * `entityType` (e.g. ``), so a recognizer never touches the sidecars or the + * masking path. + * + * To add one: + * 1. Implement a pure `detect(text)` (regex/checksum/etc., no I/O). + * 2. Register it in {@link CUSTOM_RECOGNIZERS}. + * 3. Add its entity to `pii-entities.ts` so it appears in the Data Retention UI. + */ +export interface CustomRecognizer { + /** Entity name; becomes the `` placeholder when masked. */ + entityType: string + /** Character spans of confirmed matches in `text`. Pure — no I/O. */ + detect(text: string): Array<{ start: number; end: number }> +} + +/** The registry of TS-side recognizers, applied on top of Presidio's built-ins. */ +export const CUSTOM_RECOGNIZERS: CustomRecognizer[] = [{ entityType: 'VIN', detect: findVins }] + +/** Entity names owned by a custom recognizer — never forwarded to the analyzer. */ +export const CUSTOM_ENTITY_TYPES = new Set(CUSTOM_RECOGNIZERS.map((r) => r.entityType)) diff --git a/apps/sim/lib/guardrails/requirements.txt b/apps/sim/lib/guardrails/requirements.txt deleted file mode 100644 index 135efae05b6..00000000000 --- a/apps/sim/lib/guardrails/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -# Microsoft Presidio for PII detection -presidio-analyzer>=2.2.0 -presidio-anonymizer>=2.2.0 - diff --git a/apps/sim/lib/guardrails/setup.sh b/apps/sim/lib/guardrails/setup.sh deleted file mode 100755 index 20eba4247ee..00000000000 --- a/apps/sim/lib/guardrails/setup.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# Setup script for guardrails validators -# This creates a virtual environment and installs Python dependencies - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -VENV_DIR="$SCRIPT_DIR/venv" - -echo "Setting up Python environment for guardrails..." - -# Check if Python 3 is available -if ! command -v python3 &> /dev/null; then - echo "Error: python3 is not installed. Please install Python 3 first." - exit 1 -fi - -# Create virtual environment if it doesn't exist -if [ ! -d "$VENV_DIR" ]; then - echo "Creating virtual environment..." - python3 -m venv "$VENV_DIR" -else - echo "Virtual environment already exists." -fi - -# Activate virtual environment and install dependencies -echo "Installing Python dependencies..." -source "$VENV_DIR/bin/activate" -pip install --upgrade pip -pip install -r "$SCRIPT_DIR/requirements.txt" - -# Presidio's default AnalyzerEngine loads the en_core_web_lg spaCy model; it is -# not a pip dependency, so download the version compatible with the installed spaCy. -echo "Downloading spaCy model (en_core_web_lg)..." -python -m spacy download en_core_web_lg - -echo "" -echo "✅ Setup complete! Guardrails validators are ready to use." -echo "" -echo "Virtual environment created at: $VENV_DIR" - diff --git a/apps/sim/lib/guardrails/validate_pii.py b/apps/sim/lib/guardrails/validate_pii.py deleted file mode 100644 index d475b96e233..00000000000 --- a/apps/sim/lib/guardrails/validate_pii.py +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env python3 -""" -PII Detection Validator using Microsoft Presidio - -Detects personally identifiable information (PII) in text and either: -- Blocks the request if PII is detected (block mode) -- Masks the PII and returns the masked text (mask mode) -""" - -import sys -import json -from typing import List, Dict, Any - -try: - from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer - from presidio_anonymizer import AnonymizerEngine - from presidio_anonymizer.entities import OperatorConfig -except ImportError: - print(json.dumps({ - "passed": False, - "error": "Presidio not installed. Run: pip install presidio-analyzer presidio-anonymizer", - "detectedEntities": [] - })) - sys.exit(0) - - -class VinRecognizer(PatternRecognizer): - """ - Recognizes Vehicle Identification Numbers (17 chars, A-Z/0-9 excluding - I/O/Q) and validates the ISO 3779 check digit (position 9). Validation makes - accidental matches on arbitrary 17-char codes (request ids, SKUs, tokens) - extremely unlikely. Note: some non-North-American VINs don't use the check - digit and will be skipped — an intentional bias toward precision. - """ - - _TRANSLIT = { - **{str(d): d for d in range(10)}, - "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, - "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9, - "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9, - } - _WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2] - - def validate_result(self, pattern_text: str): - vin = pattern_text.upper() - if len(vin) != 17: - return False - try: - total = sum(self._TRANSLIT[c] * w for c, w in zip(vin, self._WEIGHTS)) - except KeyError: - return False - check = total % 11 - expected = "X" if check == 10 else str(check) - return vin[8] == expected - - -def build_analyzer() -> "AnalyzerEngine": - """ - AnalyzerEngine with custom recognizers registered on top of the Presidio - defaults. Adds a check-digit-validated VIN recognizer. - """ - analyzer = AnalyzerEngine() - vin_pattern = Pattern(name="vin", regex=r"\b[A-HJ-NPR-Z0-9]{17}\b", score=0.7) - vin_recognizer = VinRecognizer( - supported_entity="VIN", - patterns=[vin_pattern], - context=["vin", "vehicle", "chassis"], - ) - analyzer.registry.add_recognizer(vin_recognizer) - return analyzer - - -def detect_pii( - text: str, - entity_types: List[str], - mode: str = "block", - language: str = "en" -) -> Dict[str, Any]: - """ - Detect PII in text using Presidio - - Args: - text: Input text to analyze - entity_types: List of PII entity types to detect (e.g., ["PERSON", "EMAIL_ADDRESS"]) - mode: "block" to fail validation if PII found, "mask" to return masked text - language: Language code (default: "en") - - Returns: - Dictionary with validation result - """ - try: - # Initialize Presidio engines - analyzer = build_analyzer() - - # Analyze text for PII - results = analyzer.analyze( - text=text, - entities=entity_types if entity_types else None, # None = detect all - language=language - ) - - # Extract detected entities - detected_entities = [] - for result in results: - detected_entities.append({ - "type": result.entity_type, - "start": result.start, - "end": result.end, - "score": result.score, - "text": text[result.start:result.end] - }) - - # If no PII detected, validation passes - if not results: - return { - "passed": True, - "detectedEntities": [], - "maskedText": None - } - - # Block mode: fail validation if PII detected - if mode == "block": - entity_summary = {} - for entity in detected_entities: - entity_type = entity["type"] - entity_summary[entity_type] = entity_summary.get(entity_type, 0) + 1 - - summary_str = ", ".join([f"{count} {etype}" for etype, count in entity_summary.items()]) - - return { - "passed": False, - "error": f"PII detected: {summary_str}", - "detectedEntities": detected_entities, - "maskedText": None - } - - # Mask mode: anonymize PII and return masked text - elif mode == "mask": - anonymizer = AnonymizerEngine() - - # Use as the replacement pattern - operators = {} - for entity_type in set([r.entity_type for r in results]): - operators[entity_type] = OperatorConfig("replace", {"new_value": f"<{entity_type}>"}) - - anonymized_result = anonymizer.anonymize( - text=text, - analyzer_results=results, - operators=operators - ) - - return { - "passed": True, - "detectedEntities": detected_entities, - "maskedText": anonymized_result.text - } - - else: - return { - "passed": False, - "error": f"Invalid mode: {mode}. Must be 'block' or 'mask'", - "detectedEntities": [] - } - - except Exception as e: - return { - "passed": False, - "error": f"PII detection failed: {str(e)}", - "detectedEntities": [] - } - - -def mask_batch( - texts: List[str], - entity_types: List[str], - language: str = "en" -) -> Dict[str, Any]: - """ - Mask PII across many strings in a single process, reusing one analyzer + - anonymizer instance (engine construction loads the spaCy model and is the - dominant cost). Returns masked text per input, in input order; strings with - no detected PII are returned unchanged so callers can substitute directly. - """ - analyzer = build_analyzer() - anonymizer = AnonymizerEngine() - entities = entity_types if entity_types else None - - results = [] - for text in texts: - if not text: - results.append({"maskedText": text}) - continue - analyzer_results = analyzer.analyze(text=text, entities=entities, language=language) - if not analyzer_results: - results.append({"maskedText": text}) - continue - operators = { - entity_type: OperatorConfig("replace", {"new_value": f"<{entity_type}>"}) - for entity_type in set([r.entity_type for r in analyzer_results]) - } - anonymized = anonymizer.anonymize( - text=text, - analyzer_results=analyzer_results, - operators=operators - ) - results.append({"maskedText": anonymized.text}) - - return {"passed": True, "results": results} - - -def main(): - """Main entry point for CLI usage""" - try: - # Read input from stdin - input_data = sys.stdin.read() - data = json.loads(input_data) - - entity_types = data.get("entityTypes", []) - language = data.get("language", "en") - - # Batch mask mode: an array of texts processed with one warm engine pair. - if "texts" in data: - texts = data.get("texts", []) - result = mask_batch(texts, entity_types, language) - print(f"__SIM_RESULT__={json.dumps(result)}") - return - - text = data.get("text", "") - mode = data.get("mode", "block") - - # Validate inputs - if not text: - result = { - "passed": False, - "error": "No text provided", - "detectedEntities": [] - } - else: - result = detect_pii(text, entity_types, mode, language) - - # Output result with marker for parsing - print(f"__SIM_RESULT__={json.dumps(result)}") - - except json.JSONDecodeError as e: - print(f"__SIM_RESULT__={json.dumps({ - 'passed': False, - 'error': f'Invalid JSON input: {str(e)}', - 'detectedEntities': [] - })}") - except Exception as e: - print(f"__SIM_RESULT__={json.dumps({ - 'passed': False, - 'error': f'Unexpected error: {str(e)}', - 'detectedEntities': [] - })}") - - -if __name__ == "__main__": - main() - diff --git a/apps/sim/lib/guardrails/validate_pii.test.ts b/apps/sim/lib/guardrails/validate_pii.test.ts new file mode 100644 index 00000000000..047b0a28832 --- /dev/null +++ b/apps/sim/lib/guardrails/validate_pii.test.ts @@ -0,0 +1,122 @@ +/** + * @vitest-environment node + */ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' +import { maskPIIBatch, validatePII } from '@/lib/guardrails/validate_pii' + +const VALID_VIN = '1HGCM82633A004352' + +interface Span { + entity_type: string + start: number + end: number + score: number +} + +/** Mimic the Presidio anonymizer's default `replace`: each span → ``. */ +function applyReplace(text: string, results: Span[]): string { + let out = text + for (const s of [...results].sort((a, b) => b.start - a.start)) { + out = `${out.slice(0, s.start)}<${s.entity_type}>${out.slice(s.end)}` + } + return out +} + +/** Analyzer mock: flags `a@b.com` as EMAIL_ADDRESS when that entity is in scope. */ +function emailSpans(text: string, entities: string[] | undefined): Span[] { + if (entities && !entities.includes('EMAIL_ADDRESS')) return [] + const idx = text.indexOf('a@b.com') + return idx === -1 ? [] : [{ entity_type: 'EMAIL_ADDRESS', start: idx, end: idx + 7, score: 0.9 }] +} + +describe('validate_pii (Presidio sidecars + TS VIN)', () => { + let analyzeBodies: Array<{ text: string; entities?: string[] }> + let fetchMock: ReturnType + + beforeEach(() => { + analyzeBodies = [] + fetchMock = vi.fn(async (url: string, init: { body: string }) => { + const body = JSON.parse(init.body) + if (url.includes('/analyze')) { + analyzeBodies.push({ text: body.text, entities: body.entities }) + return new Response(JSON.stringify(emailSpans(body.text, body.entities)), { status: 200 }) + } + // /anonymize + return new Response( + JSON.stringify({ text: applyReplace(body.text, body.analyzer_results) }), + { + status: 200, + } + ) + }) + vi.stubGlobal('fetch', fetchMock) + }) + + afterEach(() => vi.unstubAllGlobals()) + + describe('maskPIIBatch', () => { + it('masks both Presidio entities and TS-detected VINs, preserving order', async () => { + const out = await maskPIIBatch([`email a@b.com car ${VALID_VIN}`, 'nothing here'], []) + expect(out[0]).toBe('email car ') + expect(out[1]).toBe('nothing here') + }) + + it('strips VIN from the analyzer request (handled in TS)', async () => { + await maskPIIBatch([`vin ${VALID_VIN} mail a@b.com`], ['EMAIL_ADDRESS', 'VIN']) + expect(analyzeBodies[0].entities).toEqual(['EMAIL_ADDRESS']) + }) + + it('skips the analyzer entirely for a VIN-only request', async () => { + const out = await maskPIIBatch([`vin ${VALID_VIN}`], ['VIN']) + expect(out[0]).toBe('vin ') + expect(analyzeBodies).toHaveLength(0) + }) + + it('returns [] for empty input and leaves empty strings untouched', async () => { + expect(await maskPIIBatch([], [])).toEqual([]) + expect(await maskPIIBatch([''], [])).toEqual(['']) + }) + + it('throws on a sidecar failure so the caller can scrub', async () => { + fetchMock.mockResolvedValueOnce(new Response('boom', { status: 500 })) + await expect(maskPIIBatch(['email a@b.com'], [])).rejects.toThrow(/Presidio analyze failed/) + }) + }) + + describe('validatePII', () => { + it('block mode fails with a summary when PII is detected', async () => { + const res = await validatePII({ + text: `a@b.com and ${VALID_VIN}`, + entityTypes: [], + mode: 'block', + requestId: 'r1', + }) + expect(res.passed).toBe(false) + expect(res.error).toContain('EMAIL_ADDRESS') + expect(res.error).toContain('VIN') + expect(res.detectedEntities).toHaveLength(2) + }) + + it('mask mode returns masked text', async () => { + const res = await validatePII({ + text: `mail a@b.com vin ${VALID_VIN}`, + entityTypes: [], + mode: 'mask', + requestId: 'r2', + }) + expect(res.passed).toBe(true) + expect(res.maskedText).toBe('mail vin ') + }) + + it('passes clean text', async () => { + const res = await validatePII({ + text: 'nothing to see', + entityTypes: [], + mode: 'block', + requestId: 'r3', + }) + expect(res.passed).toBe(true) + expect(res.detectedEntities).toHaveLength(0) + }) + }) +}) diff --git a/apps/sim/lib/guardrails/validate_pii.ts b/apps/sim/lib/guardrails/validate_pii.ts index 3e1ec90edb3..1e4c1611730 100644 --- a/apps/sim/lib/guardrails/validate_pii.ts +++ b/apps/sim/lib/guardrails/validate_pii.ts @@ -1,44 +1,15 @@ -import { spawn } from 'child_process' -import fs from 'fs' -import path from 'path' import { createLogger } from '@sim/logger' +import { getErrorMessage } from '@sim/utils/errors' +import { env } from '@/lib/core/config/env' +import { CUSTOM_ENTITY_TYPES, CUSTOM_RECOGNIZERS } from '@/lib/guardrails/recognizers' const logger = createLogger('PIIValidator') -const DEFAULT_TIMEOUT = 30000 // 30 seconds -/** - * Max total bytes of text sent to a single Presidio subprocess. spaCy NER is the - * bottleneck, so large payloads are split into multiple short calls instead of - * one that risks the 30s timeout. - */ -const PII_CHUNK_MAX_BYTES = 256 * 1024 +/** Just above the analyzer's spaCy NER budget so a stuck sidecar aborts gracefully. */ +const REQUEST_TIMEOUT_MS = 45_000 -/** - * Resolve the guardrails Presidio interpreter + script path. - * - * `process.cwd()` is not stable across runtimes — the Next standalone container - * launches from the monorepo root while local dev and some paths run from - * `apps/sim` — so probe both layouts (mirrors the candidate-path resolution in - * `lib/execution/isolated-vm.ts`). Requires the bundled venv: throws if it is - * absent rather than silently falling back to the system `python3`, which has no - * Presidio and reports a misleading "not installed". - */ -function resolveGuardrailsPython(): { pythonCmd: string; scriptPath: string } { - const candidateDirs = [ - path.join(process.cwd(), 'apps', 'sim', 'lib', 'guardrails'), - path.join(process.cwd(), 'lib', 'guardrails'), - ] - for (const dir of candidateDirs) { - const venvPython = path.join(dir, 'venv', 'bin', 'python3') - if (fs.existsSync(venvPython)) { - return { pythonCmd: venvPython, scriptPath: path.join(dir, 'validate_pii.py') } - } - } - const probed = candidateDirs.map((d) => path.join(d, 'venv', 'bin', 'python3')).join(', ') - throw new Error( - `Guardrails Presidio venv not found (looked in ${probed}). Provision it with apps/sim/lib/guardrails/setup.sh locally, or verify the image build installs it.` - ) -} +const ANALYZER_URL = env.PRESIDIO_ANALYZER_URL || 'http://localhost:5002' +const ANONYMIZER_URL = env.PRESIDIO_ANONYMIZER_URL || 'http://localhost:5001' export interface PIIValidationInput { text: string @@ -63,12 +34,89 @@ export interface PIIValidationResult { maskedText?: string } +interface AnalyzerSpan { + entity_type: string + start: number + end: number + score: number +} + +/** + * Detect PII spans via the Presidio analyzer sidecar. Returns [] when the request + * targets only custom entities (nothing left for Presidio). Throws on transport/HTTP failure. + */ +async function analyze( + text: string, + entities: string[] | undefined, + language: string +): Promise { + // Custom-only request: the analyzer has nothing to do. + if (entities && entities.length === 0) return [] + + // boundary-raw-fetch: internal call to the Presidio analyzer sidecar over localhost + const response = await fetch(`${ANALYZER_URL}/analyze`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ text, language, ...(entities ? { entities } : {}) }), + signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS), + }) + if (!response.ok) { + const detail = await response.text().catch(() => '') + throw new Error(`Presidio analyze failed (${response.status}): ${detail.slice(0, 200)}`) + } + return (await response.json()) as AnalyzerSpan[] +} + /** - * Validate text for PII using Microsoft Presidio + * Mask spans via the Presidio anonymizer sidecar. Omitting `anonymizers` uses the + * default `replace` operator, which yields ``. Throws on failure. + */ +async function anonymize(text: string, spans: AnalyzerSpan[]): Promise { + if (spans.length === 0) return text + + // boundary-raw-fetch: internal call to the Presidio anonymizer sidecar over localhost + const response = await fetch(`${ANONYMIZER_URL}/anonymize`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ text, analyzer_results: spans }), + signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS), + }) + if (!response.ok) { + const detail = await response.text().catch(() => '') + throw new Error(`Presidio anonymize failed (${response.status}): ${detail.slice(0, 200)}`) + } + const data = (await response.json()) as { text: string } + return data.text +} + +/** + * All PII spans in `text`: spans from the custom TS recognizers plus the analyzer + * sidecar's spans, both on original-text offsets. Custom spans carry their own + * `entity_type`, which the anonymizer replaces with `` like any other. + * An empty `entityTypes` means "all"; otherwise each side gets only the entities it + * owns (custom names are never forwarded to the analyzer). + */ +async function collectSpans( + text: string, + entityTypes: string[], + language: string +): Promise { + const all = entityTypes.length === 0 + const customSpans: AnalyzerSpan[] = CUSTOM_RECOGNIZERS.filter( + (r) => all || entityTypes.includes(r.entityType) + ).flatMap((r) => + r.detect(text).map((s) => ({ entity_type: r.entityType, start: s.start, end: s.end, score: 1 })) + ) + const requestEntities = all ? undefined : entityTypes.filter((t) => !CUSTOM_ENTITY_TYPES.has(t)) + const presidioSpans = await analyze(text, requestEntities, language) + return [...customSpans, ...presidioSpans] +} + +/** + * Validate text for PII using Presidio sidecars (+ the TS VIN recognizer). * - * Supports two modes: - * - block: Fails validation if any PII is detected - * - mask: Passes validation and returns masked text with PII replaced + * - block: fails validation if any PII is detected + * - mask: passes and returns masked text with PII replaced by `` */ export async function validatePII(input: PIIValidationInput): Promise { const { text, entityTypes, mode, language = 'en', requestId } = input @@ -81,41 +129,57 @@ export async function validatePII(input: PIIValidationInput): Promise ({ + type: s.entity_type, + start: s.start, + end: s.end, + score: s.score, + text: text.slice(s.start, s.end), + })) + + if (spans.length === 0) { + logger.info(`[${requestId}] PII validation completed`, { passed: true, detectedCount: 0 }) + return { passed: true, detectedEntities: [], maskedText: mode === 'mask' ? text : undefined } + } - logger.info(`[${requestId}] PII validation completed`, { - passed: result.passed, - detectedCount: result.detectedEntities.length, - hasMaskedText: !!result.maskedText, - }) + if (mode === 'block') { + const counts = new Map() + for (const e of detectedEntities) counts.set(e.type, (counts.get(e.type) ?? 0) + 1) + const summary = Array.from(counts.entries()) + .map(([type, count]) => `${count} ${type}`) + .join(', ') + logger.info(`[${requestId}] PII validation completed`, { + passed: false, + detectedCount: detectedEntities.length, + }) + return { passed: false, error: `PII detected: ${summary}`, detectedEntities } + } - return result - } catch (error: any) { - logger.error(`[${requestId}] PII validation failed`, { - error: error.message, + // mask mode: the anonymizer replaces every span (incl. VIN) with ``. + const maskedText = await anonymize(text, spans) + logger.info(`[${requestId}] PII validation completed`, { + passed: true, + detectedCount: detectedEntities.length, + hasMaskedText: true, }) - + return { passed: true, detectedEntities, maskedText } + } catch (error) { + logger.error(`[${requestId}] PII validation failed`, { error: getErrorMessage(error) }) return { passed: false, - error: `PII validation failed: ${error.message}`, + error: `PII validation failed: ${getErrorMessage(error)}`, detectedEntities: [], } } } -interface PIIMaskBatchResult { - passed: boolean - error?: string - results?: { maskedText: string }[] -} - /** - * Mask PII across many strings, preserving input order. Strings are grouped into - * byte-budgeted chunks so no single subprocess exceeds {@link PII_CHUNK_MAX_BYTES} - * (keeping each call well under the 30s timeout). One Presidio engine pair is - * reused per subprocess invocation. Rejects on any subprocess failure so callers - * can apply their own fail-safe. + * Mask PII across many strings via the Presidio sidecars, preserving input order. + * Each string runs a TS VIN pre-pass, then analyze → anonymize. Strings with no + * detected PII are returned unchanged. Rejects on any sidecar failure so callers + * can apply their own fail-safe (scrub rather than leak). */ export async function maskPIIBatch( texts: string[], @@ -124,214 +188,16 @@ export async function maskPIIBatch( ): Promise { if (texts.length === 0) return [] - const chunks: string[][] = [] - let current: string[] = [] - let currentBytes = 0 - for (const text of texts) { - const bytes = Buffer.byteLength(text, 'utf8') - if (current.length > 0 && currentBytes + bytes > PII_CHUNK_MAX_BYTES) { - chunks.push(current) - current = [] - currentBytes = 0 - } - current.push(text) - currentBytes += bytes - } - if (current.length > 0) chunks.push(current) - const masked: string[] = [] - for (const chunk of chunks) { - const result = await runPythonScript({ - texts: chunk, - entityTypes, - mode: 'mask', - language, - }) - if (!result.passed || !result.results || result.results.length !== chunk.length) { - throw new Error(result.error || 'PII batch masking returned an unexpected result') + for (const text of texts) { + if (!text) { + masked.push(text) + continue } - for (const item of result.results) masked.push(item.maskedText) + const spans = await collectSpans(text, entityTypes, language) + masked.push(await anonymize(text, spans)) } - return masked } -/** - * Spawn the Presidio Python script, write the payload to stdin as JSON, and parse - * the `__SIM_RESULT__=` marker from stdout. Rejects on non-zero exit, timeout, - * spawn failure, or a missing/unparseable marker. - */ -function runPythonScript(payload: Record): Promise { - return new Promise((resolve, reject) => { - const { pythonCmd, scriptPath } = resolveGuardrailsPython() - - const python = spawn(pythonCmd, [scriptPath]) - let stdout = '' - let stderr = '' - - const timeout = setTimeout(() => { - python.kill() - reject(new Error('PII processing timeout')) - }, DEFAULT_TIMEOUT) - - // stdin errors (e.g. EPIPE when the child exits before draining the payload — - // chunks can exceed the OS pipe buffer) emit on stdin, not the process. Without - // a listener Node throws an unhandled 'error' and crashes; funnel it into the - // promise so the caller's fail-safe scrub path handles it. - python.stdin.on('error', (error: Error) => { - clearTimeout(timeout) - reject(new Error(`PII script stdin error: ${error.message}`)) - }) - python.stdin.write(JSON.stringify(payload)) - python.stdin.end() - python.stdout.on('data', (data) => { - stdout += data.toString() - }) - python.stderr.on('data', (data) => { - stderr += data.toString() - }) - - python.on('close', (code) => { - clearTimeout(timeout) - if (code !== 0) { - reject(new Error(stderr || `PII script exited with code ${code}`)) - return - } - const prefix = '__SIM_RESULT__=' - const marker = stdout.split('\n').find((l) => l.startsWith(prefix)) - if (!marker) { - reject(new Error(`No result marker in PII script output: ${stdout.substring(0, 200)}`)) - return - } - try { - resolve(JSON.parse(marker.slice(prefix.length)) as T) - } catch (error: any) { - reject(new Error(`Failed to parse PII script result: ${error.message}`)) - } - }) - - python.on('error', (error) => { - clearTimeout(timeout) - reject( - new Error( - `Failed to execute Python: ${error.message}. Make sure Python 3 and Presidio are installed.` - ) - ) - }) - }) -} - -/** - * Execute Python PII detection script - */ -async function executePythonPIIDetection( - text: string, - entityTypes: string[], - mode: string, - language: string, - requestId: string -): Promise { - return new Promise((resolve, reject) => { - const { pythonCmd, scriptPath } = resolveGuardrailsPython() - - const python = spawn(pythonCmd, [scriptPath]) - - let stdout = '' - let stderr = '' - - const timeout = setTimeout(() => { - python.kill() - reject(new Error('PII validation timeout')) - }, DEFAULT_TIMEOUT) - - // Write input to stdin as JSON - const inputData = JSON.stringify({ - text, - entityTypes, - mode, - language, - }) - // See runPythonScript: stdin errors (EPIPE on early child exit) must be - // caught here or Node throws an unhandled 'error' and crashes the process. - python.stdin.on('error', (error: Error) => { - clearTimeout(timeout) - reject(new Error(`Failed to write to Python: ${error.message}`)) - }) - python.stdin.write(inputData) - python.stdin.end() - - python.stdout.on('data', (data) => { - stdout += data.toString() - }) - - python.stderr.on('data', (data) => { - stderr += data.toString() - }) - - python.on('close', (code) => { - clearTimeout(timeout) - - if (code !== 0) { - logger.error(`[${requestId}] Python PII detection failed`, { - code, - stderr, - }) - resolve({ - passed: false, - error: stderr || 'PII detection failed', - detectedEntities: [], - }) - return - } - - // Parse result from stdout - try { - const prefix = '__SIM_RESULT__=' - const lines = stdout.split('\n') - const marker = lines.find((l) => l.startsWith(prefix)) - - if (marker) { - const jsonPart = marker.slice(prefix.length) - const result = JSON.parse(jsonPart) - resolve(result) - } else { - logger.error(`[${requestId}] No result marker found`, { - stdout, - stderr, - stdoutLines: lines, - }) - resolve({ - passed: false, - error: `No result marker found in output. stdout: ${stdout.substring(0, 200)}, stderr: ${stderr.substring(0, 200)}`, - detectedEntities: [], - }) - } - } catch (error: any) { - logger.error(`[${requestId}] Failed to parse Python result`, { - error: error.message, - stdout, - stderr, - }) - resolve({ - passed: false, - error: `Failed to parse result: ${error.message}. stdout: ${stdout.substring(0, 200)}`, - detectedEntities: [], - }) - } - }) - - python.on('error', (error) => { - clearTimeout(timeout) - logger.error(`[${requestId}] Failed to spawn Python process`, { - error: error.message, - }) - reject( - new Error( - `Failed to execute Python: ${error.message}. Make sure Python 3 and Presidio are installed.` - ) - ) - }) - }) -} - export { type PIIEntityType, SUPPORTED_PII_ENTITIES } from '@/lib/guardrails/pii-entities' diff --git a/apps/sim/lib/guardrails/vin.test.ts b/apps/sim/lib/guardrails/vin.test.ts new file mode 100644 index 00000000000..293ed4a0a97 --- /dev/null +++ b/apps/sim/lib/guardrails/vin.test.ts @@ -0,0 +1,52 @@ +/** + * @vitest-environment node + */ +import { describe, expect, it } from 'vitest' +import { findVins, isValidVin, maskVins } from '@/lib/guardrails/vin' + +const VALID = '1HGCM82633A004352' // check digit (position 9) = 3 +const INVALID_CHECK = '1HGCM82643A004352' // same shape, wrong check digit + +describe('isValidVin', () => { + it('accepts a VIN with a correct ISO 3779 check digit', () => { + expect(isValidVin(VALID)).toBe(true) + }) + + it('rejects a 17-char code whose check digit does not validate', () => { + expect(isValidVin(INVALID_CHECK)).toBe(false) + }) + + it('rejects wrong length', () => { + expect(isValidVin('1HGCM82633A00435')).toBe(false) + expect(isValidVin(`${VALID}9`)).toBe(false) + }) + + it('rejects disallowed letters I/O/Q', () => { + expect(isValidVin('1HGCM82633A0043I2'.slice(0, 17))).toBe(false) + }) +}) + +describe('findVins', () => { + it('returns spans only for valid VINs, in order', () => { + const text = `vin ${VALID} and bogus ${INVALID_CHECK} done` + const spans = findVins(text) + expect(spans).toHaveLength(1) + expect(text.slice(spans[0].start, spans[0].end)).toBe(VALID) + }) + + it('finds multiple valid VINs', () => { + const text = `${VALID} ${VALID}` + expect(findVins(text)).toHaveLength(2) + }) +}) + +describe('maskVins', () => { + it('replaces valid VINs with and leaves invalid candidates untouched', () => { + expect(maskVins(`car ${VALID}`)).toBe('car ') + expect(maskVins(`code ${INVALID_CHECK}`)).toBe(`code ${INVALID_CHECK}`) + }) + + it('returns text unchanged when there is no VIN', () => { + expect(maskVins('no vehicle here')).toBe('no vehicle here') + }) +}) diff --git a/apps/sim/lib/guardrails/vin.ts b/apps/sim/lib/guardrails/vin.ts new file mode 100644 index 00000000000..aaed7454717 --- /dev/null +++ b/apps/sim/lib/guardrails/vin.ts @@ -0,0 +1,88 @@ +/** + * Vehicle Identification Number (VIN) recognition. + * + * Presidio has no built-in VIN recognizer, and a VIN is pure pattern + arithmetic + * (no NLP), so it lives here in TS rather than in the Presidio sidecars. A VIN is + * 17 chars from A-Z/0-9 excluding I/O/Q; this validates the ISO 3779 check digit + * (position 9), which makes accidental matches on arbitrary 17-char codes (request + * ids, SKUs, tokens) extremely unlikely. Some non-North-American VINs omit the + * check digit and are skipped — an intentional bias toward precision. + */ + +const VIN_PATTERN = /\b[A-HJ-NPR-Z0-9]{17}\b/g + +/** Character → numeric value for the ISO 3779 weighted-sum check digit. */ +const TRANSLIT: Record = { + '0': 0, + '1': 1, + '2': 2, + '3': 3, + '4': 4, + '5': 5, + '6': 6, + '7': 7, + '8': 8, + '9': 9, + A: 1, + B: 2, + C: 3, + D: 4, + E: 5, + F: 6, + G: 7, + H: 8, + J: 1, + K: 2, + L: 3, + M: 4, + N: 5, + P: 7, + R: 9, + S: 2, + T: 3, + U: 4, + V: 5, + W: 6, + X: 7, + Y: 8, + Z: 9, +} + +/** Positional weights; index 8 (position 9) is the check digit itself (weight 0). */ +const WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2] + +const VIN_PLACEHOLDER = '' + +/** Whether a 17-char candidate satisfies the ISO 3779 check digit at position 9. */ +export function isValidVin(candidate: string): boolean { + const vin = candidate.toUpperCase() + if (vin.length !== 17) return false + let total = 0 + for (let i = 0; i < 17; i++) { + const value = TRANSLIT[vin[i]] + if (value === undefined) return false + total += value * WEIGHTS[i] + } + const check = total % 11 + const expected = check === 10 ? 'X' : String(check) + return vin[8] === expected +} + +/** Spans of every check-digit-valid VIN in `text`, in order of appearance. */ +export function findVins(text: string): Array<{ start: number; end: number }> { + const spans: Array<{ start: number; end: number }> = [] + for (const match of text.matchAll(VIN_PATTERN)) { + if (match.index === undefined) continue + if (isValidVin(match[0])) { + spans.push({ start: match.index, end: match.index + match[0].length }) + } + } + return spans +} + +/** Replace every check-digit-valid VIN in `text` with ``. */ +export function maskVins(text: string): string { + return text.replace(VIN_PATTERN, (candidate) => + isValidVin(candidate) ? VIN_PLACEHOLDER : candidate + ) +} diff --git a/docker/app.Dockerfile b/docker/app.Dockerfile index 2323ebb1df4..ff0ea1ccc28 100644 --- a/docker/app.Dockerfile +++ b/docker/app.Dockerfile @@ -114,19 +114,8 @@ COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/execution/isolated-v # apps/sim/lib/execution/sandbox/bundles/build.ts to regenerate. COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/execution/sandbox/bundles ./apps/sim/lib/execution/sandbox/bundles -# Guardrails setup with pip caching -COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/guardrails/requirements.txt ./apps/sim/lib/guardrails/requirements.txt -COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/guardrails/validate_pii.py ./apps/sim/lib/guardrails/validate_pii.py - -# Install Python dependencies with pip cache mount for faster rebuilds. -# Presidio's default AnalyzerEngine loads en_core_web_lg, which is not a pip -# dependency — download the spaCy model into the venv after installing Presidio. -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m venv ./apps/sim/lib/guardrails/venv && \ - ./apps/sim/lib/guardrails/venv/bin/pip install --upgrade pip && \ - ./apps/sim/lib/guardrails/venv/bin/pip install -r ./apps/sim/lib/guardrails/requirements.txt && \ - ./apps/sim/lib/guardrails/venv/bin/python -m spacy download en_core_web_lg && \ - chown -R nextjs:nodejs /app/apps/sim/lib/guardrails +# Guardrails PII runs in dedicated Presidio sidecar containers (analyzer + +# anonymizer), reached over localhost — no Python/Presidio in this image. # Create .next/cache directory with correct ownership RUN mkdir -p apps/sim/.next/cache && \ From 91ce2d1431d536d288bdeb13a249353348d5f615 Mon Sep 17 00:00:00 2001 From: Theodore Li Date: Mon, 22 Jun 2026 17:38:17 -0700 Subject: [PATCH 5/9] fix(guardrails): bound-parallelize mask batch; refresh stale comments - maskPIIBatch runs per-string sidecar calls with bounded concurrency (8) via mapWithConcurrency, so a chunk of many small leaves finishes within the 45s request timeout instead of aborting and scrubbing; order + fail-on-error kept - drop stale comments referencing the deleted Python venv / 30s subprocess timeout --- .../app/api/guardrails/mask-batch/route.ts | 10 +++---- apps/sim/lib/guardrails/mask-client.ts | 8 +++--- apps/sim/lib/guardrails/validate_pii.ts | 26 ++++++++++--------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/apps/sim/app/api/guardrails/mask-batch/route.ts b/apps/sim/app/api/guardrails/mask-batch/route.ts index 43979c611c9..696b69e749c 100644 --- a/apps/sim/app/api/guardrails/mask-batch/route.ts +++ b/apps/sim/app/api/guardrails/mask-batch/route.ts @@ -11,9 +11,9 @@ const logger = createLogger('GuardrailsMaskBatchAPI') /** * Internal batch PII masking. The log-redaction persist path runs in both the - * Next.js server and the trigger.dev runtime, but Presidio (Python venv) lives - * only in the app container — so redaction calls this endpoint server-to-server - * (internal JWT) to keep Presidio centralized here. + * Next.js server and the trigger.dev runtime, but the Presidio sidecars live only + * in the app task — so redaction calls this endpoint server-to-server (internal + * JWT) to keep Presidio centralized here. */ export const POST = withRouteHandler(async (request: NextRequest) => { const auth = await checkInternalAuth(request, { requireWorkflowId: false }) @@ -31,8 +31,8 @@ export const POST = withRouteHandler(async (request: NextRequest) => { logger.info('Masked PII batch', { count: texts.length }) return NextResponse.json({ masked }) } catch (error) { - // A broken/absent venv makes maskPIIBatch throw; fail loudly here (the - // caller scrubs to REDACTION_FAILED, so PII is never leaked). + // An unreachable/misconfigured Presidio sidecar makes maskPIIBatch throw; fail + // loudly here (the caller scrubs to REDACTION_FAILED, so PII is never leaked). logger.error('PII batch masking failed', { error: getErrorMessage(error), count: texts.length, diff --git a/apps/sim/lib/guardrails/mask-client.ts b/apps/sim/lib/guardrails/mask-client.ts index 8e94495334b..3fb818a3c72 100644 --- a/apps/sim/lib/guardrails/mask-client.ts +++ b/apps/sim/lib/guardrails/mask-client.ts @@ -10,15 +10,15 @@ import { getInternalApiBaseUrl } from '@/lib/core/utils/urls' */ const REQUEST_MAX_BYTES = 256 * 1024 const REQUEST_MAX_COUNT = 2_000 -/** Slightly above the 30s Python subprocess timeout so a hung app container aborts gracefully. */ +/** Bounds one mask-batch request; an unreachable/stuck Presidio sidecar aborts so the caller scrubs. */ const REQUEST_TIMEOUT_MS = 45_000 /** * Mask PII across many strings via the internal app-container endpoint. * - * Presidio (a Python venv) only exists in the app container, but the - * log-redaction persist path also runs inside the trigger.dev runtime — so - * redaction always routes through HTTP, the same way the guardrails tool does. + * The Presidio sidecars run only in the app task, but the log-redaction persist + * path also runs inside the trigger.dev runtime — so redaction always routes + * through HTTP, the same way the guardrails tool does. * Strings are grouped into byte/count-budgeted chunks; order is preserved, so * the returned array matches `texts` length. * diff --git a/apps/sim/lib/guardrails/validate_pii.ts b/apps/sim/lib/guardrails/validate_pii.ts index 1e4c1611730..d103c3fc44c 100644 --- a/apps/sim/lib/guardrails/validate_pii.ts +++ b/apps/sim/lib/guardrails/validate_pii.ts @@ -1,6 +1,7 @@ import { createLogger } from '@sim/logger' import { getErrorMessage } from '@sim/utils/errors' import { env } from '@/lib/core/config/env' +import { mapWithConcurrency } from '@/lib/core/utils/concurrency' import { CUSTOM_ENTITY_TYPES, CUSTOM_RECOGNIZERS } from '@/lib/guardrails/recognizers' const logger = createLogger('PIIValidator') @@ -8,6 +9,9 @@ const logger = createLogger('PIIValidator') /** Just above the analyzer's spaCy NER budget so a stuck sidecar aborts gracefully. */ const REQUEST_TIMEOUT_MS = 45_000 +/** Concurrent per-string sidecar calls within one batch; the warm model handles parallelism. */ +const MASK_CONCURRENCY = 8 + const ANALYZER_URL = env.PRESIDIO_ANALYZER_URL || 'http://localhost:5002' const ANONYMIZER_URL = env.PRESIDIO_ANONYMIZER_URL || 'http://localhost:5001' @@ -177,9 +181,12 @@ export async function validatePII(input: PIIValidationInput): Promise { if (texts.length === 0) return [] - const masked: string[] = [] - for (const text of texts) { - if (!text) { - masked.push(text) - continue - } + return mapWithConcurrency(texts, MASK_CONCURRENCY, async (text) => { + if (!text) return text const spans = await collectSpans(text, entityTypes, language) - masked.push(await anonymize(text, spans)) - } - return masked + return anonymize(text, spans) + }) } export { type PIIEntityType, SUPPORTED_PII_ENTITIES } from '@/lib/guardrails/pii-entities' From de32214934df8b1fc8ad769df191ac63bb42d69a Mon Sep 17 00:00:00 2001 From: Theodore Li Date: Tue, 23 Jun 2026 01:43:21 -0700 Subject: [PATCH 6/9] refactor(guardrails): single Presidio image, native VIN, per-rule redaction language MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - collapse the analyzer/anonymizer URLs into one PRESIDIO_URL (combined image serves /analyze + /anonymize) - remove the TS VIN recognizer (vin.ts, recognizers.ts) — VIN is now native + multi-language in the image; validate_pii is a thin analyze→anonymize client - trim KR_RRN/TH_TNIN from the catalog (no Korean/Thai model in the image) - add per-rule redaction language: PII_LANGUAGES catalog drives the contract enum, the Data Retention rule modal, and the guardrails block dropdown; resolver + logger thread it through to maskPIIBatch (default en), so non-English entity rules (e.g. ES_NIF) actually fire instead of silently no-op'ing under en --- .../[id]/data-retention/route.ts | 17 +++- apps/sim/blocks/blocks/guardrails.ts | 10 +-- .../components/data-retention-settings.tsx | 38 +++++++- apps/sim/lib/api/contracts/primitives.ts | 3 + apps/sim/lib/billing/retention.test.ts | 25 ++++-- apps/sim/lib/billing/retention.ts | 7 +- apps/sim/lib/core/config/env.ts | 3 +- apps/sim/lib/guardrails/pii-entities.ts | 27 +++++- apps/sim/lib/guardrails/recognizers.ts | 27 ------ apps/sim/lib/guardrails/validate_pii.test.ts | 38 ++++---- apps/sim/lib/guardrails/validate_pii.ts | 63 ++++--------- apps/sim/lib/guardrails/vin.test.ts | 52 ----------- apps/sim/lib/guardrails/vin.ts | 88 ------------------- apps/sim/lib/logs/execution/logger.ts | 5 +- packages/db/schema.ts | 2 + 15 files changed, 148 insertions(+), 257 deletions(-) delete mode 100644 apps/sim/lib/guardrails/recognizers.ts delete mode 100644 apps/sim/lib/guardrails/vin.test.ts delete mode 100644 apps/sim/lib/guardrails/vin.ts diff --git a/apps/sim/app/api/organizations/[id]/data-retention/route.ts b/apps/sim/app/api/organizations/[id]/data-retention/route.ts index 37fbbaabb94..f1fc1712f37 100644 --- a/apps/sim/app/api/organizations/[id]/data-retention/route.ts +++ b/apps/sim/app/api/organizations/[id]/data-retention/route.ts @@ -16,6 +16,14 @@ import { isOrganizationOnEnterprisePlan } from '@/lib/billing/core/subscription' import { isBillingEnabled } from '@/lib/core/config/env-flags' import { isFeatureEnabled } from '@/lib/core/config/feature-flags' import { withRouteHandler } from '@/lib/core/utils/with-route-handler' +import { PII_LANGUAGE_CODES, type PIILanguage } from '@/lib/guardrails/pii-entities' + +/** Narrow a stored (loosely-typed) language to the supported set; unknown ⇒ undefined (defaults to en). */ +function coercePiiLanguage(value: string | undefined): PIILanguage | undefined { + return value && (PII_LANGUAGE_CODES as readonly string[]).includes(value) + ? (value as PIILanguage) + : undefined +} const logger = createLogger('DataRetentionAPI') @@ -35,7 +43,14 @@ function normalizeConfigured( logRetentionHours: settings?.logRetentionHours ?? null, softDeleteRetentionHours: settings?.softDeleteRetentionHours ?? null, taskCleanupHours: settings?.taskCleanupHours ?? null, - piiRedaction: settings?.piiRedaction?.rules ? { rules: settings.piiRedaction.rules } : null, + piiRedaction: settings?.piiRedaction?.rules + ? { + rules: settings.piiRedaction.rules.map((rule) => ({ + ...rule, + language: coercePiiLanguage(rule.language), + })), + } + : null, } } diff --git a/apps/sim/blocks/blocks/guardrails.ts b/apps/sim/blocks/blocks/guardrails.ts index dd35d39d5fc..7acd5a89013 100644 --- a/apps/sim/blocks/blocks/guardrails.ts +++ b/apps/sim/blocks/blocks/guardrails.ts @@ -1,5 +1,5 @@ import { ShieldCheckIcon } from '@/components/icons' -import { PII_ENTITY_GROUPS } from '@/lib/guardrails/pii-entities' +import { PII_ENTITY_GROUPS, PII_LANGUAGES } from '@/lib/guardrails/pii-entities' import type { BlockConfig } from '@/blocks/types' import { getModelOptions, @@ -206,13 +206,7 @@ Return ONLY the regex pattern - no explanations, no quotes, no forward slashes, id: 'piiLanguage', title: 'Language', type: 'dropdown', - options: [ - { label: 'English', id: 'en' }, - { label: 'Spanish', id: 'es' }, - { label: 'Italian', id: 'it' }, - { label: 'Polish', id: 'pl' }, - { label: 'Finnish', id: 'fi' }, - ], + options: PII_LANGUAGES.map((language) => ({ label: language.label, id: language.value })), defaultValue: 'en', condition: { field: 'validationType', diff --git a/apps/sim/ee/data-retention/components/data-retention-settings.tsx b/apps/sim/ee/data-retention/components/data-retention-settings.tsx index bca54112d5e..1c39594d1e8 100644 --- a/apps/sim/ee/data-retention/components/data-retention-settings.tsx +++ b/apps/sim/ee/data-retention/components/data-retention-settings.tsx @@ -21,7 +21,13 @@ import { } from '@/components/emcn' import { useSession } from '@/lib/auth/auth-client' import { isBillingEnabled } from '@/lib/core/config/env-flags' -import { PII_ENTITY_GROUPS, SUPPORTED_PII_ENTITIES } from '@/lib/guardrails/pii-entities' +import { + DEFAULT_PII_LANGUAGE, + PII_ENTITY_GROUPS, + PII_LANGUAGES, + type PIILanguage, + SUPPORTED_PII_ENTITIES, +} from '@/lib/guardrails/pii-entities' import { getUserRole } from '@/lib/workspaces/organization/utils' import { SettingsSection } from '@/app/workspace/[workspaceId]/settings/components/settings-section/settings-section' import { InfoNote } from '@/ee/components/info-note' @@ -59,6 +65,7 @@ interface RuleDraft { id: string entityTypes: string[] workspaceId: string | null + language: PIILanguage } function hoursToDisplayDays(hours: number | null): string { @@ -75,6 +82,7 @@ function normalizeRule(rule: RuleDraft): string { return JSON.stringify({ entityTypes: [...rule.entityTypes].sort(), workspaceId: rule.workspaceId, + language: rule.language, }) } @@ -227,6 +235,18 @@ function RuleModal({ onChange={(entityTypes) => onChange({ ...draft, entityTypes })} /> + + onChange({ ...draft, language: language as PIILanguage })} + options={PII_LANGUAGES.map((l) => ({ value: l.value, label: l.label }))} + align='start' + /> + diff --git a/apps/sim/lib/billing/retention.test.ts b/apps/sim/lib/billing/retention.test.ts index 2852cb6a640..b2519838940 100644 --- a/apps/sim/lib/billing/retention.test.ts +++ b/apps/sim/lib/billing/retention.test.ts @@ -21,7 +21,11 @@ describe('resolveEffectivePiiRedaction', () => { orgSettings: settings([allRule]), workspaceId: 'ws-1', }) - expect(result).toEqual({ enabled: true, entityTypes: ['EMAIL_ADDRESS', 'PHONE_NUMBER'] }) + expect(result).toEqual({ + enabled: true, + entityTypes: ['EMAIL_ADDRESS', 'PHONE_NUMBER'], + language: 'en', + }) }) it('lets a workspace-specific rule override the all rule', () => { @@ -29,7 +33,17 @@ describe('resolveEffectivePiiRedaction', () => { orgSettings: settings([allRule, { id: 'r-1', entityTypes: ['US_SSN'], workspaceId: 'ws-1' }]), workspaceId: 'ws-1', }) - expect(result).toEqual({ enabled: true, entityTypes: ['US_SSN'] }) + expect(result).toEqual({ enabled: true, entityTypes: ['US_SSN'], language: 'en' }) + }) + + it('carries the rule language through (defaults to en)', () => { + const result = resolveEffectivePiiRedaction({ + orgSettings: settings([ + { id: 'r-es', entityTypes: ['ES_NIF'], workspaceId: 'ws-1', language: 'es' }, + ]), + workspaceId: 'ws-1', + }) + expect(result).toEqual({ enabled: true, entityTypes: ['ES_NIF'], language: 'es' }) }) it('exempts a workspace when its specific rule has no entity types', () => { @@ -37,7 +51,7 @@ describe('resolveEffectivePiiRedaction', () => { orgSettings: settings([allRule, { id: 'r-1', entityTypes: [], workspaceId: 'ws-1' }]), workspaceId: 'ws-1', }) - expect(result).toEqual({ enabled: false, entityTypes: [] }) + expect(result).toEqual({ enabled: false, entityTypes: [], language: 'en' }) }) it('is disabled when no rule matches and there is no all rule', () => { @@ -45,16 +59,17 @@ describe('resolveEffectivePiiRedaction', () => { orgSettings: settings([{ id: 'r-1', entityTypes: ['US_SSN'], workspaceId: 'ws-2' }]), workspaceId: 'ws-1', }) - expect(result).toEqual({ enabled: false, entityTypes: [] }) + expect(result).toEqual({ enabled: false, entityTypes: [], language: 'en' }) }) it('is disabled when there are no rules', () => { expect( resolveEffectivePiiRedaction({ orgSettings: settings([]), workspaceId: 'ws-1' }) - ).toEqual({ enabled: false, entityTypes: [] }) + ).toEqual({ enabled: false, entityTypes: [], language: 'en' }) expect(resolveEffectivePiiRedaction({ orgSettings: null, workspaceId: 'ws-1' })).toEqual({ enabled: false, entityTypes: [], + language: 'en', }) }) }) diff --git a/apps/sim/lib/billing/retention.ts b/apps/sim/lib/billing/retention.ts index 183dbb280e1..bdec594dddd 100644 --- a/apps/sim/lib/billing/retention.ts +++ b/apps/sim/lib/billing/retention.ts @@ -1,14 +1,18 @@ import type { DataRetentionSettings } from '@sim/db/schema' +import { DEFAULT_PII_LANGUAGE } from '@/lib/guardrails/pii-entities' export interface EffectivePiiRedaction { enabled: boolean /** Presidio entity types to mask. Empty = redact all detected PII. */ entityTypes: string[] + /** Language whose Presidio recognizers apply when masking. */ + language: string } export const DEFAULT_PII_REDACTION: EffectivePiiRedaction = { enabled: false, entityTypes: [], + language: DEFAULT_PII_LANGUAGE, } /** @@ -34,5 +38,6 @@ export function resolveEffectivePiiRedaction(params: { ? rule.entityTypes.filter((t): t is string => typeof t === 'string') : [] if (types.length === 0) return DEFAULT_PII_REDACTION - return { enabled: true, entityTypes: types } + const language = typeof rule?.language === 'string' ? rule.language : DEFAULT_PII_LANGUAGE + return { enabled: true, entityTypes: types, language } } diff --git a/apps/sim/lib/core/config/env.ts b/apps/sim/lib/core/config/env.ts index 8e72f55cd08..211c7664a65 100644 --- a/apps/sim/lib/core/config/env.ts +++ b/apps/sim/lib/core/config/env.ts @@ -311,8 +311,7 @@ export const env = createEnv({ PORT: z.number().optional(), // Main application port INTERNAL_API_BASE_URL: z.string().optional(), // Optional internal base URL for server-side self-calls; must include protocol if set (e.g., http://sim-app.namespace.svc.cluster.local:3000) ALLOWED_ORIGINS: z.string().optional(), // CORS allowed origins - PRESIDIO_ANALYZER_URL: z.string().optional(), // Presidio analyzer sidecar base URL for PII detection (default http://localhost:5002) - PRESIDIO_ANONYMIZER_URL: z.string().optional(), // Presidio anonymizer sidecar base URL for PII masking (default http://localhost:5001) + PRESIDIO_URL: z.string().optional(), // Presidio sidecar base URL serving /analyze + /anonymize (default http://localhost:5002) // OAuth Integration Credentials - All optional, enables third-party integrations GOOGLE_CLIENT_ID: z.string().optional(), // Google OAuth client ID for Google services diff --git a/apps/sim/lib/guardrails/pii-entities.ts b/apps/sim/lib/guardrails/pii-entities.ts index 0e67fe22ff7..af512e75f10 100644 --- a/apps/sim/lib/guardrails/pii-entities.ts +++ b/apps/sim/lib/guardrails/pii-entities.ts @@ -51,8 +51,6 @@ export const SUPPORTED_PII_ENTITIES = { IN_VOTER: 'Indian voter ID', IN_PASSPORT: 'Indian passport', FI_PERSONAL_IDENTITY_CODE: 'Finnish Personal Identity Code', - KR_RRN: 'Korean Resident Registration Number', - TH_TNIN: 'Thai National ID Number', } as const export type PIIEntityType = keyof typeof SUPPORTED_PII_ENTITIES @@ -115,8 +113,6 @@ export const PII_ENTITY_GROUPS: ReadonlyArray<{ 'IN_VOTER', 'IN_PASSPORT', 'FI_PERSONAL_IDENTITY_CODE', - 'KR_RRN', - 'TH_TNIN', ], }, ].map((group) => ({ @@ -126,3 +122,26 @@ export const PII_ENTITY_GROUPS: ReadonlyArray<{ label: SUPPORTED_PII_ENTITIES[value as PIIEntityType], })), })) + +/** + * Languages the Presidio image has NLP models for. The analyzer only recognizes a + * language's entities when its model is loaded, so this set must match the image. + */ +export const PII_LANGUAGES = [ + { value: 'en', label: 'English' }, + { value: 'es', label: 'Spanish' }, + { value: 'it', label: 'Italian' }, + { value: 'pl', label: 'Polish' }, + { value: 'fi', label: 'Finnish' }, +] as const + +export type PIILanguage = (typeof PII_LANGUAGES)[number]['value'] + +/** Non-empty tuple of language codes for schema/enum use. */ +export const PII_LANGUAGE_CODES = PII_LANGUAGES.map((l) => l.value) as [ + PIILanguage, + ...PIILanguage[], +] + +/** Default redaction language when a rule doesn't set one. */ +export const DEFAULT_PII_LANGUAGE: PIILanguage = 'en' diff --git a/apps/sim/lib/guardrails/recognizers.ts b/apps/sim/lib/guardrails/recognizers.ts deleted file mode 100644 index 0b644b89ce4..00000000000 --- a/apps/sim/lib/guardrails/recognizers.ts +++ /dev/null @@ -1,27 +0,0 @@ -import { findVins } from '@/lib/guardrails/vin' - -/** - * A custom PII recognizer for entities Microsoft Presidio doesn't ship. - * - * A recognizer only does **detection** — it returns character spans. Masking is - * handled uniformly by the anonymizer sidecar, which replaces every span by its - * `entityType` (e.g. ``), so a recognizer never touches the sidecars or the - * masking path. - * - * To add one: - * 1. Implement a pure `detect(text)` (regex/checksum/etc., no I/O). - * 2. Register it in {@link CUSTOM_RECOGNIZERS}. - * 3. Add its entity to `pii-entities.ts` so it appears in the Data Retention UI. - */ -export interface CustomRecognizer { - /** Entity name; becomes the `` placeholder when masked. */ - entityType: string - /** Character spans of confirmed matches in `text`. Pure — no I/O. */ - detect(text: string): Array<{ start: number; end: number }> -} - -/** The registry of TS-side recognizers, applied on top of Presidio's built-ins. */ -export const CUSTOM_RECOGNIZERS: CustomRecognizer[] = [{ entityType: 'VIN', detect: findVins }] - -/** Entity names owned by a custom recognizer — never forwarded to the analyzer. */ -export const CUSTOM_ENTITY_TYPES = new Set(CUSTOM_RECOGNIZERS.map((r) => r.entityType)) diff --git a/apps/sim/lib/guardrails/validate_pii.test.ts b/apps/sim/lib/guardrails/validate_pii.test.ts index 047b0a28832..0ba1c585bc0 100644 --- a/apps/sim/lib/guardrails/validate_pii.test.ts +++ b/apps/sim/lib/guardrails/validate_pii.test.ts @@ -4,8 +4,6 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' import { maskPIIBatch, validatePII } from '@/lib/guardrails/validate_pii' -const VALID_VIN = '1HGCM82633A004352' - interface Span { entity_type: string start: number @@ -29,8 +27,8 @@ function emailSpans(text: string, entities: string[] | undefined): Span[] { return idx === -1 ? [] : [{ entity_type: 'EMAIL_ADDRESS', start: idx, end: idx + 7, score: 0.9 }] } -describe('validate_pii (Presidio sidecars + TS VIN)', () => { - let analyzeBodies: Array<{ text: string; entities?: string[] }> +describe('validate_pii (Presidio sidecar)', () => { + let analyzeBodies: Array<{ text: string; language: string; entities?: string[] }> let fetchMock: ReturnType beforeEach(() => { @@ -38,7 +36,7 @@ describe('validate_pii (Presidio sidecars + TS VIN)', () => { fetchMock = vi.fn(async (url: string, init: { body: string }) => { const body = JSON.parse(init.body) if (url.includes('/analyze')) { - analyzeBodies.push({ text: body.text, entities: body.entities }) + analyzeBodies.push({ text: body.text, language: body.language, entities: body.entities }) return new Response(JSON.stringify(emailSpans(body.text, body.entities)), { status: 200 }) } // /anonymize @@ -55,21 +53,20 @@ describe('validate_pii (Presidio sidecars + TS VIN)', () => { afterEach(() => vi.unstubAllGlobals()) describe('maskPIIBatch', () => { - it('masks both Presidio entities and TS-detected VINs, preserving order', async () => { - const out = await maskPIIBatch([`email a@b.com car ${VALID_VIN}`, 'nothing here'], []) - expect(out[0]).toBe('email car ') + it('masks detected entities, preserving input order', async () => { + const out = await maskPIIBatch(['email a@b.com', 'nothing here'], []) + expect(out[0]).toBe('email ') expect(out[1]).toBe('nothing here') }) - it('strips VIN from the analyzer request (handled in TS)', async () => { - await maskPIIBatch([`vin ${VALID_VIN} mail a@b.com`], ['EMAIL_ADDRESS', 'VIN']) - expect(analyzeBodies[0].entities).toEqual(['EMAIL_ADDRESS']) - }) + it('forwards entityTypes (and language) to the analyzer; empty ⇒ omitted (all)', async () => { + await maskPIIBatch(['mail a@b.com'], ['EMAIL_ADDRESS', 'PERSON'], 'es') + expect(analyzeBodies[0].entities).toEqual(['EMAIL_ADDRESS', 'PERSON']) + expect(analyzeBodies[0].language).toBe('es') - it('skips the analyzer entirely for a VIN-only request', async () => { - const out = await maskPIIBatch([`vin ${VALID_VIN}`], ['VIN']) - expect(out[0]).toBe('vin ') - expect(analyzeBodies).toHaveLength(0) + analyzeBodies.length = 0 + await maskPIIBatch(['mail a@b.com'], []) + expect(analyzeBodies[0].entities).toBeUndefined() }) it('returns [] for empty input and leaves empty strings untouched', async () => { @@ -86,26 +83,25 @@ describe('validate_pii (Presidio sidecars + TS VIN)', () => { describe('validatePII', () => { it('block mode fails with a summary when PII is detected', async () => { const res = await validatePII({ - text: `a@b.com and ${VALID_VIN}`, + text: 'reach me at a@b.com', entityTypes: [], mode: 'block', requestId: 'r1', }) expect(res.passed).toBe(false) expect(res.error).toContain('EMAIL_ADDRESS') - expect(res.error).toContain('VIN') - expect(res.detectedEntities).toHaveLength(2) + expect(res.detectedEntities).toHaveLength(1) }) it('mask mode returns masked text', async () => { const res = await validatePII({ - text: `mail a@b.com vin ${VALID_VIN}`, + text: 'mail a@b.com', entityTypes: [], mode: 'mask', requestId: 'r2', }) expect(res.passed).toBe(true) - expect(res.maskedText).toBe('mail vin ') + expect(res.maskedText).toBe('mail ') }) it('passes clean text', async () => { diff --git a/apps/sim/lib/guardrails/validate_pii.ts b/apps/sim/lib/guardrails/validate_pii.ts index d103c3fc44c..5fa380b7ebf 100644 --- a/apps/sim/lib/guardrails/validate_pii.ts +++ b/apps/sim/lib/guardrails/validate_pii.ts @@ -2,7 +2,6 @@ import { createLogger } from '@sim/logger' import { getErrorMessage } from '@sim/utils/errors' import { env } from '@/lib/core/config/env' import { mapWithConcurrency } from '@/lib/core/utils/concurrency' -import { CUSTOM_ENTITY_TYPES, CUSTOM_RECOGNIZERS } from '@/lib/guardrails/recognizers' const logger = createLogger('PIIValidator') @@ -12,8 +11,8 @@ const REQUEST_TIMEOUT_MS = 45_000 /** Concurrent per-string sidecar calls within one batch; the warm model handles parallelism. */ const MASK_CONCURRENCY = 8 -const ANALYZER_URL = env.PRESIDIO_ANALYZER_URL || 'http://localhost:5002' -const ANONYMIZER_URL = env.PRESIDIO_ANONYMIZER_URL || 'http://localhost:5001' +/** Single Presidio sidecar serving both /analyze and /anonymize (VIN is native there). */ +const PRESIDIO_URL = env.PRESIDIO_URL || 'http://localhost:5002' export interface PIIValidationInput { text: string @@ -46,19 +45,18 @@ interface AnalyzerSpan { } /** - * Detect PII spans via the Presidio analyzer sidecar. Returns [] when the request - * targets only custom entities (nothing left for Presidio). Throws on transport/HTTP failure. + * Detect PII spans via the Presidio analyzer. An empty `entityTypes` ⇒ detect all. + * Throws on transport/HTTP failure so callers can apply their own fail-safe. */ async function analyze( text: string, - entities: string[] | undefined, + entityTypes: string[], language: string ): Promise { - // Custom-only request: the analyzer has nothing to do. - if (entities && entities.length === 0) return [] + const entities = entityTypes.length > 0 ? entityTypes : undefined // boundary-raw-fetch: internal call to the Presidio analyzer sidecar over localhost - const response = await fetch(`${ANALYZER_URL}/analyze`, { + const response = await fetch(`${PRESIDIO_URL}/analyze`, { method: 'POST', headers: { 'content-type': 'application/json' }, body: JSON.stringify({ text, language, ...(entities ? { entities } : {}) }), @@ -79,7 +77,7 @@ async function anonymize(text: string, spans: AnalyzerSpan[]): Promise { if (spans.length === 0) return text // boundary-raw-fetch: internal call to the Presidio anonymizer sidecar over localhost - const response = await fetch(`${ANONYMIZER_URL}/anonymize`, { + const response = await fetch(`${PRESIDIO_URL}/anonymize`, { method: 'POST', headers: { 'content-type': 'application/json' }, body: JSON.stringify({ text, analyzer_results: spans }), @@ -94,30 +92,7 @@ async function anonymize(text: string, spans: AnalyzerSpan[]): Promise { } /** - * All PII spans in `text`: spans from the custom TS recognizers plus the analyzer - * sidecar's spans, both on original-text offsets. Custom spans carry their own - * `entity_type`, which the anonymizer replaces with `` like any other. - * An empty `entityTypes` means "all"; otherwise each side gets only the entities it - * owns (custom names are never forwarded to the analyzer). - */ -async function collectSpans( - text: string, - entityTypes: string[], - language: string -): Promise { - const all = entityTypes.length === 0 - const customSpans: AnalyzerSpan[] = CUSTOM_RECOGNIZERS.filter( - (r) => all || entityTypes.includes(r.entityType) - ).flatMap((r) => - r.detect(text).map((s) => ({ entity_type: r.entityType, start: s.start, end: s.end, score: 1 })) - ) - const requestEntities = all ? undefined : entityTypes.filter((t) => !CUSTOM_ENTITY_TYPES.has(t)) - const presidioSpans = await analyze(text, requestEntities, language) - return [...customSpans, ...presidioSpans] -} - -/** - * Validate text for PII using Presidio sidecars (+ the TS VIN recognizer). + * Validate text for PII using the Presidio sidecar. * * - block: fails validation if any PII is detected * - mask: passes and returns masked text with PII replaced by `` @@ -133,7 +108,7 @@ export async function validatePII(input: PIIValidationInput): Promise ({ type: s.entity_type, @@ -161,7 +136,7 @@ export async function validatePII(input: PIIValidationInput): Promise`. + // mask mode: the anonymizer replaces every span with ``. const maskedText = await anonymize(text, spans) logger.info(`[${requestId}] PII validation completed`, { passed: true, @@ -180,13 +155,13 @@ export async function validatePII(input: PIIValidationInput): Promise { if (!text) return text - const spans = await collectSpans(text, entityTypes, language) + const spans = await analyze(text, entityTypes, language) return anonymize(text, spans) }) } diff --git a/apps/sim/lib/guardrails/vin.test.ts b/apps/sim/lib/guardrails/vin.test.ts deleted file mode 100644 index 293ed4a0a97..00000000000 --- a/apps/sim/lib/guardrails/vin.test.ts +++ /dev/null @@ -1,52 +0,0 @@ -/** - * @vitest-environment node - */ -import { describe, expect, it } from 'vitest' -import { findVins, isValidVin, maskVins } from '@/lib/guardrails/vin' - -const VALID = '1HGCM82633A004352' // check digit (position 9) = 3 -const INVALID_CHECK = '1HGCM82643A004352' // same shape, wrong check digit - -describe('isValidVin', () => { - it('accepts a VIN with a correct ISO 3779 check digit', () => { - expect(isValidVin(VALID)).toBe(true) - }) - - it('rejects a 17-char code whose check digit does not validate', () => { - expect(isValidVin(INVALID_CHECK)).toBe(false) - }) - - it('rejects wrong length', () => { - expect(isValidVin('1HGCM82633A00435')).toBe(false) - expect(isValidVin(`${VALID}9`)).toBe(false) - }) - - it('rejects disallowed letters I/O/Q', () => { - expect(isValidVin('1HGCM82633A0043I2'.slice(0, 17))).toBe(false) - }) -}) - -describe('findVins', () => { - it('returns spans only for valid VINs, in order', () => { - const text = `vin ${VALID} and bogus ${INVALID_CHECK} done` - const spans = findVins(text) - expect(spans).toHaveLength(1) - expect(text.slice(spans[0].start, spans[0].end)).toBe(VALID) - }) - - it('finds multiple valid VINs', () => { - const text = `${VALID} ${VALID}` - expect(findVins(text)).toHaveLength(2) - }) -}) - -describe('maskVins', () => { - it('replaces valid VINs with and leaves invalid candidates untouched', () => { - expect(maskVins(`car ${VALID}`)).toBe('car ') - expect(maskVins(`code ${INVALID_CHECK}`)).toBe(`code ${INVALID_CHECK}`) - }) - - it('returns text unchanged when there is no VIN', () => { - expect(maskVins('no vehicle here')).toBe('no vehicle here') - }) -}) diff --git a/apps/sim/lib/guardrails/vin.ts b/apps/sim/lib/guardrails/vin.ts deleted file mode 100644 index aaed7454717..00000000000 --- a/apps/sim/lib/guardrails/vin.ts +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Vehicle Identification Number (VIN) recognition. - * - * Presidio has no built-in VIN recognizer, and a VIN is pure pattern + arithmetic - * (no NLP), so it lives here in TS rather than in the Presidio sidecars. A VIN is - * 17 chars from A-Z/0-9 excluding I/O/Q; this validates the ISO 3779 check digit - * (position 9), which makes accidental matches on arbitrary 17-char codes (request - * ids, SKUs, tokens) extremely unlikely. Some non-North-American VINs omit the - * check digit and are skipped — an intentional bias toward precision. - */ - -const VIN_PATTERN = /\b[A-HJ-NPR-Z0-9]{17}\b/g - -/** Character → numeric value for the ISO 3779 weighted-sum check digit. */ -const TRANSLIT: Record = { - '0': 0, - '1': 1, - '2': 2, - '3': 3, - '4': 4, - '5': 5, - '6': 6, - '7': 7, - '8': 8, - '9': 9, - A: 1, - B: 2, - C: 3, - D: 4, - E: 5, - F: 6, - G: 7, - H: 8, - J: 1, - K: 2, - L: 3, - M: 4, - N: 5, - P: 7, - R: 9, - S: 2, - T: 3, - U: 4, - V: 5, - W: 6, - X: 7, - Y: 8, - Z: 9, -} - -/** Positional weights; index 8 (position 9) is the check digit itself (weight 0). */ -const WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2] - -const VIN_PLACEHOLDER = '' - -/** Whether a 17-char candidate satisfies the ISO 3779 check digit at position 9. */ -export function isValidVin(candidate: string): boolean { - const vin = candidate.toUpperCase() - if (vin.length !== 17) return false - let total = 0 - for (let i = 0; i < 17; i++) { - const value = TRANSLIT[vin[i]] - if (value === undefined) return false - total += value * WEIGHTS[i] - } - const check = total % 11 - const expected = check === 10 ? 'X' : String(check) - return vin[8] === expected -} - -/** Spans of every check-digit-valid VIN in `text`, in order of appearance. */ -export function findVins(text: string): Array<{ start: number; end: number }> { - const spans: Array<{ start: number; end: number }> = [] - for (const match of text.matchAll(VIN_PATTERN)) { - if (match.index === undefined) continue - if (isValidVin(match[0])) { - spans.push({ start: match.index, end: match.index + match[0].length }) - } - } - return spans -} - -/** Replace every check-digit-valid VIN in `text` with ``. */ -export function maskVins(text: string): string { - return text.replace(VIN_PATTERN, (candidate) => - isValidVin(candidate) ? VIN_PLACEHOLDER : candidate - ) -} diff --git a/apps/sim/lib/logs/execution/logger.ts b/apps/sim/lib/logs/execution/logger.ts index 9a531b72934..e68ad3100fa 100644 --- a/apps/sim/lib/logs/execution/logger.ts +++ b/apps/sim/lib/logs/execution/logger.ts @@ -620,7 +620,10 @@ export class ExecutionLogger implements IExecutionLoggerService { const config = resolveEffectivePiiRedaction({ orgSettings: row.orgSettings, workspaceId }) if (!config.enabled) return payload - return redactPIIFromExecution(payload, { entityTypes: config.entityTypes }) + return redactPIIFromExecution(payload, { + entityTypes: config.entityTypes, + language: config.language, + }) } async completeWorkflowExecution(params: { diff --git a/packages/db/schema.ts b/packages/db/schema.ts index 87188837516..f066c19ad4f 100644 --- a/packages/db/schema.ts +++ b/packages/db/schema.ts @@ -1077,6 +1077,8 @@ export interface PiiRedactionRule { entityTypes: string[] /** `null` = all workspaces; otherwise the single targeted workspace. */ workspaceId: string | null + /** Language whose Presidio recognizers apply (e.g. 'en', 'es'); defaults to English. */ + language?: string } /** From 67b303f2bf62eb9e68b0d92673598998f7b7424c Mon Sep 17 00:00:00 2001 From: Theodore Li Date: Tue, 23 Jun 2026 01:55:41 -0700 Subject: [PATCH 7/9] fix(guardrails): correct sidecar port (5001) + README for combined image The combined Presidio image (docker/pii.Dockerfile) serves /analyze + /anonymize on a single port 5001 with native VIN + multi-language recognizers. Fix the PRESIDIO_URL default (was 5002) and rewrite the README, which still described two stock containers and a TS VIN recognizer. --- apps/sim/lib/core/config/env.ts | 2 +- apps/sim/lib/guardrails/README.md | 30 +++++++++++++------------ apps/sim/lib/guardrails/validate_pii.ts | 2 +- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/apps/sim/lib/core/config/env.ts b/apps/sim/lib/core/config/env.ts index 211c7664a65..1a766c28b0c 100644 --- a/apps/sim/lib/core/config/env.ts +++ b/apps/sim/lib/core/config/env.ts @@ -311,7 +311,7 @@ export const env = createEnv({ PORT: z.number().optional(), // Main application port INTERNAL_API_BASE_URL: z.string().optional(), // Optional internal base URL for server-side self-calls; must include protocol if set (e.g., http://sim-app.namespace.svc.cluster.local:3000) ALLOWED_ORIGINS: z.string().optional(), // CORS allowed origins - PRESIDIO_URL: z.string().optional(), // Presidio sidecar base URL serving /analyze + /anonymize (default http://localhost:5002) + PRESIDIO_URL: z.string().optional(), // Presidio sidecar base URL serving /analyze + /anonymize (default http://localhost:5001) // OAuth Integration Credentials - All optional, enables third-party integrations GOOGLE_CLIENT_ID: z.string().optional(), // Google OAuth client ID for Google services diff --git a/apps/sim/lib/guardrails/README.md b/apps/sim/lib/guardrails/README.md index 1fea46ae027..293938fab2c 100644 --- a/apps/sim/lib/guardrails/README.md +++ b/apps/sim/lib/guardrails/README.md @@ -19,26 +19,29 @@ For **hallucination detection**, you'll need: - A knowledge base with documents - An LLM provider API key (or use hosted models) -### PII Detection (Presidio sidecars) +### PII Detection (Presidio sidecar) -PII detection runs against two long-lived **Microsoft Presidio sidecar containers** reached over -HTTP — the analyzer (NLP detection) and the anonymizer (masking). In deployment they run alongside the -app container in the same ECS task; locally, run the official images: +PII detection runs against **one** long-lived Presidio sidecar — a combined service (built from +`docker/pii.Dockerfile`, source in `apps/pii/server.py`) that constructs a warm `AnalyzerEngine` + +`AnonymizerEngine` once and exposes both `/analyze` and `/anonymize` (plus `/health`) on a single +port. In deployment it runs alongside the app container in the same ECS task; locally, build and run +it: ```bash -docker run -d -p 5002:3000 mcr.microsoft.com/presidio-analyzer:latest -docker run -d -p 5001:3000 mcr.microsoft.com/presidio-anonymizer:latest +docker build -f docker/pii.Dockerfile -t sim-pii . +docker run -d -p 5001:5001 sim-pii ``` -Point the app at them (defaults shown): +Point the app at it (default shown): ```bash -PRESIDIO_ANALYZER_URL=http://localhost:5002 -PRESIDIO_ANONYMIZER_URL=http://localhost:5001 +PRESIDIO_URL=http://localhost:5001 ``` -VIN recognition (check-digit validated) is implemented in TypeScript (`vin.ts`) and never sent to the -sidecars. No Python or local venv is required. +The image bakes in the recognizers itself — a check-digit-validated **VIN** recognizer and +multi-language NLP models (en/es/it/pl/fi) — so the app is a thin HTTP client (`validate_pii.ts`) with +no Python or local venv. The redaction language is configured per rule (Data Retention) and defaults +to English. ## Usage @@ -97,9 +100,8 @@ See [Presidio documentation](https://microsoft.github.io/presidio/supported_enti - `validate_json.ts` - JSON validation (TypeScript) - `validate_regex.ts` - Regex validation (TypeScript) - `validate_hallucination.ts` - Hallucination detection with RAG + LLM scoring (TypeScript) -- `validate_pii.ts` - PII detection client: calls the Presidio analyzer/anonymizer sidecars (TypeScript) -- `vin.ts` - Check-digit-validated VIN recognizer (TypeScript) -- `pii-entities.ts` - Client-safe PII entity catalog +- `validate_pii.ts` - PII detection client: calls the Presidio sidecar's /analyze + /anonymize (TypeScript) +- `pii-entities.ts` - Client-safe PII entity + language catalog (shared by the block and Data Retention) - `mask-client.ts` - Internal HTTP client for batch PII masking from the log-redaction persist path - `validate.test.ts` - Test suite for JSON and regex validators diff --git a/apps/sim/lib/guardrails/validate_pii.ts b/apps/sim/lib/guardrails/validate_pii.ts index 5fa380b7ebf..6ee24d14eff 100644 --- a/apps/sim/lib/guardrails/validate_pii.ts +++ b/apps/sim/lib/guardrails/validate_pii.ts @@ -12,7 +12,7 @@ const REQUEST_TIMEOUT_MS = 45_000 const MASK_CONCURRENCY = 8 /** Single Presidio sidecar serving both /analyze and /anonymize (VIN is native there). */ -const PRESIDIO_URL = env.PRESIDIO_URL || 'http://localhost:5002' +const PRESIDIO_URL = env.PRESIDIO_URL || 'http://localhost:5001' export interface PIIValidationInput { text: string From e4935bfcdb11a05cd772364415d7f70da0005561 Mon Sep 17 00:00:00 2001 From: Theodore Li Date: Tue, 23 Jun 2026 02:10:56 -0700 Subject: [PATCH 8/9] fix(guardrails): coerce stored redaction language in the resolver The persist-path resolver accepted any stored language string, so a stale/invalid code (e.g. a dropped locale) would reach Presidio and scrub the log even though the admin UI shows English. Coerce against the supported set via a shared coercePiiLanguage helper (now reused by the data-retention route too), falling back to en for unknown values. --- .../api/organizations/[id]/data-retention/route.ts | 9 +-------- apps/sim/lib/billing/retention.test.ts | 10 ++++++++++ apps/sim/lib/billing/retention.ts | 4 ++-- apps/sim/lib/guardrails/pii-entities.ts | 11 +++++++++++ 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/apps/sim/app/api/organizations/[id]/data-retention/route.ts b/apps/sim/app/api/organizations/[id]/data-retention/route.ts index f1fc1712f37..7d7052a3923 100644 --- a/apps/sim/app/api/organizations/[id]/data-retention/route.ts +++ b/apps/sim/app/api/organizations/[id]/data-retention/route.ts @@ -16,14 +16,7 @@ import { isOrganizationOnEnterprisePlan } from '@/lib/billing/core/subscription' import { isBillingEnabled } from '@/lib/core/config/env-flags' import { isFeatureEnabled } from '@/lib/core/config/feature-flags' import { withRouteHandler } from '@/lib/core/utils/with-route-handler' -import { PII_LANGUAGE_CODES, type PIILanguage } from '@/lib/guardrails/pii-entities' - -/** Narrow a stored (loosely-typed) language to the supported set; unknown ⇒ undefined (defaults to en). */ -function coercePiiLanguage(value: string | undefined): PIILanguage | undefined { - return value && (PII_LANGUAGE_CODES as readonly string[]).includes(value) - ? (value as PIILanguage) - : undefined -} +import { coercePiiLanguage } from '@/lib/guardrails/pii-entities' const logger = createLogger('DataRetentionAPI') diff --git a/apps/sim/lib/billing/retention.test.ts b/apps/sim/lib/billing/retention.test.ts index b2519838940..15714bc0465 100644 --- a/apps/sim/lib/billing/retention.test.ts +++ b/apps/sim/lib/billing/retention.test.ts @@ -46,6 +46,16 @@ describe('resolveEffectivePiiRedaction', () => { expect(result).toEqual({ enabled: true, entityTypes: ['ES_NIF'], language: 'es' }) }) + it('falls back to en when a stored language is unsupported/stale', () => { + const result = resolveEffectivePiiRedaction({ + orgSettings: settings([ + { id: 'r-de', entityTypes: ['EMAIL_ADDRESS'], workspaceId: 'ws-1', language: 'de' }, + ]), + workspaceId: 'ws-1', + }) + expect(result).toEqual({ enabled: true, entityTypes: ['EMAIL_ADDRESS'], language: 'en' }) + }) + it('exempts a workspace when its specific rule has no entity types', () => { const result = resolveEffectivePiiRedaction({ orgSettings: settings([allRule, { id: 'r-1', entityTypes: [], workspaceId: 'ws-1' }]), diff --git a/apps/sim/lib/billing/retention.ts b/apps/sim/lib/billing/retention.ts index bdec594dddd..dafb9e3a78b 100644 --- a/apps/sim/lib/billing/retention.ts +++ b/apps/sim/lib/billing/retention.ts @@ -1,5 +1,5 @@ import type { DataRetentionSettings } from '@sim/db/schema' -import { DEFAULT_PII_LANGUAGE } from '@/lib/guardrails/pii-entities' +import { coercePiiLanguage, DEFAULT_PII_LANGUAGE } from '@/lib/guardrails/pii-entities' export interface EffectivePiiRedaction { enabled: boolean @@ -38,6 +38,6 @@ export function resolveEffectivePiiRedaction(params: { ? rule.entityTypes.filter((t): t is string => typeof t === 'string') : [] if (types.length === 0) return DEFAULT_PII_REDACTION - const language = typeof rule?.language === 'string' ? rule.language : DEFAULT_PII_LANGUAGE + const language = coercePiiLanguage(rule?.language) ?? DEFAULT_PII_LANGUAGE return { enabled: true, entityTypes: types, language } } diff --git a/apps/sim/lib/guardrails/pii-entities.ts b/apps/sim/lib/guardrails/pii-entities.ts index af512e75f10..c26e7dc0b91 100644 --- a/apps/sim/lib/guardrails/pii-entities.ts +++ b/apps/sim/lib/guardrails/pii-entities.ts @@ -145,3 +145,14 @@ export const PII_LANGUAGE_CODES = PII_LANGUAGES.map((l) => l.value) as [ /** Default redaction language when a rule doesn't set one. */ export const DEFAULT_PII_LANGUAGE: PIILanguage = 'en' + +/** + * Narrow a loosely-typed (stored/legacy) language to a supported code. Unknown or + * stale values (e.g. a dropped locale) return `undefined` so callers fall back to + * the default rather than forwarding an unsupported language to Presidio. + */ +export function coercePiiLanguage(value: string | undefined): PIILanguage | undefined { + return value && (PII_LANGUAGE_CODES as readonly string[]).includes(value) + ? (value as PIILanguage) + : undefined +} From 438d6b0f91a1b4bc55263a805f78f5176e2f84ab Mon Sep 17 00:00:00 2001 From: Theodore Li Date: Tue, 23 Jun 2026 02:19:30 -0700 Subject: [PATCH 9/9] fix(guardrails): rename PRESIDIO_URL env var to PII_URL Match the infra taskdef, which sets PII_URL on the app container for the combined Presidio sidecar. --- apps/sim/lib/core/config/env.ts | 2 +- apps/sim/lib/guardrails/README.md | 2 +- apps/sim/lib/guardrails/validate_pii.ts | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/sim/lib/core/config/env.ts b/apps/sim/lib/core/config/env.ts index 1a766c28b0c..302d3a93fd2 100644 --- a/apps/sim/lib/core/config/env.ts +++ b/apps/sim/lib/core/config/env.ts @@ -311,7 +311,7 @@ export const env = createEnv({ PORT: z.number().optional(), // Main application port INTERNAL_API_BASE_URL: z.string().optional(), // Optional internal base URL for server-side self-calls; must include protocol if set (e.g., http://sim-app.namespace.svc.cluster.local:3000) ALLOWED_ORIGINS: z.string().optional(), // CORS allowed origins - PRESIDIO_URL: z.string().optional(), // Presidio sidecar base URL serving /analyze + /anonymize (default http://localhost:5001) + PII_URL: z.string().optional(), // Presidio PII sidecar base URL serving /analyze + /anonymize (default http://localhost:5001) // OAuth Integration Credentials - All optional, enables third-party integrations GOOGLE_CLIENT_ID: z.string().optional(), // Google OAuth client ID for Google services diff --git a/apps/sim/lib/guardrails/README.md b/apps/sim/lib/guardrails/README.md index 293938fab2c..6c0a5df9709 100644 --- a/apps/sim/lib/guardrails/README.md +++ b/apps/sim/lib/guardrails/README.md @@ -35,7 +35,7 @@ docker run -d -p 5001:5001 sim-pii Point the app at it (default shown): ```bash -PRESIDIO_URL=http://localhost:5001 +PII_URL=http://localhost:5001 ``` The image bakes in the recognizers itself — a check-digit-validated **VIN** recognizer and diff --git a/apps/sim/lib/guardrails/validate_pii.ts b/apps/sim/lib/guardrails/validate_pii.ts index 6ee24d14eff..a24c8f880e1 100644 --- a/apps/sim/lib/guardrails/validate_pii.ts +++ b/apps/sim/lib/guardrails/validate_pii.ts @@ -12,7 +12,7 @@ const REQUEST_TIMEOUT_MS = 45_000 const MASK_CONCURRENCY = 8 /** Single Presidio sidecar serving both /analyze and /anonymize (VIN is native there). */ -const PRESIDIO_URL = env.PRESIDIO_URL || 'http://localhost:5001' +const PII_URL = env.PII_URL || 'http://localhost:5001' export interface PIIValidationInput { text: string @@ -56,7 +56,7 @@ async function analyze( const entities = entityTypes.length > 0 ? entityTypes : undefined // boundary-raw-fetch: internal call to the Presidio analyzer sidecar over localhost - const response = await fetch(`${PRESIDIO_URL}/analyze`, { + const response = await fetch(`${PII_URL}/analyze`, { method: 'POST', headers: { 'content-type': 'application/json' }, body: JSON.stringify({ text, language, ...(entities ? { entities } : {}) }), @@ -77,7 +77,7 @@ async function anonymize(text: string, spans: AnalyzerSpan[]): Promise { if (spans.length === 0) return text // boundary-raw-fetch: internal call to the Presidio anonymizer sidecar over localhost - const response = await fetch(`${PRESIDIO_URL}/anonymize`, { + const response = await fetch(`${PII_URL}/anonymize`, { method: 'POST', headers: { 'content-type': 'application/json' }, body: JSON.stringify({ text, analyzer_results: spans }),