Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions apps/sim/app/api/guardrails/mask-batch/route.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/**
* @vitest-environment node
*/
import { createMockRequest } from '@sim/testing'
import { beforeEach, describe, expect, it, vi } from 'vitest'

// Mock functions are created inside vi.hoisted so the vi.mock factories below can reference them.
const { mockCheckInternalAuth, mockMaskPIIBatch } = vi.hoisted(() => ({
mockCheckInternalAuth: vi.fn(),
mockMaskPIIBatch: vi.fn(),
}))

// Replace the internal-auth check with a mock the tests can control.
vi.mock('@/lib/auth/hybrid', () => ({
checkInternalAuth: mockCheckInternalAuth,
}))

// Replace the batch PII masking function with a mock the tests can control.
vi.mock('@/lib/guardrails/validate_pii', () => ({
maskPIIBatch: mockMaskPIIBatch,
}))

import { POST } from '@/app/api/guardrails/mask-batch/route'

/**
 * Route-level tests for the batch PII masking endpoint. The internal-auth check
 * and the masking function are mocked, so only the handler's own branching
 * (auth failure, success, invalid body) is exercised.
 */
describe('POST /api/guardrails/mask-batch', () => {
  beforeEach(() => {
    vi.clearAllMocks()
    // Defaults for every test: auth passes and each text is wrapped as `M(<text>)`.
    mockCheckInternalAuth.mockResolvedValue({ success: true })
    mockMaskPIIBatch.mockImplementation(async (texts: string[]) =>
      texts.map((text) => `M(${text})`)
    )
  })

  it('returns 401 without internal auth', async () => {
    // Arrange: make the auth check fail for this test only.
    mockCheckInternalAuth.mockResolvedValue({
      success: false,
      error: 'Internal authentication required',
    })
    const request = createMockRequest('POST', {
      texts: ['a@b.com'],
      entityTypes: ['EMAIL_ADDRESS'],
    })

    // Act
    const response = await POST(request)

    // Assert: rejected before any masking work happens.
    expect(response.status).toBe(401)
    expect(mockMaskPIIBatch).not.toHaveBeenCalled()
  })

  it('masks the batch in-process and preserves order', async () => {
    const request = createMockRequest('POST', {
      texts: ['a@b.com', 'hello'],
      entityTypes: ['EMAIL_ADDRESS'],
      language: 'en',
    })

    const response = await POST(request)

    expect(response.status).toBe(200)
    const { masked } = await response.json()
    expect(masked).toEqual(['M(a@b.com)', 'M(hello)'])
    // The handler must forward the parsed body fields positionally.
    expect(mockMaskPIIBatch).toHaveBeenCalledWith(['a@b.com', 'hello'], ['EMAIL_ADDRESS'], 'en')
  })

  it('rejects an invalid body with 400', async () => {
    // `texts` must be an array; a plain string should fail request parsing.
    const request = createMockRequest('POST', { texts: 'not-an-array', entityTypes: [] })

    const response = await POST(request)

    expect(response.status).toBe(400)
    expect(mockMaskPIIBatch).not.toHaveBeenCalled()
  })
})
45 changes: 45 additions & 0 deletions apps/sim/app/api/guardrails/mask-batch/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import { createLogger } from '@sim/logger'
import { getErrorMessage } from '@sim/utils/errors'
import { type NextRequest, NextResponse } from 'next/server'
import { guardrailsMaskBatchContract } from '@/lib/api/contracts'
import { parseRequest } from '@/lib/api/server'
import { checkInternalAuth } from '@/lib/auth/hybrid'
import { withRouteHandler } from '@/lib/core/utils/with-route-handler'
import { maskPIIBatch } from '@/lib/guardrails/validate_pii'

const logger = createLogger('GuardrailsMaskBatchAPI')

/**
 * Internal endpoint that masks PII in a batch of texts.
 *
 * The log-redaction persist path runs in both the Next.js server and the
 * trigger.dev runtime, while the Presidio sidecars live only in the app task.
 * Redaction therefore calls this endpoint server-to-server (internal JWT) so
 * that Presidio stays centralized here.
 *
 * Responses:
 * - 401 `{ error: 'Unauthorized' }` when the internal auth check fails.
 * - Whatever response `parseRequest` produces when the body does not parse.
 * - 200 `{ masked }` with the result of `maskPIIBatch` on success.
 * - 500 `{ error }` when `maskPIIBatch` throws.
 */
export const POST = withRouteHandler(async (request: NextRequest) => {
  const authResult = await checkInternalAuth(request, { requireWorkflowId: false })
  if (!authResult.success) {
    return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
  }

  const parseResult = await parseRequest(guardrailsMaskBatchContract, request, {})
  if (!parseResult.success) {
    return parseResult.response
  }

  const { body } = parseResult.data

  try {
    const masked = await maskPIIBatch(body.texts, body.entityTypes, body.language)
    logger.info('Masked PII batch', { count: body.texts.length })
    return NextResponse.json({ masked })
  } catch (error) {
    // maskPIIBatch throws when the Presidio sidecar is unreachable or misconfigured.
    // Surface that as a hard failure: the caller scrubs to REDACTION_FAILED, so
    // PII is never leaked.
    logger.error('PII batch masking failed', {
      error: getErrorMessage(error),
      count: body.texts.length,
    })
    const failureBody = { error: getErrorMessage(error, 'PII masking failed') }
    return NextResponse.json(failureBody, { status: 500 })
  }
})
69 changes: 10 additions & 59 deletions apps/sim/blocks/blocks/guardrails.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { ShieldCheckIcon } from '@/components/icons'
import { PII_ENTITY_GROUPS } from '@/lib/guardrails/pii-entities'
import type { BlockConfig } from '@/blocks/types'
import {
getModelOptions,
Expand Down Expand Up @@ -170,65 +171,15 @@ Return ONLY the regex pattern - no explanations, no quotes, no forward slashes,
title: 'PII Types to Detect',
type: 'grouped-checkbox-list',
maxHeight: 400,
options: [
// Common PII types
{ label: 'Person name', id: 'PERSON', group: 'Common' },
{ label: 'Email address', id: 'EMAIL_ADDRESS', group: 'Common' },
{ label: 'Phone number', id: 'PHONE_NUMBER', group: 'Common' },
{ label: 'Location', id: 'LOCATION', group: 'Common' },
{ label: 'Date or time', id: 'DATE_TIME', group: 'Common' },
{ label: 'IP address', id: 'IP_ADDRESS', group: 'Common' },
{ label: 'URL', id: 'URL', group: 'Common' },
{ label: 'Credit card number', id: 'CREDIT_CARD', group: 'Common' },
{ label: 'International bank account number (IBAN)', id: 'IBAN_CODE', group: 'Common' },
{ label: 'Cryptocurrency wallet address', id: 'CRYPTO', group: 'Common' },
{ label: 'Medical license number', id: 'MEDICAL_LICENSE', group: 'Common' },
{ label: 'Nationality / religion / political group', id: 'NRP', group: 'Common' },

// USA
{ label: 'US bank account number', id: 'US_BANK_NUMBER', group: 'USA' },
{ label: 'US driver license number', id: 'US_DRIVER_LICENSE', group: 'USA' },
{
label: 'US individual taxpayer identification number (ITIN)',
id: 'US_ITIN',
group: 'USA',
},
{ label: 'US passport number', id: 'US_PASSPORT', group: 'USA' },
{ label: 'US Social Security number', id: 'US_SSN', group: 'USA' },

// UK
{ label: 'UK National Insurance number', id: 'UK_NINO', group: 'UK' },
{ label: 'UK NHS number', id: 'UK_NHS', group: 'UK' },

// Spain
{ label: 'Spanish NIF number', id: 'ES_NIF', group: 'Spain' },
{ label: 'Spanish NIE number', id: 'ES_NIE', group: 'Spain' },

// Italy
{ label: 'Italian fiscal code', id: 'IT_FISCAL_CODE', group: 'Italy' },
{ label: 'Italian driver license', id: 'IT_DRIVER_LICENSE', group: 'Italy' },
{ label: 'Italian identity card', id: 'IT_IDENTITY_CARD', group: 'Italy' },
{ label: 'Italian passport', id: 'IT_PASSPORT', group: 'Italy' },

// Poland
{ label: 'Polish PESEL', id: 'PL_PESEL', group: 'Poland' },

// Singapore
{ label: 'Singapore NRIC/FIN', id: 'SG_NRIC_FIN', group: 'Singapore' },

// Australia
{ label: 'Australian business number (ABN)', id: 'AU_ABN', group: 'Australia' },
{ label: 'Australian company number (ACN)', id: 'AU_ACN', group: 'Australia' },
{ label: 'Australian tax file number (TFN)', id: 'AU_TFN', group: 'Australia' },
{ label: 'Australian Medicare number', id: 'AU_MEDICARE', group: 'Australia' },

// India
{ label: 'Indian Aadhaar', id: 'IN_AADHAAR', group: 'India' },
{ label: 'Indian PAN', id: 'IN_PAN', group: 'India' },
{ label: 'Indian vehicle registration', id: 'IN_VEHICLE_REGISTRATION', group: 'India' },
{ label: 'Indian voter number', id: 'IN_VOTER', group: 'India' },
{ label: 'Indian passport', id: 'IN_PASSPORT', group: 'India' },
],
// Driven by the shared catalog (includes VIN and custom recognizers) so the
// block and the Data Retention settings never drift.
options: PII_ENTITY_GROUPS.flatMap((group) =>
group.entities.map((entity) => ({
label: entity.label,
id: entity.value,
group: group.label,
}))
),
condition: {
field: 'validationType',
value: ['pii'],
Expand Down
28 changes: 28 additions & 0 deletions apps/sim/lib/api/contracts/hotspots.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,34 @@ export const guardrailsValidateContract = defineRouteContract({
},
})

/**
 * Request body for the batch PII masking endpoint.
 * - `texts`: the strings to mask; at most 100,000 entries (an empty array is accepted).
 * - `entityTypes`: entity type names, each a non-empty string; at most 200 entries
 *   (an empty array is accepted).
 * - `language`: optional string of 1 to 20 characters.
 */
const guardrailsMaskBatchBodySchema = z.object({
texts: z.array(z.string()).max(100_000),
entityTypes: z.array(z.string().min(1, 'Entity type cannot be empty')).max(200),
language: z.string().min(1).max(20).optional(),
})

/**
 * Response body for the batch PII masking endpoint: an array of masked strings.
 * NOTE(review): presumably one entry per input text, in input order — the schema
 * itself does not enforce this; confirm against the route implementation.
 */
const guardrailsMaskBatchResponseSchema = z.object({
masked: z.array(z.string()),
})

/**
 * Internal batch PII masking. Called server-to-server (internal JWT) from the
 * log-redaction persist path so Presidio always runs in the app container,
 * including for async executions that persist inside the trigger.dev runtime.
 *
 * Declares `POST /api/guardrails/mask-batch` with `guardrailsMaskBatchBodySchema`
 * as the request body and `guardrailsMaskBatchResponseSchema` as the JSON response.
 */
export const guardrailsMaskBatchContract = defineRouteContract({
method: 'POST',
path: '/api/guardrails/mask-batch',
body: guardrailsMaskBatchBodySchema,
response: {
mode: 'json',
schema: guardrailsMaskBatchResponseSchema,
},
})

/** Request body type as supplied by a caller (the schema's input type). */
export type GuardrailsMaskBatchBody = z.input<typeof guardrailsMaskBatchBodySchema>
/** Response body type after parsing (the schema's output type). */
export type GuardrailsMaskBatchResult = z.output<typeof guardrailsMaskBatchResponseSchema>

const chatMessageSchema = z.object({
role: z.enum(['user', 'assistant', 'system']),
content: z.string(),
Expand Down
2 changes: 2 additions & 0 deletions apps/sim/lib/core/config/env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,8 @@ export const env = createEnv({
PORT: z.number().optional(), // Main application port
INTERNAL_API_BASE_URL: z.string().optional(), // Optional internal base URL for server-side self-calls; must include protocol if set (e.g., http://sim-app.namespace.svc.cluster.local:3000)
ALLOWED_ORIGINS: z.string().optional(), // CORS allowed origins
PRESIDIO_ANALYZER_URL: z.string().optional(), // Presidio analyzer sidecar base URL for PII detection (default http://localhost:5002)
PRESIDIO_ANONYMIZER_URL: z.string().optional(), // Presidio anonymizer sidecar base URL for PII masking (default http://localhost:5001)

// OAuth Integration Credentials - All optional, enables third-party integrations
GOOGLE_CLIENT_ID: z.string().optional(), // Google OAuth client ID for Google services
Expand Down
13 changes: 0 additions & 13 deletions apps/sim/lib/guardrails/.gitignore

This file was deleted.

33 changes: 18 additions & 15 deletions apps/sim/lib/guardrails/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,26 @@ For **hallucination detection**, you'll need:
- A knowledge base with documents
- An LLM provider API key (or use hosted models)

### Python Validators (PII Detection)
### PII Detection (Presidio sidecars)

For **PII detection**, you need to set up a Python virtual environment and install Microsoft Presidio:
PII detection runs against two long-lived **Microsoft Presidio sidecar containers** reached over
HTTP — the analyzer (NLP detection) and the anonymizer (masking). In deployment they run alongside the
app container in the same ECS task; locally, run the official images:

```bash
cd apps/sim/lib/guardrails
./setup.sh
docker run -d -p 5002:3000 mcr.microsoft.com/presidio-analyzer:latest
docker run -d -p 5001:3000 mcr.microsoft.com/presidio-anonymizer:latest
```

This will:
1. Create a Python virtual environment in `apps/sim/lib/guardrails/venv`
2. Install required dependencies:
- `presidio-analyzer` - PII detection engine
- `presidio-anonymizer` - PII masking/anonymization
Point the app at them (defaults shown):

The TypeScript wrapper will automatically use the virtual environment's Python interpreter.
```bash
PRESIDIO_ANALYZER_URL=http://localhost:5002
PRESIDIO_ANONYMIZER_URL=http://localhost:5001
```

VIN recognition (check-digit validated) is implemented in TypeScript (`vin.ts`); VIN detection is
never delegated to the sidecars. No Python or local venv is required.

## Usage

Expand Down Expand Up @@ -93,10 +97,9 @@ See [Presidio documentation](https://microsoft.github.io/presidio/supported_enti
- `validate_json.ts` - JSON validation (TypeScript)
- `validate_regex.ts` - Regex validation (TypeScript)
- `validate_hallucination.ts` - Hallucination detection with RAG + LLM scoring (TypeScript)
- `validate_pii.ts` - PII detection TypeScript wrapper (TypeScript)
- `validate_pii.py` - PII detection using Microsoft Presidio (Python)
- `validate_pii.ts` - PII detection client: calls the Presidio analyzer/anonymizer sidecars (TypeScript)
- `vin.ts` - Check-digit-validated VIN recognizer (TypeScript)
- `pii-entities.ts` - Client-safe PII entity catalog
- `mask-client.ts` - Internal HTTP client for batch PII masking from the log-redaction persist path
- `validate.test.ts` - Test suite for JSON and regex validators
- `validate_hallucination.py` - Legacy Python hallucination detector (deprecated)
- `requirements.txt` - Python dependencies for PII detection (and legacy hallucination)
- `setup.sh` - Legacy installation script (deprecated)

68 changes: 68 additions & 0 deletions apps/sim/lib/guardrails/mask-client.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/**
* @vitest-environment node
*/
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'

// Mock functions are created inside vi.hoisted so the vi.mock factories below can reference them.
const { mockToken, mockBaseUrl } = vi.hoisted(() => ({
mockToken: vi.fn(),
mockBaseUrl: vi.fn(),
}))

// Stub internal token generation and internal base-URL resolution with the mocks above.
vi.mock('@/lib/auth/internal', () => ({ generateInternalToken: mockToken }))
vi.mock('@/lib/core/utils/urls', () => ({ getInternalApiBaseUrl: mockBaseUrl }))

import { maskPIIBatchViaHttp } from '@/lib/guardrails/mask-client'

/**
 * Unit tests for the HTTP client that masks PII batches. Token generation,
 * base-URL resolution and the global `fetch` are all stubbed, so no network
 * traffic is involved.
 */
describe('maskPIIBatchViaHttp', () => {
  let fetchMock: ReturnType<typeof vi.fn>

  beforeEach(() => {
    vi.clearAllMocks()
    mockToken.mockResolvedValue('tok')
    mockBaseUrl.mockReturnValue('http://app.internal:3000')

    // Fake endpoint: echoes every posted text back wrapped as `M(<text>)`.
    fetchMock = vi.fn(async (_url: string, init: { body: string }) => {
      const payload = JSON.parse(init.body) as { texts: string[] }
      const masked = payload.texts.map((text) => `M(${text})`)
      return new Response(JSON.stringify({ masked }), {
        status: 200,
        headers: { 'content-type': 'application/json' },
      })
    })
    vi.stubGlobal('fetch', fetchMock)
  })

  afterEach(() => {
    // Restore the real global `fetch` so the stub cannot leak into other suites.
    vi.unstubAllGlobals()
  })

  it('masks a small batch in a single request, with an abort timeout', async () => {
    const result = await maskPIIBatchViaHttp(['a', 'b', 'c'], ['EMAIL_ADDRESS'])

    expect(result).toEqual(['M(a)', 'M(b)', 'M(c)'])
    expect(fetchMock).toHaveBeenCalledTimes(1)
    const firstCallInit = fetchMock.mock.calls[0][1]
    expect(firstCallInit.signal).toBeInstanceOf(AbortSignal)
  })

  it('splits by count into multiple requests, preserving global order', async () => {
    const inputs = Array.from({ length: 5000 }, (_, index) => `t${index}`)

    const result = await maskPIIBatchViaHttp(inputs, [])

    expect(result).toHaveLength(5000)
    // First and last entries pin the ordering across request boundaries.
    expect(result[0]).toBe('M(t0)')
    expect(result[4999]).toBe('M(t4999)')
    expect(fetchMock).toHaveBeenCalledTimes(3) // 2000-per-request cap
  })

  it('throws on a non-2xx response so the caller can scrub', async () => {
    const serverError = new Response('boom', { status: 500 })
    fetchMock.mockResolvedValueOnce(serverError)

    const pending = maskPIIBatchViaHttp(['a'], [])

    await expect(pending).rejects.toThrow(/mask-batch request failed/)
  })

  it('returns [] without any request for empty input', async () => {
    const result = await maskPIIBatchViaHttp([], [])

    expect(result).toEqual([])
    expect(fetchMock).not.toHaveBeenCalled()
  })
})
Loading
Loading