Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ jobs:
ecr_repo_secret: ECR_MIGRATIONS
- dockerfile: ./docker/realtime.Dockerfile
ecr_repo_secret: ECR_REALTIME
- dockerfile: ./docker/pii.Dockerfile
ecr_repo_secret: ECR_PII
steps:
- name: Checkout code
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
Expand Down Expand Up @@ -115,7 +117,7 @@ jobs:
id: ecr-repo
run: echo "name=$ECR_REPO" >> $GITHUB_OUTPUT
env:
ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || '' }}
ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || matrix.ecr_repo_secret == 'ECR_PII' && secrets.ECR_PII || '' }}

- name: Build and push
uses: useblacksmith/build-push-action@fb9e3e6a9299c78462bfadd0d93352c316adc9b8 # v2
Expand Down Expand Up @@ -153,6 +155,10 @@ jobs:
- dockerfile: ./docker/realtime.Dockerfile
ghcr_image: ghcr.io/simstudioai/realtime
ecr_repo_secret: ECR_REALTIME
# pii is ECR-only (private ECS sidecar) — no ghcr_image, so the tag
# step below skips GHCR for it.
- dockerfile: ./docker/pii.Dockerfile
ecr_repo_secret: ECR_PII
steps:
- name: Checkout code
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
Expand Down Expand Up @@ -188,7 +194,7 @@ jobs:
id: ecr-repo
run: echo "name=$ECR_REPO" >> $GITHUB_OUTPUT
env:
ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || '' }}
ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || matrix.ecr_repo_secret == 'ECR_PII' && secrets.ECR_PII || '' }}

- name: Generate tags
id: meta
Expand All @@ -206,7 +212,7 @@ jobs:

TAGS="${ECR_IMAGE}"

if [ "${{ github.ref }}" = "refs/heads/main" ]; then
if [ "${{ github.ref }}" = "refs/heads/main" ] && [ -n "$GHCR_IMAGE" ]; then
GHCR_AMD64="${GHCR_IMAGE}:latest-amd64"
GHCR_SHA="${GHCR_IMAGE}:${{ github.sha }}-amd64"
TAGS="${TAGS},$GHCR_AMD64,$GHCR_SHA"
Expand Down
7 changes: 5 additions & 2 deletions .github/workflows/images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ jobs:
- dockerfile: ./docker/realtime.Dockerfile
ghcr_image: ghcr.io/simstudioai/realtime
ecr_repo_secret: ECR_REALTIME
# pii is ECR-only (private ECS sidecar) — no ghcr_image.
- dockerfile: ./docker/pii.Dockerfile
ecr_repo_secret: ECR_PII
outputs:
registry: ${{ steps.login-ecr.outputs.registry }}

Expand Down Expand Up @@ -80,8 +83,8 @@ jobs:
# Build tags list
TAGS="${ECR_IMAGE}"

# Add GHCR tags only for main branch
if [ "${{ github.ref }}" = "refs/heads/main" ]; then
# Add GHCR tags only for main branch (and only for images with a GHCR target)
if [ "${{ github.ref }}" = "refs/heads/main" ] && [ -n "$GHCR_IMAGE" ]; then
GHCR_AMD64="${GHCR_IMAGE}:latest-amd64"
GHCR_SHA="${GHCR_IMAGE}:${{ github.sha }}-amd64"
TAGS="${TAGS},$GHCR_AMD64,$GHCR_SHA"
Expand Down
6 changes: 6 additions & 0 deletions apps/pii/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"name": "@sim/pii",
"version": "0.0.0",
"private": true,
"description": "PII detection + anonymization service (Microsoft Presidio, FastAPI). Python service built as a container image (docker/pii.Dockerfile); not part of the JS/turbo build."
}
10 changes: 10 additions & 0 deletions apps/pii/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Pinned for reproducible image builds. Bump deliberately.
presidio-analyzer==2.2.362
presidio-anonymizer==2.2.362
spacy==3.8.14
fastapi==0.138.0
uvicorn[standard]==0.49.0

# The English spaCy model (en_core_web_lg, ~400MB) is fetched + pinned in the
# Dockerfile via curl-with-retry rather than here — a direct pip wheel URL
# truncates on flaky networks and fails wheel validation.
212 changes: 212 additions & 0 deletions apps/pii/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
"""Combined Presidio REST service: analyzer + anonymizer on one port.

Constructs one warm AnalyzerEngine (multi-language NLP + a native check-digit
VIN recognizer) and one AnonymizerEngine at startup, exposing stock-compatible
endpoints so a single PRESIDIO_URL serves both.
"""

from typing import Any

from fastapi import FastAPI, HTTPException
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.predefined_recognizers import (
    AuAbnRecognizer,
    AuAcnRecognizer,
    AuMedicareRecognizer,
    AuTfnRecognizer,
    EsNieRecognizer,
    EsNifRecognizer,
    FiPersonalIdentityCodeRecognizer,
    InAadhaarRecognizer,
    InPanRecognizer,
    InPassportRecognizer,
    InVehicleRegistrationRecognizer,
    InVoterRecognizer,
    ItDriverLicenseRecognizer,
    ItFiscalCodeRecognizer,
    ItIdentityCardRecognizer,
    ItPassportRecognizer,
    ItVatCodeRecognizer,
    PlPeselRecognizer,
    SgFinRecognizer,
    SgUenRecognizer,
    UkNinoRecognizer,
)
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from pydantic import BaseModel

# Served languages and the spaCy model backing each one. Every model listed
# here must be installed in the image; the es/it/pl/fi predefined recognizers
# (ES_NIF, IT_FISCAL_CODE, PL_PESEL, ...) auto-load once their NLP engine is
# present.
_SPACY_MODELS = (
    ("en", "en_core_web_lg"),
    ("es", "es_core_news_lg"),
    ("it", "it_core_news_lg"),
    ("pl", "pl_core_news_lg"),
    ("fi", "fi_core_news_lg"),
)

# Configuration dict handed to NlpEngineProvider.
NLP_CONFIGURATION = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": code, "model_name": model} for code, model in _SPACY_MODELS
    ],
}

# Language codes in the same order as the models above.
SUPPORTED_LANGUAGES = [code for code, _ in _SPACY_MODELS]

# Predefined recognizers Presidio ships but does NOT load into the default
# registry — they must be added explicitly. Each carries its own
# supported_language, so it fires under that language once its NLP model is
# loaded. en: UK/AU/IN/SG locale ids; es/it/pl/fi: national ids.
# Every class listed here is instantiated with no arguments by
# build_analyzer().
EXTRA_RECOGNIZERS = [
    # en — United Kingdom
    UkNinoRecognizer,
    # en — Australia
    AuAbnRecognizer,
    AuAcnRecognizer,
    AuTfnRecognizer,
    AuMedicareRecognizer,
    # en — India
    InPanRecognizer,
    InAadhaarRecognizer,
    InVehicleRegistrationRecognizer,
    InVoterRecognizer,
    InPassportRecognizer,
    # en — Singapore
    SgFinRecognizer,
    SgUenRecognizer,
    # es — Spain
    EsNifRecognizer,
    EsNieRecognizer,
    # it — Italy
    ItFiscalCodeRecognizer,
    ItDriverLicenseRecognizer,
    ItVatCodeRecognizer,
    ItPassportRecognizer,
    ItIdentityCardRecognizer,
    # pl — Poland
    PlPeselRecognizer,
    # fi — Finland
    FiPersonalIdentityCodeRecognizer,
]


class VinRecognizer(PatternRecognizer):
    """Recognizer for vehicle identification numbers (VINs).

    A candidate is 17 characters drawn from A-Z/0-9 without I, O or Q, and is
    accepted only when its ISO 3779 check digit (position 9) matches the
    weighted sum of the other characters. The mod-11 check filters out most
    arbitrary 17-character codes (request ids, SKUs, tokens). Some
    non-North-American VINs omit the check digit and are therefore skipped —
    an intentional bias toward precision.
    """

    # Transliteration values for the check-digit sum. I, O and Q are absent on
    # purpose: a candidate containing them is rejected.
    _TRANSLIT = {
        **{str(digit): digit for digit in range(10)},
        **dict(zip("ABCDEFGH", (1, 2, 3, 4, 5, 6, 7, 8))),
        **dict(zip("JKLMNPR", (1, 2, 3, 4, 5, 7, 9))),
        **dict(zip("STUVWXYZ", (2, 3, 4, 5, 6, 7, 8, 9))),
    }
    # Per-position weights; position 9 (index 8) is the check digit itself and
    # carries weight 0.
    _WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    def validate_result(self, pattern_text: str):
        """Return True when ``pattern_text`` has a valid VIN check digit.

        The text is upper-cased first. Anything that is not exactly 17
        characters, or that contains a character outside the transliteration
        table, is rejected.
        """
        candidate = pattern_text.upper()
        if len(candidate) != 17:
            return False

        weighted_sum = 0
        for char, weight in zip(candidate, self._WEIGHTS):
            value = self._TRANSLIT.get(char)
            if value is None:
                return False
            weighted_sum += value * weight

        # A remainder of 10 is written as the letter X in the check position.
        remainder = weighted_sum % 11
        check_char = "X" if remainder == 10 else str(remainder)
        return candidate[8] == check_char


def build_analyzer() -> AnalyzerEngine:
    """Create the AnalyzerEngine used by the service.

    Builds the spaCy NLP engine from NLP_CONFIGURATION, then registers one
    VinRecognizer per served language followed by an instance of every class
    in EXTRA_RECOGNIZERS.

    Returns:
        The configured AnalyzerEngine.
    """
    provider = NlpEngineProvider(nlp_configuration=NLP_CONFIGURATION)
    engine = AnalyzerEngine(
        nlp_engine=provider.create_engine(),
        supported_languages=SUPPORTED_LANGUAGES,
    )
    registry = engine.registry

    # VIN is language-agnostic, but a recognizer only fires for the language
    # the caller routes to, so one copy is registered per served language.
    vin_pattern = Pattern(name="vin", regex=r"\b[A-HJ-NPR-Z0-9]{17}\b", score=0.7)
    for lang_code in SUPPORTED_LANGUAGES:
        vin_recognizer = VinRecognizer(
            supported_entity="VIN",
            patterns=[vin_pattern],
            context=["vin", "vehicle", "chassis"],
            supported_language=lang_code,
        )
        registry.add_recognizer(vin_recognizer)

    # The extra predefined recognizers are created with their own defaults.
    for extra_cls in EXTRA_RECOGNIZERS:
        registry.add_recognizer(extra_cls())

    return engine


# Both engines are constructed once, at import time; the request handlers
# in this module reuse these module-level instances.
analyzer = build_analyzer()
anonymizer = AnonymizerEngine()

# docs_url=None and redoc_url=None turn off FastAPI's interactive
# documentation routes.
app = FastAPI(title="Sim Presidio", docs_url=None, redoc_url=None)


class AnalyzeRequest(BaseModel):
    """Request body for POST /analyze."""

    # Text to scan.
    text: str
    # Language code the analysis is routed to; defaults to English.
    language: str = "en"
    # Entity types to restrict detection to. None or an empty list is
    # forwarded to the analyzer as None.
    entities: list[str] | None = None
    # Forwarded unchanged to AnalyzerEngine.analyze.
    score_threshold: float | None = None
    # Forwarded unchanged to AnalyzerEngine.analyze.
    return_decision_process: bool = False


class AnonymizeRequest(BaseModel):
    """Request body for POST /anonymize."""

    # Text to anonymize.
    text: str
    # Findings to act on. The handler reads "entity_type", "start" and "end"
    # from each item, plus an optional "score" (1.0 when absent).
    analyzer_results: list[dict[str, Any]] = []
    # Per-entity operator configs. Each value's "type" key names the operator
    # ("replace" when absent); the remaining keys become its parameters.
    # `anonymizers` is used when it is non-empty, otherwise `operators`.
    anonymizers: dict[str, dict[str, Any]] | None = None
    operators: dict[str, dict[str, Any]] | None = None


@app.get("/health")
def health() -> dict[str, str]:
    """Report that the service is up; the body is always the same."""
    payload = {"status": "ok"}
    return payload


@app.get("/supportedentities")
def supported_entities(language: str = "en") -> list[str]:
    """List the entity types the analyzer can detect for ``language``.

    Args:
        language: Language code to query; defaults to "en".

    Returns:
        The entity names reported by ``AnalyzerEngine.get_supported_entities``.

    Raises:
        HTTPException: 400 when ``language`` is not one of SUPPORTED_LANGUAGES.
    """
    # Reject unserved languages up front so the caller gets a client error
    # rather than an unhandled exception (HTTP 500) from the registry lookup.
    if language not in SUPPORTED_LANGUAGES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language {language!r}; supported: {SUPPORTED_LANGUAGES}",
        )
    return analyzer.get_supported_entities(language)


@app.post("/analyze")
def analyze(req: AnalyzeRequest) -> list[dict[str, Any]]:
    """Detect PII entities in ``req.text``.

    Args:
        req: Text, language and optional analysis settings.

    Returns:
        One dict per finding, produced by ``RecognizerResult.to_dict()``.

    Raises:
        HTTPException: 400 when the language is not one of
            SUPPORTED_LANGUAGES, or when the analyzer rejects the request
            with a ValueError.
    """
    # Reject unserved languages up front with a message that names the
    # supported set, instead of letting the request fail as an HTTP 500.
    if req.language not in SUPPORTED_LANGUAGES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language {req.language!r}; supported: {SUPPORTED_LANGUAGES}",
        )
    try:
        results = analyzer.analyze(
            text=req.text,
            language=req.language,
            # An empty list means "no restriction", same as None.
            entities=req.entities or None,
            score_threshold=req.score_threshold,
            return_decision_process=req.return_decision_process,
        )
    except ValueError as exc:
        # Presidio raises ValueError when no recognizer can serve the request
        # (for example, only unknown entity names were given). That is a
        # problem with the request, not with the server.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return [r.to_dict() for r in results]


@app.post("/anonymize")
def anonymize(req: AnonymizeRequest) -> dict[str, Any]:
    """Apply anonymization operators to ``req.text``.

    Args:
        req: Text, the analyzer findings to act on, and optional per-entity
            operator configs (under ``anonymizers`` or ``operators``;
            ``anonymizers`` is used when it is non-empty).

    Returns:
        A dict with the anonymized ``text`` and an ``items`` list describing
        each applied operator (operator, entity_type, start, end, text).

    Raises:
        HTTPException: 422 when an ``analyzer_results`` item lacks
            ``entity_type``, ``start`` or ``end``.
    """
    try:
        analyzer_results = [
            RecognizerResult(
                entity_type=r["entity_type"],
                start=r["start"],
                end=r["end"],
                # Findings without a score are treated as certain.
                score=r.get("score", 1.0),
            )
            for r in req.analyzer_results
        ]
    except KeyError as exc:
        # analyzer_results is caller-supplied and only typed as a list of
        # dicts, so a missing key is a malformed request, not a server fault.
        raise HTTPException(
            status_code=422,
            detail=f"analyzer_results item is missing required field {exc.args[0]!r}",
        ) from exc

    raw_operators = req.anonymizers or req.operators
    operators = None
    if raw_operators:
        operators = {}
        for entity, raw_cfg in raw_operators.items():
            # Copy before popping so the request model is left untouched.
            op_cfg = dict(raw_cfg)
            op_type = op_cfg.pop("type", "replace")
            operators[entity] = OperatorConfig(op_type, op_cfg)

    result = anonymizer.anonymize(
        text=req.text,
        analyzer_results=analyzer_results,
        operators=operators,
    )
    return {
        "text": result.text,
        "items": [
            {
                "operator": item.operator,
                "entity_type": item.entity_type,
                "start": item.start,
                "end": item.end,
                "text": item.text,
            }
            for item in result.items
        ],
    }
6 changes: 6 additions & 0 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 48 additions & 0 deletions docker/pii.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# ========================================
# Combined Presidio service (analyzer + anonymizer) on a single port (3000)
# ========================================
FROM python:3.12-slim-bookworm AS base

WORKDIR /app

# Debian-based images ship an apt hook (docker-clean) that deletes downloaded
# .deb files after each install, which would leave the /var/cache/apt cache
# mount below empty. Remove the hook and tell apt to keep packages so the
# cache mounts actually help on rebuilds.
RUN rm -f /etc/apt/apt.conf.d/docker-clean && \
    echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' \
      > /etc/apt/apt.conf.d/keep-cache

# build-essential for any sdist that compiles native deps (e.g. blis/thinc).
# There is no `rm -rf /var/lib/apt/lists/*` here: /var/lib/apt is a cache
# mount, so the lists never reach the image layer and deleting them would only
# empty the cache.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update && apt-get install -y --no-install-recommends \
      build-essential curl ca-certificates

# Pinned Python deps. Separate layer so source edits don't reinstall them.
COPY apps/pii/requirements.txt ./requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# Pinned spaCy models (en + es/it/pl/fi, ~2.2GB total). Downloaded with
# retries/resume — the large wheels truncate on flaky networks if pip fetches
# the URLs directly.
ARG SPACY_MODELS="en_core_web_lg-3.8.0 es_core_news_lg-3.8.0 it_core_news_lg-3.8.0 pl_core_news_lg-3.8.0 fi_core_news_lg-3.8.0"
RUN --mount=type=cache,target=/root/.cache/pip \
    for model in ${SPACY_MODELS}; do \
      whl="${model}-py3-none-any.whl"; \
      curl -fL --retry 5 --retry-delay 5 --retry-all-errors -C - \
        -o "/tmp/${whl}" \
        "https://github.com/explosion/spacy-models/releases/download/${model}/${whl}" || exit 1; \
    done && \
    pip install /tmp/*.whl && \
    rm /tmp/*.whl

COPY apps/pii/server.py ./server.py

# Run as an unprivileged user that owns the application directory.
RUN groupadd -g 1001 pii && \
    useradd -u 1001 -g pii pii && \
    chown -R pii:pii /app
USER pii

EXPOSE 3000

# start-period is generous: five large spaCy models load at import before
# /health responds. Tune against measured cold-start once built.
HEALTHCHECK --interval=30s --timeout=5s --start-period=180s --retries=3 \
    CMD curl -fsS http://localhost:3000/health || exit 1

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3000"]
Loading