diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 017d5f14768..199910a5703 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -88,6 +88,8 @@ jobs: ecr_repo_secret: ECR_MIGRATIONS - dockerfile: ./docker/realtime.Dockerfile ecr_repo_secret: ECR_REALTIME + - dockerfile: ./docker/pii.Dockerfile + ecr_repo_secret: ECR_PII steps: - name: Checkout code uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -115,7 +117,7 @@ jobs: id: ecr-repo run: echo "name=$ECR_REPO" >> $GITHUB_OUTPUT env: - ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || '' }} + ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || matrix.ecr_repo_secret == 'ECR_PII' && secrets.ECR_PII || '' }} - name: Build and push uses: useblacksmith/build-push-action@fb9e3e6a9299c78462bfadd0d93352c316adc9b8 # v2 @@ -153,6 +155,10 @@ jobs: - dockerfile: ./docker/realtime.Dockerfile ghcr_image: ghcr.io/simstudioai/realtime ecr_repo_secret: ECR_REALTIME + # pii is ECR-only (private ECS sidecar) — no ghcr_image, so the tag + # step below skips GHCR for it. 
+ - dockerfile: ./docker/pii.Dockerfile + ecr_repo_secret: ECR_PII steps: - name: Checkout code uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 @@ -188,7 +194,7 @@ jobs: id: ecr-repo run: echo "name=$ECR_REPO" >> $GITHUB_OUTPUT env: - ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || '' }} + ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || matrix.ecr_repo_secret == 'ECR_PII' && secrets.ECR_PII || '' }} - name: Generate tags id: meta @@ -206,7 +212,7 @@ jobs: TAGS="${ECR_IMAGE}" - if [ "${{ github.ref }}" = "refs/heads/main" ]; then + if [ "${{ github.ref }}" = "refs/heads/main" ] && [ -n "$GHCR_IMAGE" ]; then GHCR_AMD64="${GHCR_IMAGE}:latest-amd64" GHCR_SHA="${GHCR_IMAGE}:${{ github.sha }}-amd64" TAGS="${TAGS},$GHCR_AMD64,$GHCR_SHA" diff --git a/.github/workflows/images.yml b/.github/workflows/images.yml index 54f8a2f47a8..78b7db6510c 100644 --- a/.github/workflows/images.yml +++ b/.github/workflows/images.yml @@ -26,6 +26,9 @@ jobs: - dockerfile: ./docker/realtime.Dockerfile ghcr_image: ghcr.io/simstudioai/realtime ecr_repo_secret: ECR_REALTIME + # pii is ECR-only (private ECS sidecar) — no ghcr_image. 
+ - dockerfile: ./docker/pii.Dockerfile + ecr_repo_secret: ECR_PII outputs: registry: ${{ steps.login-ecr.outputs.registry }} @@ -80,8 +83,8 @@ jobs: # Build tags list TAGS="${ECR_IMAGE}" - # Add GHCR tags only for main branch - if [ "${{ github.ref }}" = "refs/heads/main" ]; then + # Add GHCR tags only for main branch (and only for images with a GHCR target) + if [ "${{ github.ref }}" = "refs/heads/main" ] && [ -n "$GHCR_IMAGE" ]; then GHCR_AMD64="${GHCR_IMAGE}:latest-amd64" GHCR_SHA="${GHCR_IMAGE}:${{ github.sha }}-amd64" TAGS="${TAGS},$GHCR_AMD64,$GHCR_SHA" diff --git a/apps/pii/package.json b/apps/pii/package.json new file mode 100644 index 00000000000..0c3c3807feb --- /dev/null +++ b/apps/pii/package.json @@ -0,0 +1,6 @@ +{ + "name": "@sim/pii", + "version": "0.0.0", + "private": true, + "description": "PII detection + anonymization service (Microsoft Presidio, FastAPI). Python service built as a container image (docker/pii.Dockerfile); not part of the JS/turbo build." +} diff --git a/apps/pii/requirements.txt b/apps/pii/requirements.txt new file mode 100644 index 00000000000..bd120fd57c7 --- /dev/null +++ b/apps/pii/requirements.txt @@ -0,0 +1,10 @@ +# Pinned for reproducible image builds. Bump deliberately. +presidio-analyzer==2.2.362 +presidio-anonymizer==2.2.362 +spacy==3.8.14 +fastapi==0.138.0 +uvicorn[standard]==0.49.0 + +# The spaCy models (en_core_web_lg plus the es/it/pl/fi *_core_news_lg models, +# ~2.2GB total) are fetched and pinned in the Dockerfile via curl-with-retry rather +# than here — a direct pip wheel URL truncates on flaky networks and fails wheel validation. diff --git a/apps/pii/server.py b/apps/pii/server.py new file mode 100644 index 00000000000..597fe8f3d90 --- /dev/null +++ b/apps/pii/server.py @@ -0,0 +1,212 @@ +"""Combined Presidio REST service: analyzer + anonymizer on one port.
+ +Constructs one warm AnalyzerEngine (multi-language NLP + a native check-digit +VIN recognizer) and one AnonymizerEngine at startup, exposing stock-compatible +endpoints so a single PRESIDIO_URL serves both. +""" + +from typing import Any + +from fastapi import FastAPI +from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult +from presidio_analyzer.nlp_engine import NlpEngineProvider +from presidio_analyzer.predefined_recognizers import ( + AuAbnRecognizer, + AuAcnRecognizer, + AuMedicareRecognizer, + AuTfnRecognizer, + EsNieRecognizer, + EsNifRecognizer, + FiPersonalIdentityCodeRecognizer, + InAadhaarRecognizer, + InPanRecognizer, + InPassportRecognizer, + InVehicleRegistrationRecognizer, + InVoterRecognizer, + ItDriverLicenseRecognizer, + ItFiscalCodeRecognizer, + ItIdentityCardRecognizer, + ItPassportRecognizer, + ItVatCodeRecognizer, + PlPeselRecognizer, + SgFinRecognizer, + SgUenRecognizer, + UkNinoRecognizer, +) +from presidio_anonymizer import AnonymizerEngine +from presidio_anonymizer.entities import OperatorConfig +from pydantic import BaseModel + +# Languages served. Each needs its spaCy model installed in the image; the +# es/it/pl/fi national-id recognizers (ES_NIF, IT_FISCAL_CODE, PL_PESEL, ...) are registered +# explicitly via EXTRA_RECOGNIZERS and only fire once their language's NLP model is loaded. +NLP_CONFIGURATION = { + "nlp_engine_name": "spacy", + "models": [ + {"lang_code": "en", "model_name": "en_core_web_lg"}, + {"lang_code": "es", "model_name": "es_core_news_lg"}, + {"lang_code": "it", "model_name": "it_core_news_lg"}, + {"lang_code": "pl", "model_name": "pl_core_news_lg"}, + {"lang_code": "fi", "model_name": "fi_core_news_lg"}, + ], +} +SUPPORTED_LANGUAGES = [m["lang_code"] for m in NLP_CONFIGURATION["models"]] + +# Predefined recognizers Presidio ships but does NOT load into the default +# registry — they must be added explicitly. Each carries its own +# supported_language, so it fires under that language once its NLP model is +# loaded.
en: UK/AU/IN/SG locale ids; es/it/pl/fi: national ids. +EXTRA_RECOGNIZERS = [ + UkNinoRecognizer, + AuAbnRecognizer, + AuAcnRecognizer, + AuTfnRecognizer, + AuMedicareRecognizer, + InPanRecognizer, + InAadhaarRecognizer, + InVehicleRegistrationRecognizer, + InVoterRecognizer, + InPassportRecognizer, + SgFinRecognizer, + SgUenRecognizer, + EsNifRecognizer, + EsNieRecognizer, + ItFiscalCodeRecognizer, + ItDriverLicenseRecognizer, + ItVatCodeRecognizer, + ItPassportRecognizer, + ItIdentityCardRecognizer, + PlPeselRecognizer, + FiPersonalIdentityCodeRecognizer, +] + + +class VinRecognizer(PatternRecognizer): + """VIN (17 chars, A-Z/0-9 excluding I/O/Q) with ISO 3779 check-digit + validation (position 9). Validation makes accidental matches on arbitrary + 17-char codes (request ids, SKUs, tokens) extremely unlikely. Some + non-North-American VINs omit the check digit and are skipped — an + intentional bias toward precision. + """ + + _TRANSLIT = { + **{str(d): d for d in range(10)}, + "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, + "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9, + "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9, + } + _WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2] + + def validate_result(self, pattern_text: str): + vin = pattern_text.upper() + if len(vin) != 17: + return False + try: + total = sum(self._TRANSLIT[c] * w for c, w in zip(vin, self._WEIGHTS)) + except KeyError: + return False + check = total % 11 + expected = "X" if check == 10 else str(check) + return vin[8] == expected + + +def build_analyzer() -> AnalyzerEngine: + nlp_engine = NlpEngineProvider(nlp_configuration=NLP_CONFIGURATION).create_engine() + analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=SUPPORTED_LANGUAGES) + # VIN is language-agnostic, so register it under every served language — + # a recognizer only fires for the language the caller routes to. 
+ vin_pattern = Pattern(name="vin", regex=r"\b[A-HJ-NPR-Z0-9]{17}\b", score=0.7) + for language in SUPPORTED_LANGUAGES: + analyzer.registry.add_recognizer( + VinRecognizer( + supported_entity="VIN", + patterns=[vin_pattern], + context=["vin", "vehicle", "chassis"], + supported_language=language, + ) + ) + for recognizer_cls in EXTRA_RECOGNIZERS: + analyzer.registry.add_recognizer(recognizer_cls()) + return analyzer + + +analyzer = build_analyzer() +anonymizer = AnonymizerEngine() + +app = FastAPI(title="Sim Presidio", docs_url=None, redoc_url=None) + + +class AnalyzeRequest(BaseModel): + text: str + language: str = "en" + entities: list[str] | None = None + score_threshold: float | None = None + return_decision_process: bool = False + + +class AnonymizeRequest(BaseModel): + text: str + analyzer_results: list[dict[str, Any]] = [] + anonymizers: dict[str, dict[str, Any]] | None = None + operators: dict[str, dict[str, Any]] | None = None + + +@app.get("/health") +def health() -> dict[str, str]: + return {"status": "ok"} + + +@app.get("/supportedentities") +def supported_entities(language: str = "en") -> list[str]: + return analyzer.get_supported_entities(language) + + +@app.post("/analyze") +def analyze(req: AnalyzeRequest) -> list[dict[str, Any]]: + results = analyzer.analyze( + text=req.text, + language=req.language, + entities=req.entities or None, + score_threshold=req.score_threshold, + return_decision_process=req.return_decision_process, + ) + return [r.to_dict() for r in results] + + +@app.post("/anonymize") +def anonymize(req: AnonymizeRequest) -> dict[str, Any]: + analyzer_results = [ + RecognizerResult( + entity_type=r["entity_type"], + start=r["start"], + end=r["end"], + score=r.get("score", 1.0), + ) + for r in req.analyzer_results + ] + raw_operators = req.anonymizers or req.operators + operators = None + if raw_operators: + operators = {} + for entity, raw_cfg in raw_operators.items(): + op_cfg = dict(raw_cfg) + op_type = op_cfg.pop("type", "replace") + 
operators[entity] = OperatorConfig(op_type, op_cfg) + result = anonymizer.anonymize( + text=req.text, + analyzer_results=analyzer_results, + operators=operators, + ) + return { + "text": result.text, + "items": [ + { + "operator": item.operator, + "entity_type": item.entity_type, + "start": item.start, + "end": item.end, + "text": item.text, + } + for item in result.items + ], + } diff --git a/bun.lock b/bun.lock index ec76f4c62d8..eebbc993925 100644 --- a/bun.lock +++ b/bun.lock @@ -54,6 +54,10 @@ "typescript": "^5.8.2", }, }, + "apps/pii": { + "name": "@sim/pii", + "version": "0.0.0", + }, "apps/realtime": { "name": "@sim/realtime", "version": "0.1.0", @@ -1423,6 +1427,8 @@ "@sim/logger": ["@sim/logger@workspace:packages/logger"], + "@sim/pii": ["@sim/pii@workspace:apps/pii"], + "@sim/platform-authz": ["@sim/platform-authz@workspace:packages/platform-authz"], "@sim/realtime": ["@sim/realtime@workspace:apps/realtime"], diff --git a/docker/pii.Dockerfile b/docker/pii.Dockerfile new file mode 100644 index 00000000000..1045a762e6e --- /dev/null +++ b/docker/pii.Dockerfile @@ -0,0 +1,48 @@ +# ======================================== +# Combined Presidio service (analyzer + anonymizer) on a single port (3000) +# ======================================== +FROM python:3.12-slim-bookworm AS base + +WORKDIR /app + +# build-essential for any sdist that compiles native deps (e.g. blis/thinc). +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt-get update && apt-get install -y --no-install-recommends \ + build-essential curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Pinned Python deps. Separate layer so source edits don't reinstall them. +COPY apps/pii/requirements.txt ./requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -r requirements.txt + +# Pinned spaCy models (en + es/it/pl/fi, ~2.2GB total). 
Downloaded with +# retries/resume — the large wheels truncate on flaky networks if pip fetches +# the URLs directly. +ARG SPACY_MODELS="en_core_web_lg-3.8.0 es_core_news_lg-3.8.0 it_core_news_lg-3.8.0 pl_core_news_lg-3.8.0 fi_core_news_lg-3.8.0" +RUN --mount=type=cache,target=/root/.cache/pip \ + for model in ${SPACY_MODELS}; do \ + whl="${model}-py3-none-any.whl"; \ + curl -fL --retry 5 --retry-delay 5 --retry-all-errors -C - \ + -o "/tmp/${whl}" \ + "https://github.com/explosion/spacy-models/releases/download/${model}/${whl}" || exit 1; \ + done && \ + pip install /tmp/*.whl && \ + rm /tmp/*.whl + +COPY apps/pii/server.py ./server.py + +RUN groupadd -g 1001 pii && \ + useradd -u 1001 -g pii pii && \ + chown -R pii:pii /app +USER pii + +EXPOSE 3000 + +# start-period is generous: five large spaCy models load at import before +# /health responds. Tune against measured cold-start once built. +HEALTHCHECK --interval=30s --timeout=5s --start-period=180s --retries=3 \ + CMD curl -fsS http://localhost:3000/health || exit 1 + +CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3000"]