Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions sentry_sdk/integrations/pydantic_ai/consts.py
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
import re

SPAN_ORIGIN = "auto.ai.pydantic_ai"

# Matches data URLs with base64-encoded content, e.g. "data:image/png;base64,iVBORw0K..."
# The MIME type follows the RFC 6838 "restricted-name" grammar (letters, digits and
# !#$&^_.+-), so types such as "image/svg+xml" or "video/3gpp" are recognised.
# Optional media-type parameters before ";base64" (e.g. ";charset=utf-8") are
# accepted but deliberately not captured, so the group numbering stays stable.
# Group 1: MIME type (e.g. "image/png"), Group 2: base64 data
DATA_URL_BASE64_REGEX = re.compile(
    r"^data:"
    r"([a-zA-Z0-9][a-zA-Z0-9!#$&^_.+\-]*/[a-zA-Z0-9][a-zA-Z0-9!#$&^_.+\-]*)"
    r"(?:;[^;,]+)*"
    r";base64,([A-Za-z0-9+/\-_]+={0,2})$"
)

Check warning on line 9 in sentry_sdk/integrations/pydantic_ai/consts.py

View workflow job for this annotation

GitHub Actions / warden: find-bugs

Regex fails to match valid MIME types, leaking base64 data instead of redacting it

The DATA_URL_BASE64_REGEX pattern `([a-zA-Z]+/[a-zA-Z]+)` only matches MIME types made up entirely of letters, but the media-type grammar (RFC 2045 §5.1; RFC 6838 §4.2) also allows digits, hyphens, periods, and plus signs. Valid data URLs such as `data:image/svg+xml;base64,...` or `data:video/3gpp;base64,...` therefore do not match, so `_serialize_image_url_item` falls through to its plain-URL branch and returns the full data URL, including the base64-encoded content that should have been redacted.
Comment thread
sentry[bot] marked this conversation as resolved.
Comment thread
ericapisani marked this conversation as resolved.
Comment thread
cursor[bot] marked this conversation as resolved.
Comment thread
ericapisani marked this conversation as resolved.
24 changes: 10 additions & 14 deletions sentry_sdk/integrations/pydantic_ai/spans/ai_client.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import json

import sentry_sdk
from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
from sentry_sdk.ai.utils import (
normalize_message_roles,
set_data_normalized,
truncate_and_annotate_messages,
get_modality_from_mime_type,
)
from sentry_sdk.consts import OP, SPANDATA
from sentry_sdk.utils import safe_serialize
Expand All @@ -21,7 +19,11 @@
get_current_agent,
get_is_streaming,
)
from .utils import _set_usage_data
from .utils import (
_serialize_binary_content_item,
_serialize_image_url_item,
_set_usage_data,
)

from typing import TYPE_CHECKING

Expand All @@ -40,6 +42,7 @@
TextPart,
ThinkingPart,
BinaryContent,
ImageUrl,
)
except ImportError:
# Fallback if these classes are not available
Expand All @@ -50,6 +53,7 @@
TextPart = None
ThinkingPart = None
BinaryContent = None
ImageUrl = None
Comment thread
ericapisani marked this conversation as resolved.


def _transform_system_instructions(
Expand Down Expand Up @@ -158,22 +162,14 @@ def _set_input_messages(span: "sentry_sdk.tracing.Span", messages: "Any") -> Non
for item in part.content:
if isinstance(item, str):
content.append({"type": "text", "text": item})
elif ImageUrl and isinstance(item, ImageUrl):
content.append(_serialize_image_url_item(item))
elif BinaryContent and isinstance(item, BinaryContent):
content.append(
{
"type": "blob",
"modality": get_modality_from_mime_type(
item.media_type
),
"mime_type": item.media_type,
"content": BLOB_DATA_SUBSTITUTE,
}
)
content.append(_serialize_binary_content_item(item))
else:
content.append(safe_serialize(item))
else:
content.append({"type": "text", "text": str(part.content)})

# Add message if we have content or tool calls
if content or tool_calls:
message: "Dict[str, Any]" = {"role": role}
Expand Down
24 changes: 10 additions & 14 deletions sentry_sdk/integrations/pydantic_ai/spans/invoke_agent.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import sentry_sdk
from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
from sentry_sdk.ai.utils import (
get_modality_from_mime_type,
get_start_span_function,
normalize_message_roles,
set_data_normalized,
Expand All @@ -16,17 +14,22 @@
_set_model_data,
_should_send_prompts,
)
from .utils import _set_usage_data
from .utils import (
_serialize_binary_content_item,
_serialize_image_url_item,
_set_usage_data,
)

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any

try:
from pydantic_ai.messages import BinaryContent # type: ignore
from pydantic_ai.messages import BinaryContent, ImageUrl # type: ignore
except ImportError:
BinaryContent = None
ImageUrl = None


def invoke_agent_span(
Expand Down Expand Up @@ -105,17 +108,10 @@ def invoke_agent_span(
for item in user_prompt:
if isinstance(item, str):
content.append({"text": item, "type": "text"})
elif ImageUrl and isinstance(item, ImageUrl):
content.append(_serialize_image_url_item(item))
elif BinaryContent and isinstance(item, BinaryContent):
content.append(
{
"type": "blob",
"modality": get_modality_from_mime_type(
item.media_type
),
"mime_type": item.media_type,
"content": BLOB_DATA_SUBSTITUTE,
}
)
content.append(_serialize_binary_content_item(item))
if content:
messages.append(
{
Expand Down
44 changes: 43 additions & 1 deletion sentry_sdk/integrations/pydantic_ai/spans/utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,56 @@
"""Utility functions for PydanticAI span instrumentation."""

import sentry_sdk
from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
from sentry_sdk.ai.utils import get_modality_from_mime_type
from sentry_sdk.consts import SPANDATA

from ..consts import DATA_URL_BASE64_REGEX

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Union, Dict, Any, List
from typing import Union, Dict, Any, List, Optional
from pydantic_ai.usage import RequestUsage, RunUsage # type: ignore

try:
from pydantic_ai.messages import BinaryContent, ImageUrl # type: ignore
except ImportError:
BinaryContent = None
ImageUrl = None
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated


def _serialize_image_url_item(item: "Any") -> "Dict[str, Any]":
    """Build the span-data entry for an ImageUrl content item.

    When ``item.url`` matches DATA_URL_BASE64_REGEX, the entry reports the
    captured MIME type and replaces the payload with BLOB_DATA_SUBSTITUTE.
    Any other URL is passed through unchanged as a string.
    """
    match = DATA_URL_BASE64_REGEX.match(item.url)

    if not match:
        # Not an inline base64 data URL: the URL itself is the content.
        return {"type": "image", "content": str(item.url)}

    return {
        "type": "image",
        "mime_type": match.group(1) or "image",
        "content": BLOB_DATA_SUBSTITUTE,
    }

Check warning on line 42 in sentry_sdk/integrations/pydantic_ai/spans/utils.py

View workflow job for this annotation

GitHub Actions / warden: find-bugs

[53N-D7M] Regex fails to match valid MIME types, leaking base64 data instead of redacting it (additional location)

The DATA_URL_BASE64_REGEX pattern `([a-zA-Z]+/[a-zA-Z]+)` only matches MIME types made up entirely of letters, but the media-type grammar (RFC 2045 §5.1; RFC 6838 §4.2) also allows digits, hyphens, periods, and plus signs. Valid data URLs such as `data:image/svg+xml;base64,...` or `data:video/3gpp;base64,...` therefore do not match, so `_serialize_image_url_item` falls through to its plain-URL branch and returns the full data URL, including the base64-encoded content that should have been redacted.


def _serialize_binary_content_item(item: "Any") -> "Dict[str, Any]":
    """Build the span-data entry for a BinaryContent item.

    The entry carries the item's media type and the modality that
    get_modality_from_mime_type returns for it; the ``content`` field is
    always the BLOB_DATA_SUBSTITUTE placeholder rather than the item's data.
    """
    media_type = item.media_type
    modality = get_modality_from_mime_type(media_type)
    return {
        "type": "blob",
        "modality": modality,
        "mime_type": media_type,
        "content": BLOB_DATA_SUBSTITUTE,
    }


def _set_usage_data(
span: "sentry_sdk.tracing.Span", usage: "Union[RequestUsage, RunUsage]"
Expand Down
Loading
Loading