getsentry · jgreer013 · Jun 18, 2026
@@ -31,16 +31,32 @@
     raise DidNotEnable("LiteLLM not installed")
 
 
-def _get_metadata_dict(kwargs: "Dict[str, Any]") -> "Dict[str, Any]":
-    """Get the metadata dictionary from the kwargs."""
-    litellm_params = kwargs.setdefault("litellm_params", {})
+# litellm threads the SAME ``kwargs`` dict (its per-request ``model_call_details``)
+# through the input, success, and failure callbacks, so the bookkeeping span is
+# stashed on it directly. This ties the span's lifetime to the request -- the
+# span is freed when litellm releases the request -- with no module-level tracking.
+#
+# The span must NOT go in ``kwargs["litellm_params"]["metadata"]``: litellm
+# forwards that caller ``metadata`` dict into the outbound request body for some
+# providers (e.g. Anthropic's /v1/messages passthrough), which would break
+# ``json.dumps(request_body)`` and leak the span (and its prompt data) to the
+# provider. The Anthropic request body is built only from the recognized request
+# params, not from ``model_call_details``, so a top-level key here is not
+# forwarded. The top level is also where litellm keeps its own per-request
+# internal state (e.g. ``agentic_loop_params``).
+_SPAN_KEY = "_sentry_span"
 
-    # we need this weird little dance, as metadata might be set but may be None initially
-    metadata = litellm_params.get("metadata")
-    if metadata is None:
-        metadata = {}
-        litellm_params["metadata"] = metadata
-    return metadata
+
+def _store_span(kwargs: "Dict[str, Any]", span: "Any") -> None:
+    """Stash ``span`` on ``kwargs`` under the top-level ``_SPAN_KEY`` entry."""
+    kwargs[_SPAN_KEY] = span
+
+
+def _peek_span(kwargs: "Dict[str, Any]") -> "Any":
+    """Return the span stored on ``kwargs`` without removing it.
+
+    Returns ``None`` when no span is stored under ``_SPAN_KEY``.
+    """
+    return kwargs.get(_SPAN_KEY)
+
+
+def _pop_span(kwargs: "Dict[str, Any]") -> "Any":
+    """Remove and return the span stored on ``kwargs``.
+
+    Returns ``None`` when no span is stored under ``_SPAN_KEY``, so a repeated
+    call does not raise.
+    """
+    return kwargs.pop(_SPAN_KEY, None)
 
 
 def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, Any]]":
@@ -117,8 +133,8 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
         )
         span.__enter__()
 
-    # Store span for later
-    _get_metadata_dict(kwargs)["_sentry_span"] = span
+    # Store span for later, outside the caller metadata litellm may forward.
+    _store_span(kwargs, span)
 
     # Set basic data
     set_data_normalized(span, SPANDATA.GEN_AI_SYSTEM, provider)
@@ -198,8 +214,7 @@ def _success_callback(
 ) -> None:
     """Handle successful completion."""
 
-    metadata = _get_metadata_dict(kwargs)
-    span = metadata.get("_sentry_span")
+    span = _peek_span(kwargs)
     if span is None:
         return
 
@@ -259,7 +274,7 @@ def _success_callback(
             or "complete_streaming_response" in kwargs
             or "async_complete_streaming_response" in kwargs
         ):
-            span = metadata.pop("_sentry_span", None)
+            span = _pop_span(kwargs)
             if span is not None:
                 span.__exit__(None, None, None)
 
@@ -285,7 +300,7 @@ def _failure_callback(
     end_time: "datetime",
 ) -> None:
     """Handle request failure."""
-    span = _get_metadata_dict(kwargs).get("_sentry_span")
+    span = _pop_span(kwargs)
     if span is None:
         return
 

@@ -2532,6 +2532,139 @@ def test_integration_setup(sentry_init):
     assert _failure_callback in (litellm.failure_callback or [])
 
 
+def test_caller_metadata_stays_json_serializable(
+    sentry_init,
+    capture_events,
+):
+    """Regression test for GH-6596.
+
+    litellm threads the caller's ``metadata`` dict into ``litellm_params`` and
+    some providers (e.g. Anthropic's ``/v1/messages`` passthrough) serialize it
+    into the outbound request body *before the response comes back*. The
+    integration must therefore never write its live ``Span`` into that dict, or
+    ``json.dumps(request_body)`` raises ``TypeError: Object of type Span is not
+    JSON serializable`` before the request is even sent.
+    """
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        disabled_integrations=[StdlibIntegration],
+        traces_sample_rate=1.0,
+        send_default_pii=True,
+        _experiments={"trace_lifecycle": "static"},
+    )
+    events = capture_events()
+
+    # Mirror the kwargs litellm hands to its callbacks: the caller's metadata
+    # lives under litellm_params and is the very dict forwarded onto the wire.
+    caller_metadata = {"user_id": "my-org"}
+    kwargs = {
+        "model": "gpt-3.5-turbo",
+        "messages": [{"role": "user", "content": "Hello!"}],
+        "litellm_call_id": "call-6596",
+        "litellm_params": {"metadata": caller_metadata},
+    }
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+
+        # litellm would serialize the request body here, while the span is live.
+        # The live span must not be in the forwarded metadata...
+        assert "_sentry_span" not in caller_metadata
+        # ...so the request body remains JSON-serializable (json.dumps raises
+        # TypeError otherwise, failing the test).
+        json.dumps(caller_metadata)
+
+        # The span is still recorded out-of-band, so monitoring keeps working.
+        _success_callback(
+            kwargs, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+
+    # Unpacking asserts that exactly one event was captured.
+    (event,) = events
+    # Keep only the chat spans emitted by the LiteLLM integration.
+    chat_spans = [
+        span
+        for span in event["spans"]
+        if span["op"] == OP.GEN_AI_CHAT and span["origin"] == "auto.ai.litellm"
+    ]
+    assert len(chat_spans) == 1
+
+
+def test_span_stashed_on_shared_kwargs_not_forwarded_metadata(sentry_init):
+    """The span is stashed on the shared kwargs dict (a top-level key litellm
+    does not forward), never in the caller's metadata, and each call keeps its
+    own span.
+    """
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        disabled_integrations=[StdlibIntegration],
+        traces_sample_rate=1.0,
+        _experiments={"trace_lifecycle": "static"},
+    )
+
+    with start_transaction(name="litellm test"):
+        caller_metadata = {"user_id": "my-org"}
+        # First call carries caller metadata under litellm_params.
+        kwargs_a = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "a"}],
+            "litellm_params": {"metadata": caller_metadata},
+        }
+        # Second call has no litellm_params entry at all.
+        kwargs_b = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "b"}],
+        }
+
+        _input_callback(kwargs_a)
+        _input_callback(kwargs_b)
+
+        # Stashed on the shared kwargs dict, off the forwarded metadata path...
+        assert kwargs_a["_sentry_span"] is not None
+        assert "_sentry_span" not in caller_metadata
+        json.dumps(caller_metadata)
+        # ...and each call keeps its own span (no cross-talk).
+        assert kwargs_a["_sentry_span"] is not kwargs_b["_sentry_span"]
+
+        # Run the success callback for both calls before the transaction ends.
+        _success_callback(
+            kwargs_a, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+        _success_callback(
+            kwargs_b, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+
+
+def test_span_cleaned_up_after_terminal_callbacks(sentry_init):
+    """Both terminal callbacks remove the span from the shared kwargs dict, so a
+    completed or failed call leaves nothing behind."""
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        disabled_integrations=[StdlibIntegration],
+        traces_sample_rate=1.0,
+        _experiments={"trace_lifecycle": "static"},
+    )
+
+    with start_transaction(name="litellm test"):
+        # Success path: the key is present after the input callback and gone
+        # once the success callback has run.
+        success_kwargs = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "hi"}],
+            "litellm_call_id": "success-call",
+        }
+        _input_callback(success_kwargs)
+        assert "_sentry_span" in success_kwargs
+        _success_callback(
+            success_kwargs, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+        assert "_sentry_span" not in success_kwargs
+
+        # Failure path: the same check, driven through the failure callback.
+        failure_kwargs = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "hi"}],
+            "litellm_call_id": "failure-call",
+        }
+        _input_callback(failure_kwargs)
+        assert "_sentry_span" in failure_kwargs
+        _failure_callback(
+            failure_kwargs, ValueError("boom"), datetime.now(), datetime.now()
+        )
+        assert "_sentry_span" not in failure_kwargs
+
+
 def test_litellm_message_truncation(sentry_init, capture_events):
     """Test that large messages are truncated properly in LiteLLM integration."""
     sentry_init(