diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py
index 3cff0fbc23..5fee701ed9 100644
--- a/sentry_sdk/integrations/litellm.py
+++ b/sentry_sdk/integrations/litellm.py
@@ -6,6 +6,7 @@ from sentry_sdk.ai.monitoring import record_token_usage
 from sentry_sdk.ai.utils import (
     get_start_span_function,
+    normalize_message_roles,
     set_data_normalized,
     truncate_and_annotate_messages,
     transform_openai_content_part,
@@ -17,7 +18,7 @@ from sentry_sdk.utils import event_from_exception
 
 if TYPE_CHECKING:
-    from typing import Any, Dict, List
+    from typing import Any, Dict, List, Optional
     from datetime import datetime
 
 
 try:
@@ -39,6 +40,23 @@ def _get_metadata_dict(kwargs: "Dict[str, Any]") -> "Dict[str, Any]":
     return metadata
 
 
+def _read_usage_field(usage: "Any", *names: str) -> "Optional[int]":
+    """Read the first non-None field from a usage container.
+
+    The usage object can be either a typed Pydantic model (attribute access) or
+    a plain dict (litellm hands us a dict for the assembled async-streaming
+    response), so we try both shapes.
+    """
+    for name in names:
+        if isinstance(usage, dict):
+            value = usage.get(name)
+        else:
+            value = getattr(usage, name, None)
+        if value is not None:
+            return value
+    return None
+
+
 def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, Any]]":
     """
     Convert the message parts from OpenAI format to the `gen_ai.request.messages` format
@@ -84,16 +102,17 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
     call_type = kwargs.get("call_type", None)
     if call_type == "embedding" or call_type == "aembedding":
         operation = "embeddings"
+        op = consts.OP.GEN_AI_EMBEDDINGS
+    elif call_type == "responses" or call_type == "aresponses":
+        operation = "responses"
+        op = consts.OP.GEN_AI_RESPONSES
     else:
         operation = "chat"
+        op = consts.OP.GEN_AI_CHAT
 
     # Start a new span/transaction
     span = get_start_span_function()(
-        op=(
-            consts.OP.GEN_AI_CHAT
-            if operation == "chat"
-            else consts.OP.GEN_AI_EMBEDDINGS
-        ),
+        op=op,
         name=f"{operation} {model}",
         origin=LiteLLMIntegration.origin,
     )
@@ -106,14 +125,15 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
     set_data_normalized(span, SPANDATA.GEN_AI_SYSTEM, provider)
     set_data_normalized(span, SPANDATA.GEN_AI_OPERATION_NAME, operation)
 
-    # Record input/messages if allowed
-    if should_send_default_pii() and integration.include_prompts:
-        if operation == "embeddings":
-            # For embeddings, look for the 'input' parameter
+    # Per-operation request data. Conversation id (responses) is set
+    # unconditionally; user-content fields are gated on PII / include_prompts.
+    record_prompts = should_send_default_pii() and integration.include_prompts
+    scope = sentry_sdk.get_current_scope()
+
+    if operation == "embeddings":
+        if record_prompts:
             embedding_input = kwargs.get("input")
             if embedding_input:
-                scope = sentry_sdk.get_current_scope()
-                # Normalize to list format
                 input_list = (
                     embedding_input
                     if isinstance(embedding_input, list)
@@ -129,11 +149,50 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
                     messages_data,
                     unpack=False,
                 )
-        else:
-            # For chat, look for the 'messages' parameter
+
+    elif operation == "responses":
+        # litellm unpacks `extra_body` into the request body, so the
+        # `conversation` field shows up in additional_args.complete_input_dict
+        # rather than as a top-level kwarg.
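+        # Illustrative shape only (assumed, not taken from litellm docs):
+        #   kwargs["additional_args"]["complete_input_dict"] == {
+        #       "model": "gpt-4.1-nano",
+        #       "input": "...",
+        #       "conversation": {"id": "conv_123"},
+        #   }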
+        complete_input = (kwargs.get("additional_args") or {}).get(
+            "complete_input_dict"
+        ) or {}
+        conversation = complete_input.get("conversation")
+        if conversation is not None:
+            conversation_id: "Optional[str]" = None
+            if isinstance(conversation, str):
+                conversation_id = conversation
+            elif isinstance(conversation, dict):
+                conversation_id = conversation.get("id")
+            if conversation_id is not None:
+                set_data_normalized(
+                    span, SPANDATA.GEN_AI_CONVERSATION_ID, conversation_id
+                )
+
+        if record_prompts:
+            # `input` is either a string or a list of message dicts (same
+            # shape as the OpenAI Responses API).
+            responses_input = kwargs.get("input")
+            if responses_input:
+                if isinstance(responses_input, str):
+                    input_messages = [responses_input]
+                else:
+                    input_messages = list(responses_input)
+                normalized = normalize_message_roles(input_messages)  # type: ignore[arg-type]
+                messages_data = truncate_and_annotate_messages(normalized, span, scope)
+                if messages_data is not None:
+                    set_data_normalized(
+                        span,
+                        SPANDATA.GEN_AI_REQUEST_MESSAGES,
+                        messages_data,
+                        unpack=False,
+                    )
+
+    else:
+        # Chat completions.
+        if record_prompts:
             messages = kwargs.get("messages", [])
             if messages:
-                scope = sentry_sdk.get_current_scope()
                 messages = _convert_message_parts(messages)
                 messages_data = truncate_and_annotate_messages(messages, span, scope)
                 if messages_data is not None:
@@ -166,11 +225,24 @@ async def _async_input_callback(kwargs: "Dict[str, Any]") -> None:
 
 
 def _success_callback(
     kwargs: "Dict[str, Any]",
-    completion_response: "Any",
+    response: "Any",
     start_time: "datetime",
     end_time: "datetime",
 ) -> None:
-    """Handle successful completion."""
+    """Handle a successful chat completion, embeddings, or Responses API call.
+
+    The shape of `response` differs between API paths:
+    - Chat Completions: ModelResponse with ``.choices[].message`` and
+      ``.usage`` carrying ``prompt_tokens`` / ``completion_tokens``.
+    - Responses API (non-streaming): ResponsesAPIResponse with ``.output[]``
+      items (``message`` / ``function_call``) and ``.usage`` carrying
+      ``input_tokens`` / ``output_tokens``.
+    - Responses API (streaming): a ResponseCompletedEvent wrapper
+      ``{type: "response.completed", response: ResponsesAPIResponse}``,
+      which we unwrap below.
+    - Embeddings: CreateEmbeddingResponse with ``.usage`` only (no choices
+      or output).
+    """
     metadata = _get_metadata_dict(kwargs)
     span = metadata.get("_sentry_span")
@@ -181,18 +253,25 @@ def _success_callback(
     if integration is None:
         return
 
+    # Streaming Responses API: unwrap the ResponseCompletedEvent so the rest of
+    # the function sees the assembled ResponsesAPIResponse directly.
+    if getattr(response, "type", None) == "response.completed" and hasattr(
+        response, "response"
+    ):
+        response = response.response
+
     try:
-        # Record model information
-        if hasattr(completion_response, "model"):
-            set_data_normalized(
-                span, SPANDATA.GEN_AI_RESPONSE_MODEL, completion_response.model
-            )
+        # `model` is set by all API shapes (chat / responses / embeddings).
+        if hasattr(response, "model"):
+            set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, response.model)
 
-        # Record response content if allowed
+        # Response content: structure depends on the API shape. Embeddings have
+        # neither ``choices`` nor ``output``, so we just skip this block.
         if should_send_default_pii() and integration.include_prompts:
-            if hasattr(completion_response, "choices"):
+            if hasattr(response, "choices"):
+                # Chat Completions API.
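+                # A dumped choice.message is expected to look roughly like
+                # {"role": "assistant", "content": "...", "tool_calls": [...]};
+                # the exact keys depend on the litellm/OpenAI version in use.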
                 response_messages = []
-                for choice in completion_response.choices:
+                for choice in response.choices:
                     if hasattr(choice, "message"):
                         if hasattr(choice.message, "model_dump"):
                             response_messages.append(choice.message.model_dump())
@@ -213,15 +292,56 @@ def _success_callback(
                     set_data_normalized(
                         span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_messages
                     )
+            elif hasattr(response, "output"):
+                # Responses API: split message text from function-call items.
+                output_text: "List[Any]" = []
+                tool_calls: "List[Any]" = []
+                for output in response.output:
+                    output_type = getattr(output, "type", None)
+                    if output_type == "function_call":
+                        if hasattr(output, "model_dump"):
+                            tool_calls.append(output.model_dump())
+                        elif hasattr(output, "dict"):
+                            tool_calls.append(output.dict())
+                    elif output_type == "message":
+                        for content_item in getattr(output, "content", []) or []:
+                            text = getattr(content_item, "text", None)
+                            if text is not None:
+                                output_text.append(text)
+                            elif hasattr(content_item, "model_dump"):
+                                output_text.append(content_item.model_dump())
+                            elif hasattr(content_item, "dict"):
+                                output_text.append(content_item.dict())
+
+                if tool_calls:
+                    set_data_normalized(
+                        span,
+                        SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS,
+                        tool_calls,
+                        unpack=False,
+                    )
+                if output_text:
+                    set_data_normalized(
+                        span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_text
+                    )
 
-        # Record token usage
-        if hasattr(completion_response, "usage"):
-            usage = completion_response.usage
+        # Token usage field names differ across APIs:
+        #   Chat Completions / Embeddings: prompt_tokens / completion_tokens
+        #   Responses API (non-streaming): input_tokens / output_tokens
+        #   Responses API (streaming): prompt_tokens / completion_tokens
+        #   (litellm normalizes to chat-completion names when assembling the
+        #   streaming response). For the async-streaming variant, the
+        #   assembled `usage` is a plain dict, not a Pydantic model — hence
+        #   `_read_usage_field` supports both shapes.
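+        # For example (values illustrative): a typed usage object exposes
+        # usage.input_tokens == 12, while the assembled streaming dict looks
+        # like {"prompt_tokens": 7, "completion_tokens": 2, "total_tokens": 9}.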
+        if hasattr(response, "usage"):
+            usage = response.usage
             record_token_usage(
                 span,
-                input_tokens=getattr(usage, "prompt_tokens", None),
-                output_tokens=getattr(usage, "completion_tokens", None),
-                total_tokens=getattr(usage, "total_tokens", None),
+                input_tokens=_read_usage_field(usage, "prompt_tokens", "input_tokens"),
+                output_tokens=_read_usage_field(
+                    usage, "completion_tokens", "output_tokens"
+                ),
+                total_tokens=_read_usage_field(usage, "total_tokens"),
             )
 
     finally:
diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py
index 18f8cfaf6e..5dcf16003f 100644
--- a/tests/integrations/litellm/test_litellm.py
+++ b/tests/integrations/litellm/test_litellm.py
@@ -2135,3 +2135,261 @@ def test_convert_message_parts_image_url_missing_url():
     converted = _convert_message_parts(messages)
     # Should return item unchanged
     assert converted[0]["content"][0]["type"] == "image_url"
+
+
+class MockResponsesUsage:
+    def __init__(self, input_tokens=12, output_tokens=24, total_tokens=36):
+        self.input_tokens = input_tokens
+        self.output_tokens = output_tokens
+        self.total_tokens = total_tokens
+
+
+class MockResponsesContentItem:
+    def __init__(self, text):
+        self.type = "output_text"
+        self.text = text
+
+
+class MockResponsesOutputMessage:
+    def __init__(self, text):
+        self.type = "message"
+        self.role = "assistant"
+        self.content = [MockResponsesContentItem(text)]
+
+
+class MockResponsesResponse:
+    def __init__(
+        self,
+        model="gpt-4.1-nano",
+        output=None,
+        usage=None,
+    ):
+        self.id = "resp-test"
+        self.model = model
+        self.output = output or [MockResponsesOutputMessage("the model response")]
+        self.usage = usage or MockResponsesUsage()
+
+
+def _build_responses_kwargs(
+    *,
+    input_value="What is the capital of France?",
+    conversation=None,
+    model="openai/gpt-4.1-nano",
+    extra_body_conversation=None,
+):
+    """
+    Build the kwargs shape that litellm passes to input_callback for a
+    responses() call. `extra_body` is unpacked into
+    additional_args.complete_input_dict by litellm before callbacks fire.
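+
+    With the defaults, the returned kwargs look roughly like:
+
+        {
+            "model": "openai/gpt-4.1-nano",
+            "input": "What is the capital of France?",
+            "call_type": "responses",
+            "additional_args": {"complete_input_dict": {...}},
+        }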
+    """
+    complete_input_dict = {"model": model.split("/")[-1], "input": input_value}
+    if extra_body_conversation is not None:
+        complete_input_dict["conversation"] = extra_body_conversation
+    elif conversation is not None:
+        complete_input_dict["conversation"] = conversation
+
+    return {
+        "model": model,
+        "input": input_value,
+        "call_type": "responses",
+        "additional_args": {"complete_input_dict": complete_input_dict},
+    }
+
+
+@pytest.mark.parametrize(
+    "conversation, expected_id",
+    [
+        pytest.param(None, None, id="no_conversation"),
+        pytest.param({"id": "conv_abc123"}, "conv_abc123", id="dict"),
+        pytest.param("conv_str_id", "conv_str_id", id="string"),
+    ],
+)
+def test_responses_conversation_id(
+    sentry_init, capture_events, conversation, expected_id
+):
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs(extra_body_conversation=conversation)
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["description"] == "responses gpt-4.1-nano"
+    assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses"
+
+    if expected_id is None:
+        assert SPANDATA.GEN_AI_CONVERSATION_ID not in span["data"]
+    else:
+        assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == expected_id
+
+
+def test_responses_records_input_output_and_usage(sentry_init, capture_events):
+    sentry_init(
+        integrations=[LiteLLMIntegration(include_prompts=True)],
+        traces_sample_rate=1.0,
+        send_default_pii=True,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs(
+        extra_body_conversation={"id": "conv_xyz"},
+    )
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz"
+    assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"]
+    assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 12
+    assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 24
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36
+
+
+def test_responses_no_pii_omits_messages(sentry_init, capture_events):
+    sentry_init(
+        integrations=[LiteLLMIntegration(include_prompts=True)],
+        traces_sample_rate=1.0,
+        # send_default_pii not set -> defaults to False
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs(extra_body_conversation={"id": "conv_xyz"})
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    # Conversation id is not PII, but request/response content is
+    assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz"
+    assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"]
+    assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"]
+
+
+class MockResponseCompletedEvent:
+    """Mimics litellm/openai's response.completed streaming wrapper."""
+
+    def __init__(self, response):
+        self.type = "response.completed"
+        self.response = response
+
+
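+# Roughly the final event litellm's streaming path is assumed to emit:
+#   ResponseCompletedEvent(type="response.completed",
+#                          response=<assembled ResponsesAPIResponse>)
+# `_success_callback` unwraps this before reading usage/output.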
+def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events):
+    """For streaming responses, success_handler receives a ResponseCompletedEvent
+    wrapper. We must unwrap it to read usage/output from the inner response."""
+    sentry_init(
+        integrations=[LiteLLMIntegration(include_prompts=True)],
+        traces_sample_rate=1.0,
+        send_default_pii=True,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs()
+    kwargs["stream"] = True
+    kwargs["complete_streaming_response"] = MockResponsesResponse()
+
+    wrapper = MockResponseCompletedEvent(MockResponsesResponse())
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(kwargs, wrapper, datetime.now(), datetime.now())
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 12
+    assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 24
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36
+    assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
+
+
+class MockResponseWithDictUsage:
+    """Mimics the assembled async-streaming responses object: usage is a dict
+    (not a Pydantic model), as litellm hands us for that path."""
+
+    def __init__(self):
+        self.id = "resp-test"
+        self.model = "gpt-4.1-nano"
+        self.output = [MockResponsesOutputMessage("hi")]
+        self.usage = {
+            "prompt_tokens": 7,
+            "completion_tokens": 2,
+            "total_tokens": 9,
+        }
+
+
+def test_responses_async_streaming_dict_usage(sentry_init, capture_events):
+    """For async streaming responses, litellm assembles `usage` as a plain dict.
+    `getattr(dict, ...)` would silently miss it; we need to support both shapes."""
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs()
+    kwargs["call_type"] = "aresponses"
+    kwargs["stream"] = True
+    kwargs["async_complete_streaming_response"] = MockResponseWithDictUsage()
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponseWithDictUsage(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 7
+    assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 2
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 9
+
+
+def test_aresponses_call_type_treated_as_responses(sentry_init, capture_events):
+    """aresponses (async) call_type should produce a responses span."""
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    kwargs = _build_responses_kwargs()
+    kwargs["call_type"] = "aresponses"
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+        _success_callback(
+            kwargs, MockResponsesResponse(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    (span,) = event["spans"]
+
+    assert span["op"] == OP.GEN_AI_RESPONSES
+    assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses"