Skip to content

Commit 9d89b90

Browse files
ref(openai): Only handle streamed results when applicable
1 parent 4f84f98 commit 9d89b90

File tree

2 files changed

+228
-21
lines changed

2 files changed

+228
-21
lines changed

sentry_sdk/integrations/openai.py

Lines changed: 199 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -746,14 +746,103 @@ def _set_streaming_completions_api_output_data(
746746
if messages is not None and isinstance(messages, str):
747747
messages = [messages]
748748

749-
_common_set_output_data(
750-
span,
751-
response,
752-
messages,
753-
integration,
754-
start_time,
755-
finish_span,
756-
)
749+
ttft: "Optional[float]" = None
750+
data_buf: "list[list[str]]" = [] # one for each choice
751+
752+
old_iterator = response._iterator
753+
754+
def new_iterator() -> "Iterator[ChatCompletionChunk]":
755+
nonlocal ttft
756+
count_tokens_manually = True
757+
for x in old_iterator:
758+
with capture_internal_exceptions():
759+
if hasattr(x, "choices"):
760+
choice_index = 0
761+
for choice in x.choices:
762+
if hasattr(choice, "delta") and hasattr(
763+
choice.delta, "content"
764+
):
765+
if start_time is not None and ttft is None:
766+
ttft = time.perf_counter() - start_time
767+
content = choice.delta.content
768+
if len(data_buf) <= choice_index:
769+
data_buf.append([])
770+
data_buf[choice_index].append(content or "")
771+
choice_index += 1
772+
773+
yield x
774+
775+
with capture_internal_exceptions():
776+
if ttft is not None:
777+
set_data_normalized(
778+
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
779+
)
780+
if len(data_buf) > 0:
781+
all_responses = ["".join(chunk) for chunk in data_buf]
782+
if should_send_default_pii() and integration.include_prompts:
783+
set_data_normalized(
784+
span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses
785+
)
786+
if count_tokens_manually:
787+
_calculate_token_usage(
788+
messages,
789+
response,
790+
span,
791+
all_responses,
792+
integration.count_tokens,
793+
)
794+
795+
if finish_span:
796+
span.__exit__(None, None, None)
797+
798+
async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
799+
nonlocal ttft
800+
count_tokens_manually = True
801+
async for x in old_iterator:
802+
with capture_internal_exceptions():
803+
# OpenAI chat completion API
804+
if hasattr(x, "choices"):
805+
choice_index = 0
806+
for choice in x.choices:
807+
if hasattr(choice, "delta") and hasattr(
808+
choice.delta, "content"
809+
):
810+
if start_time is not None and ttft is None:
811+
ttft = time.perf_counter() - start_time
812+
content = choice.delta.content
813+
if len(data_buf) <= choice_index:
814+
data_buf.append([])
815+
data_buf[choice_index].append(content or "")
816+
choice_index += 1
817+
818+
yield x
819+
820+
with capture_internal_exceptions():
821+
if ttft is not None:
822+
set_data_normalized(
823+
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
824+
)
825+
if len(data_buf) > 0:
826+
all_responses = ["".join(chunk) for chunk in data_buf]
827+
if should_send_default_pii() and integration.include_prompts:
828+
set_data_normalized(
829+
span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses
830+
)
831+
if count_tokens_manually:
832+
_calculate_token_usage(
833+
messages,
834+
response,
835+
span,
836+
all_responses,
837+
integration.count_tokens,
838+
)
839+
if finish_span:
840+
span.__exit__(None, None, None)
841+
842+
if str(type(response._iterator)) == "<class 'async_generator'>":
843+
response._iterator = new_iterator_async()
844+
else:
845+
response._iterator = new_iterator()
757846

758847

759848
def _set_responses_api_output_data(
@@ -792,14 +881,108 @@ def _set_streaming_responses_api_output_data(
792881
if input is not None and isinstance(input, str):
793882
input = [input]
794883

795-
_common_set_output_data(
796-
span,
797-
response,
798-
input,
799-
integration,
800-
start_time,
801-
finish_span,
802-
)
884+
ttft: "Optional[float]" = None
885+
data_buf: "list[list[str]]" = [] # one for each choice
886+
887+
old_iterator = response._iterator
888+
889+
def new_iterator() -> "Iterator[ChatCompletionChunk]":
890+
nonlocal ttft
891+
count_tokens_manually = True
892+
for x in old_iterator:
893+
with capture_internal_exceptions():
894+
if hasattr(x, "delta"):
895+
if start_time is not None and ttft is None:
896+
ttft = time.perf_counter() - start_time
897+
if len(data_buf) == 0:
898+
data_buf.append([])
899+
data_buf[0].append(x.delta or "")
900+
901+
if isinstance(x, ResponseCompletedEvent):
902+
_calculate_token_usage(
903+
input,
904+
x.response,
905+
span,
906+
None,
907+
integration.count_tokens,
908+
)
909+
count_tokens_manually = False
910+
911+
yield x
912+
913+
with capture_internal_exceptions():
914+
if ttft is not None:
915+
set_data_normalized(
916+
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
917+
)
918+
if len(data_buf) > 0:
919+
all_responses = ["".join(chunk) for chunk in data_buf]
920+
if should_send_default_pii() and integration.include_prompts:
921+
set_data_normalized(
922+
span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses
923+
)
924+
if count_tokens_manually:
925+
_calculate_token_usage(
926+
input,
927+
response,
928+
span,
929+
all_responses,
930+
integration.count_tokens,
931+
)
932+
933+
if finish_span:
934+
span.__exit__(None, None, None)
935+
936+
async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
937+
nonlocal ttft
938+
count_tokens_manually = True
939+
async for x in old_iterator:
940+
with capture_internal_exceptions():
941+
if hasattr(x, "delta"):
942+
if start_time is not None and ttft is None:
943+
ttft = time.perf_counter() - start_time
944+
if len(data_buf) == 0:
945+
data_buf.append([])
946+
data_buf[0].append(x.delta or "")
947+
948+
if isinstance(x, ResponseCompletedEvent):
949+
_calculate_token_usage(
950+
input,
951+
x.response,
952+
span,
953+
None,
954+
integration.count_tokens,
955+
)
956+
count_tokens_manually = False
957+
958+
yield x
959+
960+
with capture_internal_exceptions():
961+
if ttft is not None:
962+
set_data_normalized(
963+
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
964+
)
965+
if len(data_buf) > 0:
966+
all_responses = ["".join(chunk) for chunk in data_buf]
967+
if should_send_default_pii() and integration.include_prompts:
968+
set_data_normalized(
969+
span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses
970+
)
971+
if count_tokens_manually:
972+
_calculate_token_usage(
973+
input,
974+
response,
975+
span,
976+
all_responses,
977+
integration.count_tokens,
978+
)
979+
if finish_span:
980+
span.__exit__(None, None, None)
981+
982+
if str(type(response._iterator)) == "<class 'async_generator'>":
983+
response._iterator = new_iterator_async()
984+
else:
985+
response._iterator = new_iterator()
803986

804987

805988
def _set_embeddings_output_data(

tests/integrations/openai/test_openai.py

Lines changed: 29 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
import json
22
import pytest
33

4+
from typing import Union
5+
46
from sentry_sdk.utils import package_version
57

68
try:
@@ -21,6 +23,8 @@
2123
from openai.types.chat.chat_completion_chunk import ChoiceDelta, Choice as DeltaChoice
2224
from openai.types.create_embedding_response import Usage as EmbeddingTokenUsage
2325

26+
from openai.types.responses.response_text_delta_event import ResponseTextDeltaEvent
27+
2428
SKIP_RESPONSES_TESTS = False
2529

2630
try:
@@ -500,6 +504,7 @@ def test_streaming_chat_completion_no_prompts(
500504
{"role": "system", "content": "You are a helpful assistant."},
501505
{"role": "user", "content": "hello"},
502506
],
507+
stream=True,
503508
)
504509
response_string = "".join(
505510
map(lambda x: x.choices[0].delta.content, response_stream)
@@ -624,6 +629,7 @@ def test_streaming_chat_completion(sentry_init, capture_events, messages, reques
624629
response_stream = client.chat.completions.create(
625630
model="some-model",
626631
messages=messages,
632+
stream=True,
627633
)
628634
response_string = "".join(
629635
map(lambda x: x.choices[0].delta.content, response_stream)
@@ -747,6 +753,7 @@ async def test_streaming_chat_completion_async_no_prompts(
747753
{"role": "system", "content": "You are a helpful assistant."},
748754
{"role": "user", "content": "hello"},
749755
],
756+
stream=True,
750757
)
751758

752759
response_string = ""
@@ -881,6 +888,7 @@ async def test_streaming_chat_completion_async(
881888
response_stream = await client.chat.completions.create(
882889
model="some-model",
883890
messages=messages,
891+
stream=True,
884892
)
885893

886894
response_string = ""
@@ -942,7 +950,9 @@ def test_bad_chat_completion(sentry_init, capture_events):
942950
)
943951
with pytest.raises(OpenAIError):
944952
client.chat.completions.create(
945-
model="some-model", messages=[{"role": "system", "content": "hello"}]
953+
model="some-model",
954+
messages=[{"role": "system", "content": "hello"}],
955+
stream=True,
946956
)
947957

948958
(event,) = events
@@ -2340,6 +2350,16 @@ async def test_ai_client_span_responses_async_api(
23402350
assert spans[0]["data"] == expected_data
23412351

23422352

2353+
async def example_response_stream():
2354+
yield EXAMPLE_RESPONSES_STREAM[0]
2355+
yield EXAMPLE_RESPONSES_STREAM[1]
2356+
yield EXAMPLE_RESPONSES_STREAM[2]
2357+
yield EXAMPLE_RESPONSES_STREAM[3]
2358+
yield EXAMPLE_RESPONSES_STREAM[4]
2359+
2360+
return
2361+
2362+
23432363
@pytest.mark.asyncio
23442364
@pytest.mark.parametrize(
23452365
"instructions",
@@ -2417,15 +2437,19 @@ async def test_ai_client_span_streaming_responses_async_api(
24172437
events = capture_events()
24182438

24192439
client = AsyncOpenAI(api_key="z")
2420-
client.responses._post = AsyncMock(return_value=EXAMPLE_RESPONSE)
2440+
returned_stream = AsyncStream(cast_to=None, response=None, client=client)
2441+
returned_stream._iterator = example_response_stream()
2442+
client.responses._post = mock.AsyncMock(return_value=returned_stream)
24212443

24222444
with start_transaction(name="openai tx"):
2423-
await client.responses.create(
2445+
result = await client.responses.create(
24242446
model="gpt-4o",
24252447
instructions=instructions,
24262448
input=input,
24272449
stream=True,
24282450
)
2451+
async for _ in result:
2452+
pass
24292453

24302454
(transaction,) = events
24312455
spans = transaction["spans"]
@@ -2438,14 +2462,14 @@ async def test_ai_client_span_streaming_responses_async_api(
24382462
"gen_ai.operation.name": "responses",
24392463
"gen_ai.response.streaming": True,
24402464
"gen_ai.system": "openai",
2441-
"gen_ai.response.model": "response-model-id",
2465+
"gen_ai.response.time_to_first_token": mock.ANY,
24422466
"gen_ai.usage.input_tokens": 20,
24432467
"gen_ai.usage.input_tokens.cached": 5,
24442468
"gen_ai.usage.output_tokens": 10,
24452469
"gen_ai.usage.output_tokens.reasoning": 8,
24462470
"gen_ai.usage.total_tokens": 30,
24472471
"gen_ai.request.model": "gpt-4o",
2448-
"gen_ai.response.text": "the model response",
2472+
"gen_ai.response.text": "hello world",
24492473
"thread.id": mock.ANY,
24502474
"thread.name": mock.ANY,
24512475
}

0 commit comments

Comments (0)