From 6b32d44260fb4d171d034bbee26922f318951729 Mon Sep 17 00:00:00 2001
From: Simon Hellmayr
Date: Thu, 29 Jan 2026 09:08:32 +0100
Subject: [PATCH 1/2] fix(google-genai): fix wrong token accounting in streaming

---
 .../integrations/google_genai/streaming.py | 42 ++++++++++++-------
 .../google_genai/test_google_genai.py      | 21 +++++-----
 2 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/sentry_sdk/integrations/google_genai/streaming.py b/sentry_sdk/integrations/google_genai/streaming.py
index 5bd8890d02..ced09db46a 100644
--- a/sentry_sdk/integrations/google_genai/streaming.py
+++ b/sentry_sdk/integrations/google_genai/streaming.py
@@ -37,20 +37,27 @@ class AccumulatedResponse(TypedDict):
 def accumulate_streaming_response(
     chunks: "List[GenerateContentResponse]",
 ) -> "AccumulatedResponse":
-    """Accumulate streaming chunks into a single response-like object."""
+    """Accumulate streaming chunks into a single response-like object.
+
+    Token usage handling:
+    - input_tokens: Use last non-zero value (prompt doesn't change during streaming)
+    - input_tokens_cached: Use last non-zero value
+    - output_tokens: Sum across chunks (incremental output)
+    - output_tokens_reasoning: Use last non-zero value
+    - total_tokens: Use last non-zero value (cumulative in final chunk)
+    """
     accumulated_text = []
     finish_reasons = []
     tool_calls = []
-    total_input_tokens = 0
     total_output_tokens = 0
-    total_tokens = 0
-    total_cached_tokens = 0
-    total_reasoning_tokens = 0
+    last_input_tokens = 0
+    last_cached_tokens = 0
+    last_reasoning_tokens = 0
+    last_total_tokens = 0
     response_id = None
     model = None
 
     for chunk in chunks:
-        # Extract text and tool calls
         if getattr(chunk, "candidates", None):
             for candidate in getattr(chunk, "candidates", []):
                 if hasattr(candidate, "content") and getattr(
@@ -68,24 +75,29 @@ def accumulate_streaming_response(
                     if extracted_tool_calls:
                         tool_calls.extend(extracted_tool_calls)
 
-        # Accumulate token usage
         extracted_usage_data = extract_usage_data(chunk)
-        total_input_tokens += extracted_usage_data["input_tokens"]
+
+        if extracted_usage_data["input_tokens"]:
+            last_input_tokens = extracted_usage_data["input_tokens"]
+        if extracted_usage_data["input_tokens_cached"]:
+            last_cached_tokens = extracted_usage_data["input_tokens_cached"]
+        if extracted_usage_data["output_tokens_reasoning"]:
+            last_reasoning_tokens = extracted_usage_data["output_tokens_reasoning"]
+        if extracted_usage_data["total_tokens"]:
+            last_total_tokens = extracted_usage_data["total_tokens"]
+
         total_output_tokens += extracted_usage_data["output_tokens"]
-        total_cached_tokens += extracted_usage_data["input_tokens_cached"]
-        total_reasoning_tokens += extracted_usage_data["output_tokens_reasoning"]
-        total_tokens += extracted_usage_data["total_tokens"]
 
     accumulated_response = AccumulatedResponse(
         text="".join(accumulated_text),
         finish_reasons=finish_reasons,
         tool_calls=tool_calls,
         usage_metadata=UsageData(
-            input_tokens=total_input_tokens,
+            input_tokens=last_input_tokens,
             output_tokens=total_output_tokens,
-            input_tokens_cached=total_cached_tokens,
-            output_tokens_reasoning=total_reasoning_tokens,
-            total_tokens=total_tokens,
+            input_tokens_cached=last_cached_tokens,
+            output_tokens_reasoning=last_reasoning_tokens,
+            total_tokens=last_total_tokens,
         ),
         id=response_id,
         model=model,
diff --git a/tests/integrations/google_genai/test_google_genai.py b/tests/integrations/google_genai/test_google_genai.py
index 37ba50420f..e167fd6744 100644
--- a/tests/integrations/google_genai/test_google_genai.py
+++ b/tests/integrations/google_genai/test_google_genai.py
@@ -545,25 +545,24 @@ def test_streaming_generate_content(sentry_init, capture_events, mock_genai_clie
     assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == "STOP"
     assert invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == "STOP"
 
-    # Verify token counts - should reflect accumulated values
-    # Input tokens: max of all chunks = 10
-    assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 30
-    assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 30
+    # Verify token counts
+    # Input tokens: last non-zero value (prompt doesn't change during streaming) = 10
+    assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10
+    assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10
 
-    # Output tokens: candidates (2 + 3 + 7 = 12) + reasoning (3) = 15
-    # Note: output_tokens includes both candidates and reasoning tokens
+    # Output tokens: sum of candidates (2 + 3 + 7 = 12) + last reasoning (3) = 15
     assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 15
     assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 15
 
-    # Total tokens: from the last chunk
-    assert chat_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 50
-    assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 50
+    # Total tokens: last non-zero value from final chunk = 25
+    assert chat_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 25
+    assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 25
 
-    # Cached tokens: max of all chunks = 5
+    # Cached tokens: last non-zero value = 5
     assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5
     assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5
 
-    # Reasoning tokens: sum of thoughts_token_count = 3
+    # Reasoning tokens: last non-zero value = 3
     assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3
     assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3
 

From 8715843ada3ec2a3205d16950903fd0fd41d11c4 Mon Sep 17 00:00:00 2001
From: Simon Hellmayr
Date: Thu, 29 Jan 2026 09:19:51 +0100
Subject: [PATCH 2/2] fix(google-genai): sum reasoning tokens across streaming chunks

---
 sentry_sdk/integrations/google_genai/streaming.py  |  9 ++++-----
 .../integrations/google_genai/test_google_genai.py | 13 +++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/sentry_sdk/integrations/google_genai/streaming.py b/sentry_sdk/integrations/google_genai/streaming.py
index ced09db46a..1d5264edfc 100644
--- a/sentry_sdk/integrations/google_genai/streaming.py
+++ b/sentry_sdk/integrations/google_genai/streaming.py
@@ -43,16 +43,16 @@ def accumulate_streaming_response(
     - input_tokens: Use last non-zero value (prompt doesn't change during streaming)
     - input_tokens_cached: Use last non-zero value
     - output_tokens: Sum across chunks (incremental output)
-    - output_tokens_reasoning: Use last non-zero value
+    - output_tokens_reasoning: Sum across chunks (incremental reasoning)
     - total_tokens: Use last non-zero value (cumulative in final chunk)
     """
     accumulated_text = []
     finish_reasons = []
     tool_calls = []
     total_output_tokens = 0
+    total_reasoning_tokens = 0
     last_input_tokens = 0
     last_cached_tokens = 0
-    last_reasoning_tokens = 0
     last_total_tokens = 0
     response_id = None
     model = None
@@ -81,12 +81,11 @@ def accumulate_streaming_response(
             last_input_tokens = extracted_usage_data["input_tokens"]
         if extracted_usage_data["input_tokens_cached"]:
             last_cached_tokens = extracted_usage_data["input_tokens_cached"]
-        if extracted_usage_data["output_tokens_reasoning"]:
-            last_reasoning_tokens = extracted_usage_data["output_tokens_reasoning"]
         if extracted_usage_data["total_tokens"]:
             last_total_tokens = extracted_usage_data["total_tokens"]
 
         total_output_tokens += extracted_usage_data["output_tokens"]
+        total_reasoning_tokens += extracted_usage_data["output_tokens_reasoning"]
 
     accumulated_response = AccumulatedResponse(
         text="".join(accumulated_text),
@@ -96,7 +95,7 @@ def accumulate_streaming_response(
             input_tokens=last_input_tokens,
             output_tokens=total_output_tokens,
             input_tokens_cached=last_cached_tokens,
-            output_tokens_reasoning=last_reasoning_tokens,
+            output_tokens_reasoning=total_reasoning_tokens,
             total_tokens=last_total_tokens,
         ),
         id=response_id,
diff --git a/tests/integrations/google_genai/test_google_genai.py b/tests/integrations/google_genai/test_google_genai.py
index e167fd6744..3e57178438 100644
--- a/tests/integrations/google_genai/test_google_genai.py
+++ b/tests/integrations/google_genai/test_google_genai.py
@@ -472,6 +472,7 @@ def test_streaming_generate_content(sentry_init, capture_events, mock_genai_clie
             "promptTokenCount": 10,
             "candidatesTokenCount": 3,
             "totalTokenCount": 13,
+            "thoughtsTokenCount": 1,
         },
     }
 
@@ -550,9 +551,9 @@ def test_streaming_generate_content(sentry_init, capture_events, mock_genai_clie
     assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10
     assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10
 
-    # Output tokens: sum of candidates (2 + 3 + 7 = 12) + last reasoning (3) = 15
-    assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 15
-    assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 15
+    # Output tokens: sum of candidates (2 + 3 + 7 = 12) + reasoning (0 + 1 + 3) = 16
+    assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 16
+    assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 16
 
     # Total tokens: last non-zero value from final chunk = 25
     assert chat_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 25
@@ -562,9 +563,9 @@ def test_streaming_generate_content(sentry_init, capture_events, mock_genai_clie
     assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5
     assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5
 
-    # Reasoning tokens: last non-zero value = 3
-    assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3
-    assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3
+    # Reasoning tokens: sum across chunks = 4
+    assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 4
+    assert invoke_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 4
 
     # Verify model name
     assert chat_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash"
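
Illustration of the accumulation semantics the series converges on (a minimal
sketch, not part of the patches): prompt, cached, and total counts take the
last non-zero value, since Gemini reports them cumulatively on each streaming
chunk, while output and reasoning counts are summed, since they arrive
incrementally. The sketch replays the test's three chunks through that policy.
The chunk dicts and the accumulate_usage helper are hypothetical stand-ins for
GenerateContentResponse.usage_metadata and the SDK's extract_usage_data; per
the test's comments, each chunk's output_tokens already includes its candidates
and thoughts tokens, and placing the cached count on the final chunk is an
assumption.

    from typing import Dict, List

    def accumulate_usage(chunks: List[Dict[str, int]]) -> Dict[str, int]:
        # Policy from the patched accumulate_streaming_response:
        # last non-zero for cumulative counters, sum for incremental ones.
        total_output = 0
        total_reasoning = 0
        last_input = 0
        last_cached = 0
        last_total = 0
        for usage in chunks:
            if usage.get("input_tokens"):
                last_input = usage["input_tokens"]
            if usage.get("input_tokens_cached"):
                last_cached = usage["input_tokens_cached"]
            if usage.get("total_tokens"):
                last_total = usage["total_tokens"]
            total_output += usage.get("output_tokens", 0)
            total_reasoning += usage.get("output_tokens_reasoning", 0)
        return {
            "input_tokens": last_input,
            "output_tokens": total_output,
            "output_tokens_reasoning": total_reasoning,
            "input_tokens_cached": last_cached,
            "total_tokens": last_total,
        }

    # The test's three chunks, as already-extracted usage values
    # (output_tokens = candidatesTokenCount + thoughtsTokenCount).
    chunks = [
        {"input_tokens": 10, "output_tokens": 2, "total_tokens": 12},
        {"input_tokens": 10, "output_tokens": 4,
         "output_tokens_reasoning": 1, "total_tokens": 13},
        # Cached count assumed to appear on the final chunk.
        {"input_tokens": 10, "output_tokens": 10,
         "output_tokens_reasoning": 3, "input_tokens_cached": 5,
         "total_tokens": 25},
    ]

    assert accumulate_usage(chunks) == {
        "input_tokens": 10,  # last non-zero, not the old summed 30
        "output_tokens": 16,  # 2 + 4 + 10
        "output_tokens_reasoning": 4,  # 0 + 1 + 3
        "input_tokens_cached": 5,
        "total_tokens": 25,  # final cumulative count, not the old summed 50
    }

The pre-series code summed every field, which double-counted the cumulative
prompt and total figures (yielding the 30 and 50 the old asserts checked); the
values above are exactly what the updated test expects.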