Commit 5c9cffb

feat(ai): add cache writes for gen_ai

committed · 1 parent 50c790c · commit 5c9cffb

File tree

11 files changed: +915 −24 lines

sentry_sdk/ai/__init__.py

Lines changed: 10 additions & 0 deletions

@@ -1,7 +1,17 @@
+from .monitoring import record_token_usage  # noqa: F401
 from .utils import (
     set_data_normalized,
     GEN_AI_MESSAGE_ROLE_MAPPING,
     GEN_AI_MESSAGE_ROLE_REVERSE_MAPPING,
     normalize_message_role,
     normalize_message_roles,
 )  # noqa: F401
+
+__all__ = [
+    "record_token_usage",
+    "set_data_normalized",
+    "GEN_AI_MESSAGE_ROLE_MAPPING",
+    "GEN_AI_MESSAGE_ROLE_REVERSE_MAPPING",
+    "normalize_message_role",
+    "normalize_message_roles",
+]

sentry_sdk/ai/monitoring.py

Lines changed: 7 additions & 0 deletions

@@ -95,6 +95,7 @@ def record_token_usage(
     span: "Span",
     input_tokens: "Optional[int]" = None,
     input_tokens_cached: "Optional[int]" = None,
+    input_tokens_cache_write: "Optional[int]" = None,
     output_tokens: "Optional[int]" = None,
     output_tokens_reasoning: "Optional[int]" = None,
     total_tokens: "Optional[int]" = None,
@@ -113,6 +114,12 @@ def record_token_usage(
             input_tokens_cached,
         )
 
+    if input_tokens_cache_write is not None:
+        span.set_data(
+            SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE,
+            input_tokens_cache_write,
+        )
+
     if output_tokens is not None:
         span.set_data(SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens)
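Note: the new keyword is optional, so existing callers are unaffected. A minimal sketch of how a custom instrumentation might report the new counter (the span options and all token counts below are illustrative, not part of this commit):

import sentry_sdk
from sentry_sdk.ai import record_token_usage

with sentry_sdk.start_span(op="gen_ai.chat", name="chat my-model") as span:
    # ... call the model provider, read its usage block, then record it.
    record_token_usage(
        span,
        input_tokens=1200,
        input_tokens_cached=800,       # tokens served from the prompt cache
        input_tokens_cache_write=150,  # tokens newly written to the cache
        output_tokens=300,
        total_tokens=1500,
    )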

sentry_sdk/consts.py

Lines changed: 6 additions & 0 deletions

@@ -632,6 +632,12 @@ class SPANDATA:
     Example: 50
     """
 
+    GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE = "gen_ai.usage.input_tokens.cache_write"
+    """
+    The number of tokens written to the cache when processing the AI input (prompt).
+    Example: 100
+    """
+
     GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
     """
     The number of tokens in the output.

sentry_sdk/integrations/anthropic.py

Lines changed: 88 additions & 14 deletions

@@ -72,29 +72,47 @@ def _capture_exception(exc: "Any") -> None:
     sentry_sdk.capture_event(event, hint=hint)
 
 
-def _get_token_usage(result: "Messages") -> "tuple[int, int]":
+def _get_token_usage(result: "Messages") -> "tuple[int, int, int, int]":
     """
     Get token usage from the Anthropic response.
+    Returns: (input_tokens, output_tokens, cache_read_input_tokens, cache_write_input_tokens)
     """
     input_tokens = 0
     output_tokens = 0
+    cache_read_input_tokens = 0
+    cache_write_input_tokens = 0
     if hasattr(result, "usage"):
         usage = result.usage
         if hasattr(usage, "input_tokens") and isinstance(usage.input_tokens, int):
             input_tokens = usage.input_tokens
         if hasattr(usage, "output_tokens") and isinstance(usage.output_tokens, int):
             output_tokens = usage.output_tokens
-
-    return input_tokens, output_tokens
+        if hasattr(usage, "cache_read_input_tokens") and isinstance(
+            usage.cache_read_input_tokens, int
+        ):
+            cache_read_input_tokens = usage.cache_read_input_tokens
+        if hasattr(usage, "cache_creation_input_tokens") and isinstance(
+            usage.cache_creation_input_tokens, int
+        ):
+            cache_write_input_tokens = usage.cache_creation_input_tokens
+
+    return (
+        input_tokens,
+        output_tokens,
+        cache_read_input_tokens,
+        cache_write_input_tokens,
+    )
 
 
 def _collect_ai_data(
     event: "MessageStreamEvent",
     model: "str | None",
     input_tokens: int,
     output_tokens: int,
+    cache_read_input_tokens: int,
+    cache_write_input_tokens: int,
     content_blocks: "list[str]",
-) -> "tuple[str | None, int, int, list[str]]":
+) -> "tuple[str | None, int, int, int, int, list[str]]":
     """
     Collect model information, token usage, and collect content blocks from the AI streaming response.
     """
@@ -104,6 +122,14 @@ def _collect_ai_data(
         usage = event.message.usage
         input_tokens += usage.input_tokens
         output_tokens += usage.output_tokens
+        if hasattr(usage, "cache_read_input_tokens") and isinstance(
+            usage.cache_read_input_tokens, int
+        ):
+            cache_read_input_tokens += usage.cache_read_input_tokens
+        if hasattr(usage, "cache_creation_input_tokens") and isinstance(
+            usage.cache_creation_input_tokens, int
+        ):
+            cache_write_input_tokens += usage.cache_creation_input_tokens
         model = event.message.model or model
     elif event.type == "content_block_start":
         pass
@@ -117,7 +143,14 @@ def _collect_ai_data(
     elif event.type == "message_delta":
         output_tokens += event.usage.output_tokens
 
-    return model, input_tokens, output_tokens, content_blocks
+    return (
+        model,
+        input_tokens,
+        output_tokens,
+        cache_read_input_tokens,
+        cache_write_input_tokens,
+        content_blocks,
+    )
 
 
 def _set_input_data(
@@ -219,6 +252,8 @@ def _set_output_data(
     model: "str | None",
     input_tokens: "int | None",
     output_tokens: "int | None",
+    cache_read_input_tokens: "int | None",
+    cache_write_input_tokens: "int | None",
     content_blocks: "list[Any]",
     finish_span: bool = False,
 ) -> None:
@@ -254,6 +289,8 @@ def _set_output_data(
         span,
         input_tokens=input_tokens,
         output_tokens=output_tokens,
+        input_tokens_cached=cache_read_input_tokens,
+        input_tokens_cache_write=cache_write_input_tokens,
     )
 
     if finish_span:
@@ -288,7 +325,12 @@ def _sentry_patched_create_common(f: "Any", *args: "Any", **kwargs: "Any") -> "A
 
     with capture_internal_exceptions():
         if hasattr(result, "content"):
-            input_tokens, output_tokens = _get_token_usage(result)
+            (
+                input_tokens,
+                output_tokens,
+                cache_read_input_tokens,
+                cache_write_input_tokens,
+            ) = _get_token_usage(result)
 
             content_blocks = []
             for content_block in result.content:
@@ -305,6 +347,8 @@ def _sentry_patched_create_common(f: "Any", *args: "Any", **kwargs: "Any") -> "A
                 model=getattr(result, "model", None),
                 input_tokens=input_tokens,
                 output_tokens=output_tokens,
+                cache_read_input_tokens=cache_read_input_tokens,
+                cache_write_input_tokens=cache_write_input_tokens,
                 content_blocks=content_blocks,
                 finish_span=True,
             )
@@ -317,13 +361,26 @@ def new_iterator() -> "Iterator[MessageStreamEvent]":
         model = None
         input_tokens = 0
        output_tokens = 0
+        cache_read_input_tokens = 0
+        cache_write_input_tokens = 0
         content_blocks: "list[str]" = []
 
         for event in old_iterator:
-            model, input_tokens, output_tokens, content_blocks = (
-                _collect_ai_data(
-                    event, model, input_tokens, output_tokens, content_blocks
-                )
+            (
+                model,
+                input_tokens,
+                output_tokens,
+                cache_read_input_tokens,
+                cache_write_input_tokens,
+                content_blocks,
+            ) = _collect_ai_data(
+                event,
+                model,
+                input_tokens,
+                output_tokens,
+                cache_read_input_tokens,
+                cache_write_input_tokens,
+                content_blocks,
             )
             yield event
 
@@ -333,6 +390,8 @@ def new_iterator() -> "Iterator[MessageStreamEvent]":
             model=model,
             input_tokens=input_tokens,
             output_tokens=output_tokens,
+            cache_read_input_tokens=cache_read_input_tokens,
+            cache_write_input_tokens=cache_write_input_tokens,
             content_blocks=[{"text": "".join(content_blocks), "type": "text"}],
             finish_span=True,
         )
@@ -341,13 +400,26 @@ async def new_iterator_async() -> "AsyncIterator[MessageStreamEvent]":
         model = None
         input_tokens = 0
         output_tokens = 0
+        cache_read_input_tokens = 0
+        cache_write_input_tokens = 0
         content_blocks: "list[str]" = []
 
         async for event in old_iterator:
-            model, input_tokens, output_tokens, content_blocks = (
-                _collect_ai_data(
-                    event, model, input_tokens, output_tokens, content_blocks
-                )
+            (
+                model,
+                input_tokens,
+                output_tokens,
+                cache_read_input_tokens,
+                cache_write_input_tokens,
+                content_blocks,
+            ) = _collect_ai_data(
+                event,
+                model,
+                input_tokens,
+                output_tokens,
+                cache_read_input_tokens,
+                cache_write_input_tokens,
+                content_blocks,
            )
             yield event
 
@@ -357,6 +429,8 @@ async def new_iterator_async() -> "AsyncIterator[MessageStreamEvent]":
             model=model,
             input_tokens=input_tokens,
             output_tokens=output_tokens,
+            cache_read_input_tokens=cache_read_input_tokens,
+            cache_write_input_tokens=cache_write_input_tokens,
             content_blocks=[{"text": "".join(content_blocks), "type": "text"}],
             finish_span=True,
         )
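Note: Anthropic's Messages API reports prompt-cache activity on the response's usage block as cache_read_input_tokens (served from the cache) and cache_creation_input_tokens (newly written), which the helper maps onto the two Sentry attributes. A quick sketch against the helper using a stand-in response object; _get_token_usage is a private helper, the import assumes the anthropic package is installed, and all values are illustrative:

from types import SimpleNamespace

from sentry_sdk.integrations.anthropic import _get_token_usage

# Stand-in for an Anthropic Messages response; field names follow
# Anthropic's usage schema, values are illustrative.
result = SimpleNamespace(
    usage=SimpleNamespace(
        input_tokens=1200,
        output_tokens=300,
        cache_read_input_tokens=800,      # -> gen_ai.usage.input_tokens.cached
        cache_creation_input_tokens=150,  # -> gen_ai.usage.input_tokens.cache_write
    )
)

assert _get_token_usage(result) == (1200, 300, 800, 150)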

sentry_sdk/integrations/litellm.py

Lines changed: 4 additions & 0 deletions

@@ -192,6 +192,10 @@ def _success_callback(
             record_token_usage(
                 span,
                 input_tokens=getattr(usage, "prompt_tokens", None),
+                input_tokens_cached=getattr(usage, "cache_read_input_tokens", None),
+                input_tokens_cache_write=getattr(
+                    usage, "cache_write_input_tokens", None
+                ),
                 output_tokens=getattr(usage, "completion_tokens", None),
                 total_tokens=getattr(usage, "total_tokens", None),
             )

sentry_sdk/integrations/openai.py

Lines changed: 10 additions & 1 deletion

@@ -131,7 +131,12 @@ def _calculate_token_usage(
 
     if hasattr(response, "usage"):
         input_tokens = _get_usage(response.usage, ["input_tokens", "prompt_tokens"])
-        if hasattr(response.usage, "input_tokens_details"):
+        if hasattr(response.usage, "prompt_tokens_details"):
+            input_tokens_cached = _get_usage(
+                response.usage.prompt_tokens_details, ["cached_tokens"]
+            )
+        # OpenAI also supports input_tokens_details for compatibility
+        elif hasattr(response.usage, "input_tokens_details"):
             input_tokens_cached = _get_usage(
                 response.usage.input_tokens_details, ["cached_tokens"]
             )
@@ -143,6 +148,10 @@ def _calculate_token_usage(
             output_tokens_reasoning = _get_usage(
                 response.usage.output_tokens_details, ["reasoning_tokens"]
             )
+        elif hasattr(response.usage, "completion_tokens_details"):
+            output_tokens_reasoning = _get_usage(
+                response.usage.completion_tokens_details, ["reasoning_tokens"]
+            )
 
         total_tokens = _get_usage(response.usage, ["total_tokens"])
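Note: the fallback order matters because OpenAI's two APIs nest the same counters under different names: Chat Completions uses prompt_tokens_details / completion_tokens_details, while the Responses API uses input_tokens_details / output_tokens_details. A simplified, self-contained sketch of the lookup (not the SDK's actual helper; values illustrative):

from types import SimpleNamespace

def cached_input_tokens(usage):
    # Prefer Chat Completions' prompt_tokens_details, then fall back to
    # the Responses API's input_tokens_details, as the patch above does.
    for attr in ("prompt_tokens_details", "input_tokens_details"):
        details = getattr(usage, attr, None)
        if details is not None:
            return getattr(details, "cached_tokens", None)
    return None

# Chat Completions-shaped usage:
chat_usage = SimpleNamespace(prompt_tokens_details=SimpleNamespace(cached_tokens=750))
# Responses API-shaped usage:
responses_usage = SimpleNamespace(input_tokens_details=SimpleNamespace(cached_tokens=420))

assert cached_input_tokens(chat_usage) == 750
assert cached_input_tokens(responses_usage) == 420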

sentry_sdk/integrations/openai_agents/utils.py

Lines changed: 16 additions & 8 deletions

@@ -96,15 +96,23 @@ def _set_agent_data(span: "sentry_sdk.tracing.Span", agent: "agents.Agent") -> N
 
 def _set_usage_data(span: "sentry_sdk.tracing.Span", usage: "Usage") -> None:
     span.set_data(SPANDATA.GEN_AI_USAGE_INPUT_TOKENS, usage.input_tokens)
-    span.set_data(
-        SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED,
-        usage.input_tokens_details.cached_tokens,
-    )
+
+    if hasattr(usage, "input_tokens_details") and usage.input_tokens_details:
+        if hasattr(usage.input_tokens_details, "cached_tokens"):
+            span.set_data(
+                SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED,
+                usage.input_tokens_details.cached_tokens,
+            )
+
     span.set_data(SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS, usage.output_tokens)
-    span.set_data(
-        SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING,
-        usage.output_tokens_details.reasoning_tokens,
-    )
+
+    if hasattr(usage, "output_tokens_details") and usage.output_tokens_details:
+        if hasattr(usage.output_tokens_details, "reasoning_tokens"):
+            span.set_data(
+                SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING,
+                usage.output_tokens_details.reasoning_tokens,
+            )
+
     span.set_data(SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS, usage.total_tokens)
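Note: the unguarded attribute access raised AttributeError whenever a Usage object arrived without the optional details objects. A small, self-contained illustration of the failure mode the new hasattr guards avoid, using a hypothetical bare usage object (not the agents library's actual class):

from types import SimpleNamespace

# A usage object with no input_tokens_details / output_tokens_details.
usage = SimpleNamespace(input_tokens=10, output_tokens=5, total_tokens=15)

try:
    usage.input_tokens_details.cached_tokens  # what the old code effectively did
except AttributeError:
    print("old behavior: instrumentation crashed here")

# New behavior: the guards above simply skip the optional counters.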

sentry_sdk/integrations/pydantic_ai/spans/utils.py

Lines changed: 22 additions & 0 deletions

@@ -28,8 +28,30 @@ def _set_usage_data(
     if hasattr(usage, "input_tokens") and usage.input_tokens is not None:
         span.set_data(SPANDATA.GEN_AI_USAGE_INPUT_TOKENS, usage.input_tokens)
 
+    # Pydantic AI uses cache_read_tokens (not input_tokens_cached)
+    if hasattr(usage, "cache_read_tokens") and usage.cache_read_tokens is not None:
+        span.set_data(
+            SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED, usage.cache_read_tokens
+        )
+
+    # Pydantic AI uses cache_write_tokens (not input_tokens_cache_write)
+    if hasattr(usage, "cache_write_tokens") and usage.cache_write_tokens is not None:
+        span.set_data(
+            SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE,
+            usage.cache_write_tokens,
+        )
+
     if hasattr(usage, "output_tokens") and usage.output_tokens is not None:
         span.set_data(SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS, usage.output_tokens)
 
+    if (
+        hasattr(usage, "output_tokens_reasoning")
+        and usage.output_tokens_reasoning is not None
+    ):
+        span.set_data(
+            SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING,
+            usage.output_tokens_reasoning,
+        )
+
     if hasattr(usage, "total_tokens") and usage.total_tokens is not None:
         span.set_data(SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS, usage.total_tokens)
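Note: as the inline comments say, Pydantic AI reports cache counters under its own field names. A summary of the mapping this diff performs; the usage field names come from the diff itself, and the attribute strings are assumed to follow the SPANDATA naming pattern shown in consts.py above:

# Pydantic AI usage field        -> Sentry span attribute
# usage.input_tokens             -> gen_ai.usage.input_tokens
# usage.cache_read_tokens        -> gen_ai.usage.input_tokens.cached
# usage.cache_write_tokens       -> gen_ai.usage.input_tokens.cache_write
# usage.output_tokens            -> gen_ai.usage.output_tokens
# usage.output_tokens_reasoning  -> gen_ai.usage.output_tokens.reasoning
# usage.total_tokens             -> gen_ai.usage.total_tokens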
