
Commit f1c6da2

fix: double counting anthropic langchain (#399)
1 parent 7ac63e1 commit f1c6da2

4 files changed: +111 −13 lines changed


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,7 @@
+# 7.4.3 - 2026-01-02
+
+Fixes cache creation cost for Langchain with Anthropic
+
 # 7.4.2 - 2025-12-22
 
 feat: add `in_app_modules` option to control code variables capturing
@@ -13,6 +17,7 @@ When using OpenAI stored prompts, the model is defined in the OpenAI dashboard r
 feat: Add automatic retries for feature flag requests
 
 Feature flag API requests now automatically retry on transient failures:
+
 - Network errors (connection refused, DNS failures, timeouts)
 - Server errors (500, 502, 503, 504)
 - Up to 2 retries with exponential backoff (0.5s, 1s delays)
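As a rough illustration of the retry behavior described in that changelog entry (not the SDK's actual implementation; the helper name fetch_flags_with_retries and the TRANSIENT_STATUS set are assumptions for this sketch):

import time

TRANSIENT_STATUS = {500, 502, 503, 504}  # server errors treated as retryable
MAX_RETRIES = 2
BACKOFF_DELAYS = [0.5, 1.0]  # exponential backoff between attempts

def fetch_flags_with_retries(fetch):
    """Call `fetch` (any callable returning a response object) with up to 2 retries."""
    last_response = None
    for attempt in range(MAX_RETRIES + 1):
        try:
            last_response = fetch()
            if last_response.status_code not in TRANSIENT_STATUS:
                return last_response  # success, or an error we should not retry
        except OSError:
            pass  # network error: connection refused, DNS failure, timeout
        if attempt < MAX_RETRIES:
            time.sleep(BACKOFF_DELAYS[attempt])
    return last_response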

posthog/ai/langchain/callbacks.py

Lines changed: 11 additions & 9 deletions
@@ -773,24 +773,26 @@ def _parse_usage_model(
             for mapped_key, dataclass_key in field_mapping.items()
         },
     )
-    # For Anthropic providers, LangChain reports input_tokens as the sum of input and cache read tokens.
+    # For Anthropic providers, LangChain reports input_tokens as the sum of all input tokens.
     # Our cost calculation expects them to be separate for Anthropic, so we subtract cache tokens.
-    # For other providers (OpenAI, etc.), input_tokens already includes cache tokens as expected.
+    # Both cache_read and cache_write tokens should be subtracted since Anthropic's raw API
+    # reports input_tokens as tokens NOT read from or used to create a cache.
+    # For other providers (OpenAI, etc.), input_tokens already excludes cache tokens as expected.
     # Match logic consistent with plugin-server: exact match on provider OR substring match on model
     is_anthropic = False
     if provider and provider.lower() == "anthropic":
         is_anthropic = True
     elif model and "anthropic" in model.lower():
         is_anthropic = True
 
-    if (
-        is_anthropic
-        and normalized_usage.input_tokens
-        and normalized_usage.cache_read_tokens
-    ):
-        normalized_usage.input_tokens = max(
-            normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0
+    if is_anthropic and normalized_usage.input_tokens:
+        cache_tokens = (normalized_usage.cache_read_tokens or 0) + (
+            normalized_usage.cache_write_tokens or 0
         )
+        if cache_tokens > 0:
+            normalized_usage.input_tokens = max(
+                normalized_usage.input_tokens - cache_tokens, 0
+            )
     return normalized_usage
 
 
posthog/test/ai/langchain/test_callbacks.py

Lines changed: 94 additions & 3 deletions
@@ -1638,6 +1638,95 @@ def test_anthropic_provider_subtracts_cache_tokens(mock_client):
     assert generation_args["properties"]["$ai_cache_read_input_tokens"] == 800
 
 
+def test_anthropic_provider_subtracts_cache_write_tokens(mock_client):
+    """Test that Anthropic provider correctly subtracts cache write tokens from input tokens."""
+    from langchain_core.outputs import LLMResult, ChatGeneration
+    from langchain_core.messages import AIMessage
+    from uuid import uuid4
+
+    cb = CallbackHandler(mock_client)
+    run_id = uuid4()
+
+    # Set up with Anthropic provider
+    cb._set_llm_metadata(
+        serialized={},
+        run_id=run_id,
+        messages=[{"role": "user", "content": "test"}],
+        metadata={"ls_provider": "anthropic", "ls_model_name": "claude-3-sonnet"},
+    )
+
+    # Response with cache creation: 1000 input (includes 800 being written to cache)
+    response = LLMResult(
+        generations=[
+            [
+                ChatGeneration(
+                    message=AIMessage(content="Response"),
+                    generation_info={
+                        "usage_metadata": {
+                            "input_tokens": 1000,
+                            "output_tokens": 50,
+                            "cache_creation_input_tokens": 800,
+                        }
+                    },
+                )
+            ]
+        ],
+        llm_output={},
+    )
+
+    cb._pop_run_and_capture_generation(run_id, None, response)
+
+    generation_args = mock_client.capture.call_args_list[0][1]
+    assert generation_args["properties"]["$ai_input_tokens"] == 200  # 1000 - 800
+    assert generation_args["properties"]["$ai_cache_creation_input_tokens"] == 800
+
+
+def test_anthropic_provider_subtracts_both_cache_read_and_write_tokens(mock_client):
+    """Test that Anthropic provider correctly subtracts both cache read and write tokens."""
+    from langchain_core.outputs import LLMResult, ChatGeneration
+    from langchain_core.messages import AIMessage
+    from uuid import uuid4
+
+    cb = CallbackHandler(mock_client)
+    run_id = uuid4()
+
+    # Set up with Anthropic provider
+    cb._set_llm_metadata(
+        serialized={},
+        run_id=run_id,
+        messages=[{"role": "user", "content": "test"}],
+        metadata={"ls_provider": "anthropic", "ls_model_name": "claude-3-sonnet"},
+    )
+
+    # Response with both cache read and creation
+    response = LLMResult(
+        generations=[
+            [
+                ChatGeneration(
+                    message=AIMessage(content="Response"),
+                    generation_info={
+                        "usage_metadata": {
+                            "input_tokens": 2000,
+                            "output_tokens": 50,
+                            "cache_read_input_tokens": 800,
+                            "cache_creation_input_tokens": 500,
+                        }
+                    },
+                )
+            ]
+        ],
+        llm_output={},
+    )
+
+    cb._pop_run_and_capture_generation(run_id, None, response)
+
+    generation_args = mock_client.capture.call_args_list[0][1]
+    # 2000 - 800 (read) - 500 (write) = 700
+    assert generation_args["properties"]["$ai_input_tokens"] == 700
+    assert generation_args["properties"]["$ai_cache_read_input_tokens"] == 800
+    assert generation_args["properties"]["$ai_cache_creation_input_tokens"] == 500
+
+
 def test_openai_cache_read_tokens(mock_client):
     """Test that OpenAI cache read tokens are captured correctly."""
     prompt = ChatPromptTemplate.from_messages(
@@ -2092,10 +2181,12 @@ def test_zero_input_tokens_with_cache_read(mock_client):
     assert generation_props["$ai_cache_read_input_tokens"] == 50
 
 
-def test_cache_write_tokens_not_subtracted_from_input(mock_client):
-    """Test that cache_creation_input_tokens (cache write) do NOT affect input_tokens.
+def test_non_anthropic_cache_write_tokens_not_subtracted_from_input(mock_client):
+    """Test that cache_creation_input_tokens do NOT affect input_tokens for non-Anthropic providers.
 
-    Only cache_read_tokens should be subtracted from input_tokens, not cache_write_tokens.
+    When no provider metadata is set (or for non-Anthropic providers), cache tokens should
+    NOT be subtracted from input_tokens. This is because different providers report tokens
+    differently - only Anthropic's LangChain integration requires subtraction.
     """
     prompt = ChatPromptTemplate.from_messages([("user", "Create cache")])
 
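For context, a rough end-to-end usage sketch of the path these tests exercise (assumes the langchain-anthropic package, a real PostHog project key, and an Anthropic API key; the model name is only an example):

from posthog import Posthog
from posthog.ai.langchain import CallbackHandler
from langchain_anthropic import ChatAnthropic

posthog = Posthog("<project_api_key>", host="https://us.i.posthog.com")
callback = CallbackHandler(posthog)

model = ChatAnthropic(model="claude-3-5-sonnet-latest")
# With Anthropic prompt caching in play, the captured $ai_input_tokens now excludes
# both cache reads and cache writes, so cached tokens are no longer double counted.
result = model.invoke("Summarize our docs", config={"callbacks": [callback]})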

posthog/version.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-VERSION = "7.4.2"
+VERSION = "7.4.3"
 
 if __name__ == "__main__":
     print(VERSION, end="")  # noqa: T201

0 commit comments
