@@ -101,6 +101,7 @@ class UsageMetadataChunk(BaseModel):
   prompt_tokens: int
   completion_tokens: int
   total_tokens: int
+  cached_prompt_tokens: int = 0


 class LiteLLMClient:
@@ -217,6 +218,59 @@ def _append_fallback_user_content_if_missing(
   )


+def _extract_cached_prompt_tokens(usage: Any) -> int:
+  """Extracts cached prompt tokens from LiteLLM usage.
+
+  Providers expose cached token metrics in different shapes. Common patterns:
+  - usage["prompt_tokens_details"]["cached_tokens"] (OpenAI/Azure style)
+  - usage["prompt_tokens_details"] is a list of dicts with cached_tokens
+  - usage["cached_prompt_tokens"] (LiteLLM-normalized for some providers)
+  - usage["cached_tokens"] (flat)
+
+  Args:
+    usage: Usage dictionary from LiteLLM response.
+
+  Returns:
+    Integer number of cached prompt tokens if present; otherwise 0.
+  """
+  try:
+    usage_dict = usage
+    if hasattr(usage, "model_dump"):
+      usage_dict = usage.model_dump()
+    elif isinstance(usage, str):
+      try:
+        usage_dict = json.loads(usage)
+      except json.JSONDecodeError:
+        return 0
+
+    if not isinstance(usage_dict, dict):
+      return 0
+
+    details = usage_dict.get("prompt_tokens_details")
+    if isinstance(details, dict):
+      value = details.get("cached_tokens")
+      if isinstance(value, int):
+        return value
+    elif isinstance(details, list):
+      total = sum(
+          item.get("cached_tokens", 0)
+          for item in details
+          if isinstance(item, dict)
+          and isinstance(item.get("cached_tokens"), int)
+      )
+      if total > 0:
+        return total
+
+    for key in ("cached_prompt_tokens", "cached_tokens"):
+      value = usage_dict.get(key)
+      if isinstance(value, int):
+        return value
+  except (TypeError, AttributeError) as e:
+    logger.debug("Error extracting cached prompt tokens: %s", e)
+
+  return 0
+
+
 def _content_to_message_param(
     content: types.Content,
 ) -> Union[Message, list[Message]]:
@@ -533,6 +587,7 @@ def _model_response_to_chunk(
         prompt_tokens=response["usage"].get("prompt_tokens", 0),
         completion_tokens=response["usage"].get("completion_tokens", 0),
         total_tokens=response["usage"].get("total_tokens", 0),
+        cached_prompt_tokens=_extract_cached_prompt_tokens(response["usage"]),
     ), None


@@ -576,6 +631,9 @@ def _model_response_to_generate_content_response(
         prompt_token_count=response["usage"].get("prompt_tokens", 0),
         candidates_token_count=response["usage"].get("completion_tokens", 0),
        total_token_count=response["usage"].get("total_tokens", 0),
+        cached_content_token_count=_extract_cached_prompt_tokens(
+            response["usage"]
+        ),
     )
   return llm_response

@@ -965,6 +1023,7 @@ async def generate_content_async(
               prompt_token_count=chunk.prompt_tokens,
               candidates_token_count=chunk.completion_tokens,
               total_token_count=chunk.total_tokens,
+              cached_content_token_count=chunk.cached_prompt_tokens,
           )

           if (
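
For reference, here is a minimal sketch (not part of the patch) of how the new `_extract_cached_prompt_tokens` helper would behave on the usage shapes listed in its docstring; the payloads and token counts below are illustrative, not taken from any real provider response:

```python
# Illustrative inputs only; values are made up, shapes follow the patterns
# described in the helper's docstring.
openai_style = {
    "prompt_tokens": 1200,
    "completion_tokens": 80,
    "total_tokens": 1280,
    "prompt_tokens_details": {"cached_tokens": 1024},
}
flat_style = {
    "prompt_tokens": 600,
    "completion_tokens": 40,
    "total_tokens": 640,
    "cached_tokens": 512,
}

assert _extract_cached_prompt_tokens(openai_style) == 1024  # nested details dict
assert _extract_cached_prompt_tokens(flat_style) == 512     # flat key fallback
assert _extract_cached_prompt_tokens({"prompt_tokens": 10}) == 0  # nothing cached
```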