182 changes: 182 additions & 0 deletions integrations/llms/gemini.mdx
@@ -1003,6 +1003,188 @@ curl --location 'https://api.portkey.ai/v1/chat/completions' \

---

## Explicit context caching

Google Gemini supports [context caching](https://ai.google.dev/gemini-api/docs/caching) to reduce cost and latency for prompts that reuse a large amount of shared context. You explicitly create a cache once, then reference it in subsequent inference requests.

### Step 1: Create a context cache

Use the Gemini `cachedContents` endpoint through Portkey to create a cache:

<Tabs>
<Tab title="cURL">
```sh
curl 'https://api.portkey.ai/v1/cached_contents' \
  -H 'x-portkey-provider: google' \
  -H 'Content-Type: application/json' \
  -H 'x-portkey-api-key: {{your_api_key}}' \
  -H 'Authorization: {{your_gemini_api_key}}' \
  -H 'x-portkey-custom-host: https://generativelanguage.googleapis.com/v1beta' \
  -d '{
    "model": "models/gemini-1.5-pro-001",
    "displayName": "my-cache-display-name",
    "contents": [{
      "role": "user",
      "parts": [{
        "text": "This is sample text to demonstrate explicit caching. (you need a minimum of 1024 tokens)"
      }]
    },
    {
      "role": "model",
      "parts": [{
        "text": "Thank you, I am your helpful assistant."
      }]
    }]
  }'
```
</Tab>
<Tab title="Python">
```python
import requests

url = "https://api.portkey.ai/v1/cached_contents"
headers = {
    "x-portkey-provider": "google",
    "Content-Type": "application/json",
    "x-portkey-api-key": "PORTKEY_API_KEY",
    "Authorization": "GEMINI_API_KEY",
    "x-portkey-custom-host": "https://generativelanguage.googleapis.com/v1beta"
}

payload = {
    "model": "models/gemini-1.5-pro-001",
    "displayName": "my-cache-display-name",
    "contents": [{
        "role": "user",
        "parts": [{
            "text": "This is sample text to demonstrate explicit caching. (you need a minimum of 1024 tokens)"
        }]
    },
    {
        "role": "model",
        "parts": [{
            "text": "Thank you, I am your helpful assistant."
        }]
    }]
}

response = requests.post(url, headers=headers, json=payload)
print(response.json())
# Save the cache name from the response for use in step 2
```
</Tab>
<Tab title="NodeJS">
```javascript
const response = await fetch("https://api.portkey.ai/v1/cached_contents", {
  method: "POST",
  headers: {
    "x-portkey-provider": "google",
    "Content-Type": "application/json",
    "x-portkey-api-key": "PORTKEY_API_KEY",
    "Authorization": "GEMINI_API_KEY",
    "x-portkey-custom-host": "https://generativelanguage.googleapis.com/v1beta"
  },
  body: JSON.stringify({
    model: "models/gemini-1.5-pro-001",
    displayName: "my-cache-display-name",
    contents: [{
      role: "user",
      parts: [{
        text: "This is sample text to demonstrate explicit caching. (you need a minimum of 1024 tokens)"
      }]
    },
    {
      role: "model",
      parts: [{
        text: "Thank you, I am your helpful assistant."
      }]
    }]
  })
});

const data = await response.json();
console.log(data);
// Save the cache name from the response for use in step 2
```
</Tab>
</Tabs>
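
The create call returns a cached-content resource whose `name` field is the identifier you will pass in step 2, along with metadata such as `expireTime`. A minimal sketch of pulling it out of the Python example above (variable names are illustrative):

```python
# Continuing from the Python example above: keep the cache identifier
# returned by the create call so it can be reused in step 2.
cache = response.json()
cache_name = cache["name"]                   # e.g. "cachedContents/abc123"
print(cache_name, cache.get("expireTime"))   # expireTime: when the cache lapses
```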

<Note>
Context caching requires a minimum of **1024 tokens** in the cached content. Cached content expires after a default TTL (time-to-live), which you can override with the `ttl` parameter when creating the cache.
</Note>
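
If you want to control the lifetime yourself, you can include a `ttl` in the create payload from step 1; the Gemini API expects it as a duration string in seconds (e.g. `"3600s"`). A minimal sketch building on the Python example above:

```python
# Same create request as in step 1, but with an explicit 1-hour TTL.
payload_with_ttl = {**payload, "ttl": "3600s"}

response = requests.post(url, headers=headers, json=payload_with_ttl)
print(response.json().get("expireTime"))  # timestamp when this cache will expire
```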

### Step 2: Use the cache in inference requests

Once the cache is created, reference it in your chat completion requests with the `cached_content` parameter, passing the cache `name` returned in step 1 (e.g., `cachedContents/abc123`):

<Tabs>
<Tab title="cURL">
```sh
curl 'https://api.portkey.ai/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -H 'x-portkey-api-key: {{your_api_key}}' \
  -H 'x-portkey-provider: google' \
  -H 'Authorization: {{your_gemini_api_key}}' \
  -d '{
    "model": "gemini-1.5-pro-001",
    "cached_content": "cachedContents/abc123",
    "messages": [
      {
        "role": "user",
        "content": "Based on the context I provided earlier, answer my question."
      }
    ]
  }'
```
</Tab>
<Tab title="Python">
```python
from portkey_ai import Portkey

portkey = Portkey(
    api_key="PORTKEY_API_KEY",
)

completion = portkey.chat.completions.create(
    model="@google/gemini-1.5-pro-001",
    cached_content="cachedContents/abc123",
    messages=[
        {"role": "user", "content": "Based on the context I provided earlier, answer my question."}
    ]
)

print(completion)
```
</Tab>
<Tab title="NodeJS">
```javascript
import Portkey from 'portkey-ai';

const portkey = new Portkey({
  apiKey: "PORTKEY_API_KEY",
});

const completion = await portkey.chat.completions.create({
  model: "@google/gemini-1.5-pro-001",
  cached_content: "cachedContents/abc123",
  messages: [
    { role: "user", content: "Based on the context I provided earlier, answer my question." }
  ]
});

console.log(completion);
```
</Tab>
</Tabs>

<Warning>
The model used in the inference request **must match** the model used when creating the cache.
</Warning>
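
Beyond creation and inference, the underlying Gemini API also lets you inspect, update, and delete caches through the `cachedContents/{id}` resource. Below is a minimal sketch that calls the Gemini endpoint directly; whether these management routes are also proxied through Portkey's `/v1/cached_contents` path is an assumption you should verify for your gateway version:

```python
import requests

GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
GEMINI_API_KEY = "GEMINI_API_KEY"
cache_name = "cachedContents/abc123"  # the name returned when the cache was created

# Inspect the cache (model, expireTime, usage metadata, ...)
info = requests.get(f"{GEMINI_BASE}/{cache_name}", params={"key": GEMINI_API_KEY})
print(info.json())

# Delete the cache once it is no longer needed, so you stop paying for cached-token storage
requests.delete(f"{GEMINI_BASE}/{cache_name}", params={"key": GEMINI_API_KEY})
```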

For more details on context caching options like TTL configuration and cache management, refer to the [Google Gemini context caching documentation](https://ai.google.dev/gemini-api/docs/caching).

---

## Thought Signatures (Tool Calling Verification)

<Note>