diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts index 5740fe46..276fa66c 100644 --- a/src/lib/navigation.ts +++ b/src/lib/navigation.ts @@ -353,36 +353,87 @@ export const tabNavigation: NavTab[] = [ { title: 'Concepts', items: [ - { title: 'Core Concepts', href: '/docs/prism/concepts/core' }, - { title: 'API Reference', href: '/docs/prism/concepts/api-reference' }, + { title: 'How it works', href: '/docs/prism/concepts/core' }, + { title: 'Virtual keys & access control', href: '/docs/prism/concepts/virtual-keys' }, { title: 'Configuration', href: '/docs/prism/concepts/configuration' }, - { title: 'Platform Integration', href: '/docs/prism/concepts/platform-integration' }, + { title: 'Platform integration', href: '/docs/prism/concepts/platform-integration' }, ] }, { title: 'Features', items: [ - { title: 'Manage Providers', href: '/docs/prism/features/providers' }, - { title: 'Routing & Reliability', href: '/docs/prism/features/routing' }, - { title: 'Guardrails', href: '/docs/prism/features/guardrails' }, - { title: 'Caching', href: '/docs/prism/features/caching' }, - { title: 'Cost Tracking & Budgets', href: '/docs/prism/features/cost-tracking' }, - { title: 'Streaming', href: '/docs/prism/features/streaming' }, - { title: 'Shadow Experiments', href: '/docs/prism/features/shadow-experiments' }, - { title: 'Rate Limiting', href: '/docs/prism/features/rate-limiting' }, - { title: 'MCP & A2A', href: '/docs/prism/features/mcp-a2a' }, - { title: 'Simulation Using SDK', href: '/docs/simulation/features/simulation-using-sdk' }, - { title: 'Evaluate Tool Calling', href: '/docs/simulation/features/evaluate-tool-calling' }, - { title: 'Fix My Agent', href: '/docs/simulation/features/fix-my-agent' }, - { title: 'Replay', href: '/docs/simulation/features/observe-to-simulate' }, - { title: 'Voice Replay', href: '/docs/simulation/features/voice-replay' }, - { title: 'Prompt Simulation', href: '/docs/simulation/features/prompt-simulation' }, + { + title: 'Providers', 
+ items: [ + { title: 'Supported providers', href: '/docs/prism/features/providers' }, + { title: 'Self-hosted models', href: '/docs/prism/features/self-hosted-models' }, + ] + }, + { + title: 'API Reference', + items: [ + { title: 'Endpoints overview', href: '/docs/prism/api/endpoints' }, + { title: 'Chat completions', href: '/docs/prism/api/chat' }, + { title: 'Embeddings & reranking', href: '/docs/prism/api/embeddings' }, + { title: 'Media endpoints', href: '/docs/prism/api/media' }, + { title: 'Assistants API', href: '/docs/prism/api/assistants' }, + { title: 'Files & vector stores', href: '/docs/prism/api/files' }, + { title: 'Async & batch', href: '/docs/prism/api/async-batch' }, + { title: 'Request & response headers', href: '/docs/prism/api/headers' }, + ] + }, + { + title: 'Routing', + items: [ + { title: 'Routing & reliability', href: '/docs/prism/features/routing' }, + ] + }, + { + title: 'Safety & Policy', + items: [ + { title: 'Guardrails', href: '/docs/prism/features/guardrails' }, + ] + }, + { + title: 'Performance', + items: [ + { title: 'Caching', href: '/docs/prism/features/caching' }, + { title: 'Rate limiting', href: '/docs/prism/features/rate-limiting' }, + ] + }, + { + title: 'Cost & Observability', + items: [ + { title: 'Cost tracking', href: '/docs/prism/features/cost-tracking' }, + { title: 'Observability', href: '/docs/prism/features/observability' }, + { title: 'Shadow experiments', href: '/docs/prism/features/shadow-experiments' }, + ] + }, + { + title: 'Agentic', + items: [ + { title: 'MCP & A2A', href: '/docs/prism/features/mcp-a2a' }, + ] + }, + ] + }, + { + title: 'Admin', + items: [ + { title: 'Organization management', href: '/docs/prism/admin/organizations' }, ] }, { title: 'Deployment', items: [ - { title: 'Self-Hosted', href: '/docs/prism/deployment/self-hosted' }, + { title: 'Self-hosted', href: '/docs/prism/deployment/self-hosted' }, + ] + }, + { + title: 'Guides', + items: [ + { title: 'Error handling', href: 
'/docs/prism/guides/errors' }, + { title: 'Troubleshooting', href: '/docs/prism/guides/troubleshooting' }, ] }, ] diff --git a/src/pages/docs/prism/admin/organizations.mdx b/src/pages/docs/prism/admin/organizations.mdx new file mode 100644 index 00000000..fed3164d --- /dev/null +++ b/src/pages/docs/prism/admin/organizations.mdx @@ -0,0 +1,177 @@ +--- +title: "Organization management" +description: "Manage organizations, members, and org-level settings in Prism." +--- + +## About + +Each Prism organization is an isolated environment with its own providers, routing rules, rate limits, budgets, and API keys. Organizations are the top-level unit for multi-tenancy in Prism. + +--- + +## Organization settings + +Organization config controls all gateway behavior for that org. Settings are managed via the dashboard or the admin API. + + + + + +Go to **Settings > Organization** in the Future AGI dashboard. From here you can: + +- View and edit org-level configuration (providers, routing, caching, etc.) 
+- Manage members and roles +- View API key inventory +- Set budgets and rate limits + + + + + +```python +from prism import Prism + +# base_url = inference gateway, control_plane_url = admin/config API +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + control_plane_url="https://api.futureagi.com", +) + +# Get org config +config = client.org_configs.retrieve(org_id="your-org-id") + +# Update org config +client.org_configs.update( + org_id="your-org-id", + config={ + "rate_limiting": { + "enabled": True, + "rpm": 1000, + }, + "budgets": { + "limit": 500.00, + "period": "monthly", + }, + }, +) +``` + + + + + +```typescript +import { Prism } from "@futureagi/prism"; + +const client = new Prism({ + apiKey: "sk-prism-your-key", + baseUrl: "https://gateway.futureagi.com", + controlPlaneUrl: "https://api.futureagi.com", +}); + +const config = await client.orgConfigs.retrieve({ + orgId: "your-org-id", +}); + +await client.orgConfigs.update({ + orgId: "your-org-id", + config: { + rate_limiting: { + enabled: true, + rpm: 1000, + }, + budgets: { + limit: 500.0, + period: "monthly", + }, + }, +}); +``` + + + + + +--- + +## Members and roles + +Organizations can have multiple members with different roles. + +| Role | Permissions | +|---|---| +| **Owner** | Full access. Can delete the org, manage billing, and change all settings. | +| **Admin** | Can manage providers, keys, routing, budgets, and members (except owner). | +| **Member** | Can view config and create API keys. Cannot change org settings. | +| **Viewer** | Read-only access to dashboard, logs, and analytics. | + +### Managing members + +Members are managed through the Future AGI dashboard at **Settings > Members**. Invite new members by email. Each member can belong to multiple organizations. + +--- + +## API key management + +Each organization has its own pool of API keys (virtual keys). Keys inherit org-level settings and can have additional per-key restrictions. 
+ +```python +# List keys for an org +keys = client.keys.list(org_id="your-org-id") +for key in keys: + print(f"{key.name}: {key.key_prefix}...") + +# Create a new key +new_key = client.keys.create( + org_id="your-org-id", + name="backend-service", + rate_limit_rpm=100, + allowed_models=["gpt-4o", "gpt-4o-mini"], +) +print(f"Key: {new_key.key}") # full key shown only at creation + +# Revoke a key +client.keys.delete(key_id=new_key.id) +``` + +See [Virtual keys & access control](/docs/prism/concepts/virtual-keys) for detailed key configuration (RBAC, IP ACL, model restrictions). + +--- + +## Multi-tenancy patterns + +### One org per customer + +For SaaS products, create a separate org per customer. Each customer gets isolated providers, budgets, and rate limits: + +- Customer A: budget $100/month, access to gpt-4o-mini only +- Customer B: budget $500/month, access to gpt-4o and claude-sonnet-4-6 +- Customer C: unlimited budget, all models + +### One org with per-key isolation + +For internal teams, use a single org with per-key restrictions: + +- Marketing team key: rate limit 50 RPM, budget $200/month +- Engineering team key: rate limit 500 RPM, budget $1000/month +- Data science key: rate limit 200 RPM, all models, no budget cap + +--- + +## Next Steps + + + + Per-key restrictions, RBAC, and IP ACL + + + Configuration hierarchy and sections + + + Per-org and per-key rate limits + + + Cost attribution across teams + + diff --git a/src/pages/docs/prism/api/assistants.mdx b/src/pages/docs/prism/api/assistants.mdx new file mode 100644 index 00000000..7f8ef78e --- /dev/null +++ b/src/pages/docs/prism/api/assistants.mdx @@ -0,0 +1,347 @@ +--- +title: "Assistants API" +description: "Use the OpenAI Assistants API through Prism for managed conversations with tool use and file retrieval." +--- + +## About + +Prism fully proxies the OpenAI Assistants API. Create assistants with instructions and tools, manage conversation threads, and execute runs - all through the gateway. 
You get the same Assistants API you'd use with OpenAI directly, plus Prism's routing, cost tracking, rate limiting, and logging on every call. + +The Assistants API is stateful (OpenAI stores threads and messages server-side), so it only works with OpenAI as the provider. Use the OpenAI SDK pointed at Prism. + + +Routing and failover do not apply to the Assistants API. Threads and runs are stored on OpenAI's servers, so the assistant's model must be an OpenAI model. + + +--- + +## Endpoints + +### Assistants + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/assistants` | Create an assistant | +| GET | `/v1/assistants` | List assistants | +| GET | `/v1/assistants/{id}` | Get an assistant | +| POST | `/v1/assistants/{id}` | Update an assistant | +| DELETE | `/v1/assistants/{id}` | Delete an assistant | + +### Threads + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/threads` | Create a thread | +| GET | `/v1/threads/{id}` | Get a thread | +| POST | `/v1/threads/{id}` | Update a thread | +| DELETE | `/v1/threads/{id}` | Delete a thread | + +### Messages + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/threads/{id}/messages` | Add a message to a thread | +| GET | `/v1/threads/{id}/messages` | List messages in a thread | +| GET | `/v1/threads/{id}/messages/{msg_id}` | Get a message | +| POST | `/v1/threads/{id}/messages/{msg_id}` | Update a message | +| DELETE | `/v1/threads/{id}/messages/{msg_id}` | Delete a message | + +### Runs + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/threads/{id}/runs` | Create a run | +| GET | `/v1/threads/{id}/runs` | List runs | +| GET | `/v1/threads/{id}/runs/{run_id}` | Get a run | +| POST | `/v1/threads/{id}/runs/{run_id}` | Update a run | +| POST | `/v1/threads/{id}/runs/{run_id}/cancel` | Cancel a run | +| POST | `/v1/threads/{id}/runs/{run_id}/submit_tool_outputs` | Submit tool outputs | +| GET | `/v1/threads/{id}/runs/{run_id}/steps` | List run steps | +| POST | 
`/v1/threads/runs` | Create thread and run in one call | + +--- + +## Quick example + +Create an assistant, start a conversation, and get a response: + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +# 1. Create an assistant +assistant = client.beta.assistants.create( + name="Math Tutor", + instructions="You are a math tutor. Explain concepts step by step.", + model="gpt-4o", +) + +# 2. Create a thread +thread = client.beta.threads.create() + +# 3. Add a message +client.beta.threads.messages.create( + thread_id=thread.id, + role="user", + content="Explain the Pythagorean theorem", +) + +# 4. Run the assistant +run = client.beta.threads.runs.create_and_poll( + thread_id=thread.id, + assistant_id=assistant.id, +) + +# 5. Get the response +if run.status == "completed": + messages = client.beta.threads.messages.list(thread_id=thread.id) + for msg in messages.data: + if msg.role == "assistant": + print(msg.content[0].text.value) + break +``` + + + + + +```bash +# 1. Create an assistant +ASSISTANT_ID=$(curl -s -X POST https://gateway.futureagi.com/v1/assistants \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -H "OpenAI-Beta: assistants=v2" \ + -d '{ + "name": "Math Tutor", + "instructions": "You are a math tutor. Explain concepts step by step.", + "model": "gpt-4o" + }' | jq -r '.id') + +# 2. Create a thread +THREAD_ID=$(curl -s -X POST https://gateway.futureagi.com/v1/threads \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -H "OpenAI-Beta: assistants=v2" \ + -d '{}' | jq -r '.id') + +# 3. Add a message +curl -s -X POST "https://gateway.futureagi.com/v1/threads/$THREAD_ID/messages" \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -H "OpenAI-Beta: assistants=v2" \ + -d '{"role": "user", "content": "Explain the Pythagorean theorem"}' + +# 4. 
Create a run +RUN_ID=$(curl -s -X POST "https://gateway.futureagi.com/v1/threads/$THREAD_ID/runs" \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -H "OpenAI-Beta: assistants=v2" \ + -d "{\"assistant_id\": \"$ASSISTANT_ID\"}" | jq -r '.id') + +# 5. Poll until complete, then get messages +# (poll GET /v1/threads/$THREAD_ID/runs/$RUN_ID until status is "completed") +curl -s "https://gateway.futureagi.com/v1/threads/$THREAD_ID/messages" \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "OpenAI-Beta: assistants=v2" | jq '.data[0].content[0].text.value' +``` + + + + + +--- + +## Tool use + +Assistants can call tools (functions you define) during a run. When the run enters `requires_action` status, you submit tool outputs to continue. + +```python +import json + +# Create assistant with tools +assistant = client.beta.assistants.create( + name="Weather Bot", + instructions="You help users check the weather.", + model="gpt-4o", + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather for a city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name"}, + }, + "required": ["city"], + }, + }, + }], +) + +thread = client.beta.threads.create() +client.beta.threads.messages.create( + thread_id=thread.id, + role="user", + content="What's the weather in Tokyo?", +) + +run = client.beta.threads.runs.create( + thread_id=thread.id, + assistant_id=assistant.id, +) + +# Poll until the run needs action or completes +import time +while run.status in ("queued", "in_progress"): + time.sleep(1) + run = client.beta.threads.runs.retrieve( + thread_id=thread.id, + run_id=run.id, + ) + +if run.status == "requires_action": + tool_calls = run.required_action.submit_tool_outputs.tool_calls + + # Process each tool call + tool_outputs = [] + for call in tool_calls: + args = json.loads(call.function.arguments) + # Your actual function call 
here + result = f"22°C and sunny in {args['city']}" + tool_outputs.append({ + "tool_call_id": call.id, + "output": result, + }) + + # Submit outputs and wait for completion + run = client.beta.threads.runs.submit_tool_outputs_and_poll( + thread_id=thread.id, + run_id=run.id, + tool_outputs=tool_outputs, + ) + +if run.status == "completed": + messages = client.beta.threads.messages.list(thread_id=thread.id) + print(messages.data[0].content[0].text.value) +``` + +--- + +## File search + +Assistants can search uploaded files using vector stores. Upload files, attach them to a vector store, then give the assistant access: + +```python +# Upload a file +file = client.files.create( + file=open("knowledge_base.pdf", "rb"), + purpose="assistants", +) + +# Create a vector store and add the file +vector_store = client.beta.vector_stores.create(name="Knowledge Base") +client.beta.vector_stores.files.create( + vector_store_id=vector_store.id, + file_id=file.id, +) + +# Create assistant with file search +assistant = client.beta.assistants.create( + name="Research Assistant", + instructions="Answer questions using the provided documents.", + model="gpt-4o", + tools=[{"type": "file_search"}], + tool_resources={ + "file_search": { + "vector_store_ids": [vector_store.id], + } + }, +) + +# Ask a question about the uploaded file +thread = client.beta.threads.create() +client.beta.threads.messages.create( + thread_id=thread.id, + role="user", + content="What does the document say about quarterly revenue?", +) + +run = client.beta.threads.runs.create_and_poll( + thread_id=thread.id, + assistant_id=assistant.id, +) + +if run.status == "completed": + messages = client.beta.threads.messages.list(thread_id=thread.id) + print(messages.data[0].content[0].text.value) +``` + +--- + +## Streaming runs + +Stream run events for real-time UI updates instead of polling: + +```python +from openai import AssistantEventHandler + +class MyHandler(AssistantEventHandler): + def on_text_created(self, 
text): + print("\nassistant > ", end="", flush=True) + + def on_text_delta(self, delta, snapshot): + print(delta.value, end="", flush=True) + + def on_tool_call_created(self, tool_call): + print(f"\n Tool call: {tool_call.type}", flush=True) + +# Using thread and assistant from earlier examples +with client.beta.threads.runs.stream( + thread_id=thread.id, # from the thread you created + assistant_id=assistant.id, # from the assistant you created + event_handler=MyHandler(), +) as stream: + stream.until_done() +``` + +--- + +## What Prism adds + +Since Prism proxies every Assistants API call, you get: + +- **Cost tracking**: Every run, message creation, and retrieval call is logged with cost in the `x-prism-cost` header +- **Rate limiting**: Per-key and per-org limits apply to all Assistants API calls +- **Logging**: Full request/response logging for debugging and compliance +- **Access control**: Virtual key restrictions (allowed models, IP ACL) apply to the assistant's model + +The `x-prism-*` response headers are returned on every Assistants API response, just like any other Prism endpoint. + + +--- + +## Next Steps + + + + Stateless text generation (no thread management) + + + Full list of all 97 gateway endpoints + + + Control access and permissions per key + + + Monitor spend across all API calls + + diff --git a/src/pages/docs/prism/api/async-batch.mdx b/src/pages/docs/prism/api/async-batch.mdx new file mode 100644 index 00000000..fd68cc3d --- /dev/null +++ b/src/pages/docs/prism/api/async-batch.mdx @@ -0,0 +1,262 @@ +--- +title: "Async & batch" +description: "Run inference jobs asynchronously or process large batches of requests through the Prism Gateway." +--- + +## About + +Prism supports two modes for deferred processing: **async inference** sends a single request and returns a job ID you poll for the result, and **batch processing** submits many requests at once for bulk execution at lower cost. 
+ +Both modes support all the same models and parameters as synchronous chat completions. + +--- + +## Endpoints + +| Method | Path | Description | +|---|---|---| +| GET | `/v1/async/{job_id}` | Get async job status and result | +| DELETE | `/v1/async/{job_id}` | Cancel an async job | +| POST | `/v1/scheduled` | Schedule a completion for later | +| GET | `/v1/scheduled` | List scheduled jobs | +| GET | `/v1/scheduled/{job_id}` | Get a scheduled job | +| DELETE | `/v1/scheduled/{job_id}` | Cancel a scheduled job | + +--- + +## Async inference + +Send a chat completion request with async mode enabled. The gateway returns immediately with a job ID. Poll the job endpoint to get the result when it's ready. + +### Sending an async request + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +# Send async request with x-prism-async header +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Write a detailed essay about climate change"}], + extra_headers={"x-prism-async": "true"}, +) + +# Response contains the job ID +job_id = response.id +print(f"Job ID: {job_id}") +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -H "x-prism-async: true" \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Write a detailed essay about climate change"}] + }' +``` + + + + + +### Polling for results + +```python +import time +import requests + +headers = {"Authorization": "Bearer sk-prism-your-key"} + +while True: + resp = requests.get( + f"https://gateway.futureagi.com/v1/async/{job_id}", + headers=headers, + ) + data = resp.json() + + if data["status"] == "completed": + print(data["result"]["choices"][0]["message"]["content"]) + break + elif data["status"] == "failed": + print(f"Job failed: 
{data.get('error')}") + break + else: + time.sleep(2) +``` + +### Job statuses + +| Status | Description | +|---|---| +| `pending` | Job is queued | +| `running` | Job is being processed | +| `completed` | Result is ready | +| `failed` | Job failed (check `error` field) | +| `cancelled` | Job was cancelled | + +--- + +## Scheduled completions + +Schedule a request to run at a specific time. Useful for time-sensitive content generation or deferred workloads. + +```bash +curl -X POST https://gateway.futureagi.com/v1/scheduled \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "scheduled_at": "2026-04-05T09:00:00Z", + "request": { + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Generate the daily summary report"}] + } + }' +``` + +### Managing scheduled jobs + +```bash +# List scheduled jobs +curl https://gateway.futureagi.com/v1/scheduled \ + -H "Authorization: Bearer sk-prism-your-key" + +# Get a specific job +curl https://gateway.futureagi.com/v1/scheduled/job_123 \ + -H "Authorization: Bearer sk-prism-your-key" + +# Cancel a scheduled job +curl -X DELETE https://gateway.futureagi.com/v1/scheduled/job_123 \ + -H "Authorization: Bearer sk-prism-your-key" +``` + +--- + +## Batch processing + +For high-volume workloads, the OpenAI Batch API lets you submit a file of requests and retrieve results when processing is complete. Batch requests typically run at lower cost (50% discount with OpenAI). + +### Creating a batch + +```python +from openai import OpenAI +import json + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +# 1. 
Create a JSONL file with requests +requests_data = [ + { + "custom_id": "req-1", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Summarize: Machine learning is..."}], + }, + }, + { + "custom_id": "req-2", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Summarize: Neural networks are..."}], + }, + }, +] + +with open("batch_input.jsonl", "w") as f: + for req in requests_data: + f.write(json.dumps(req) + "\n") + +# 2. Upload the input file +input_file = client.files.create( + file=open("batch_input.jsonl", "rb"), + purpose="batch", +) + +# 3. Create the batch +batch = client.batches.create( + input_file_id=input_file.id, + endpoint="/v1/chat/completions", + completion_window="24h", +) +print(f"Batch ID: {batch.id}, Status: {batch.status}") +``` + +### Checking batch status + +```python +import time + +while True: + batch = client.batches.retrieve(batch.id) + print(f"Status: {batch.status} ({batch.request_counts.completed}/{batch.request_counts.total})") + + if batch.status == "completed": + break + elif batch.status in ("failed", "cancelled", "expired"): + print(f"Batch ended: {batch.status}") + break + + time.sleep(30) +``` + +### Retrieving results + +```python +if batch.output_file_id: + content = client.files.content(batch.output_file_id) + results = content.text.strip().split("\n") + + for line in results: + result = json.loads(line) + print(f"{result['custom_id']}: {result['response']['body']['choices'][0]['message']['content'][:100]}") +``` + +--- + +## When to use each mode + +| Mode | Best for | Latency | Cost | +|---|---|---|---| +| Synchronous | Interactive apps, real-time responses | Lowest | Standard | +| Async | Long-running requests, fire-and-forget | Medium (poll) | Standard | +| Scheduled | Time-triggered jobs, deferred work | Scheduled | Standard | +| Batch | High-volume 
processing, data pipelines | Hours | Discounted (up to 50% off) | + +--- + +## Next Steps + + + + Synchronous text generation + + + Monitor batch and async job costs + + + Per-key limits apply to batch submissions + + + Full list of all gateway endpoints + + diff --git a/src/pages/docs/prism/api/chat.mdx b/src/pages/docs/prism/api/chat.mdx new file mode 100644 index 00000000..04bafea8 --- /dev/null +++ b/src/pages/docs/prism/api/chat.mdx @@ -0,0 +1,694 @@ +--- +title: "Chat completions" +description: "The primary endpoint for generating text with LLMs through Prism. Supports streaming, function calling, vision, and structured outputs." +--- + +## About + +`POST /v1/chat/completions` is the main endpoint. It works exactly like the OpenAI API — same request body, same response format. Prism adds routing, caching, guardrails, and cost tracking transparently, and supports streaming via SSE. + +## Basic usage + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], +) + +print(response.choices[0].message.content) +``` + + + + + +```python +from openai import OpenAI + +# Same OpenAI SDK, just swap base_url and api_key +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], +) + +print(response.choices[0].message.content) +``` + + + + + +```python +import litellm + +response = litellm.completion( + model="openai/gpt-4o", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": 
"What is the capital of France?"}, + ], + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com/v1", +) + +print(response.choices[0].message.content) +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"} + ] + }' +``` + + + + + +--- + +## Request body + +All standard OpenAI chat completion parameters are supported: + +| Parameter | Type | Description | +|---|---|---| +| `model` | string | **Required.** The model to use (e.g., `gpt-4o`, `claude-sonnet-4-6`). | +| `messages` | array | **Required.** The conversation messages. See [Message format](#message-format) below. | +| `temperature` | number | Sampling temperature (0-2). | +| `top_p` | number | Nucleus sampling (0-1). | +| `n` | integer | Number of completions to generate. | +| `stream` | boolean | Enable SSE streaming. See [Streaming](#streaming). | +| `stream_options` | object | `{include_usage: true}` to get token counts in the final chunk. | +| `stop` | string or array | Stop sequences. | +| `max_tokens` | integer | Maximum tokens to generate. | +| `max_completion_tokens` | integer | Max tokens for o1/o3-style models. | +| `presence_penalty` | number | Penalize repeated topics (-2 to 2). | +| `frequency_penalty` | number | Penalize repeated tokens (-2 to 2). | +| `logit_bias` | object | Token ID to bias value mapping. | +| `logprobs` | boolean | Return log probabilities. | +| `top_logprobs` | integer | Number of top log probs per token (0-20). | +| `user` | string | End-user ID for tracking and rate limiting. | +| `seed` | integer | Seed for reproducible outputs. | +| `tools` | array | Function definitions for tool/function calling. 
| +| `tool_choice` | string or object | `"auto"`, `"none"`, `"required"`, or a specific tool. | +| `response_format` | object | `{type: "json_object"}` or `{type: "json_schema", json_schema: {...}}`. | +| `modalities` | array | Output modalities, e.g., `["text", "audio"]`. | +| `audio` | object | Audio output config: `{voice: "alloy", format: "wav"}`. | + + +Prism passes through unknown fields to the provider. Provider-specific parameters (like Anthropic's `thinking` or any vendor extension) work without Prism needing to know about them. + + +--- + +## Response body + +```json +{ + "id": "chatcmpl-abc123", + "object": "chat.completion", + "created": 1711000000, + "model": "gpt-4o-2024-08-06", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital of France is Paris." + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 25, + "completion_tokens": 8, + "total_tokens": 33 + } +} +``` + +| Field | Description | +|---|---| +| `choices[].finish_reason` | `"stop"` (natural end), `"length"` (hit max tokens), `"tool_calls"` (model wants to call a function), `"content_filter"` (blocked by provider) | +| `usage` | Token counts. Always present on non-streaming responses. | + +--- + +## Streaming + +Set `stream: true` to receive the response as Server-Sent Events (SSE). Each chunk arrives as a `data:` line: + +``` +data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"The"},"finish_reason":null}]} + +data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":" capital"},"finish_reason":null}]} + +... + +data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":25,"completion_tokens":8,"total_tokens":33}} + +data: [DONE] +``` + +The final chunk before `[DONE]` includes `usage` with token counts. 
Prism forces `stream_options.include_usage = true` on every streaming request so that cost tracking and credit deduction work correctly. + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +stream = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Write a haiku about coding"}], + stream=True, +) + +for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +stream = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Write a haiku about coding"}], + stream=True, +) + +for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + + + + + +```python +import litellm + +response = litellm.completion( + model="openai/gpt-4o", + messages=[{"role": "user", "content": "Write a haiku about coding"}], + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com/v1", + stream=True, +) + +for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Write a haiku about coding"}], + "stream": true + }' +``` + + + + + +### Streaming behavior + +- **Pre-request plugins** (guardrails, rate limiting, etc.) run before the stream starts. If a guardrail blocks the request, you get a JSON error response, not a stream. +- **Post-response plugins** (cost, logging, metrics) run after the final chunk, once token usage is known. 
+- **Cache**: Streaming requests bypass the cache entirely, both on read and write. +- **Failover**: Not supported mid-stream. If the provider fails after streaming starts, the error appears as an SSE data event. +- **Client disconnect**: Post-plugins still run even if you disconnect early, so cost tracking stays accurate. + +--- + +## Function calling + +Define tools in the request, and the model can choose to call them. The response will have `finish_reason: "tool_calls"` with the function name and arguments. + + + + + +```python +import json +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City name"}, + }, + "required": ["location"], + }, + }, + } +] + +messages = [{"role": "user", "content": "What's the weather in Tokyo?"}] + +# First call: model decides to call a tool +response = client.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=tools, + tool_choice="auto", +) + +if response.choices[0].finish_reason == "tool_calls": + # Add the assistant's tool call to the conversation + messages.append(response.choices[0].message) + + # Execute each tool call and add the result + for tool_call in response.choices[0].message.tool_calls: + args = json.loads(tool_call.function.arguments) + result = {"temperature": "22°C", "condition": "Sunny"} # your function here + + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": json.dumps(result), + }) + + # Second call: model uses the tool result to respond + final = client.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=tools, + ) + print(final.choices[0].message.content) +``` + + + + + +```python +import json +from openai import OpenAI + +client 
= OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City name"}, + }, + "required": ["location"], + }, + }, + } +] + +messages = [{"role": "user", "content": "What's the weather in Tokyo?"}] + +# First call: model decides to call a tool +response = client.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=tools, + tool_choice="auto", +) + +if response.choices[0].finish_reason == "tool_calls": + messages.append(response.choices[0].message) + + for tool_call in response.choices[0].message.tool_calls: + args = json.loads(tool_call.function.arguments) + result = {"temperature": "22°C", "condition": "Sunny"} # your function here + + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": json.dumps(result), + }) + + # Second call: model uses the tool result to respond + final = client.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=tools, + ) + print(final.choices[0].message.content) +``` + + + + + +```python +import json +import litellm + +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City name"}, + }, + "required": ["location"], + }, + }, + } +] + +messages = [{"role": "user", "content": "What's the weather in Tokyo?"}] + +response = litellm.completion( + model="openai/gpt-4o", + messages=messages, + tools=tools, + tool_choice="auto", + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com/v1", +) + +if response.choices[0].finish_reason == "tool_calls": + messages.append(response.choices[0].message) + + for tool_call in 
response.choices[0].message.tool_calls: + result = {"temperature": "22°C", "condition": "Sunny"} + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": json.dumps(result), + }) + + final = litellm.completion( + model="openai/gpt-4o", + messages=messages, + tools=tools, + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com/v1", + ) + print(final.choices[0].message.content) +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "What'\''s the weather in Tokyo?"}], + "tools": [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather for a location", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string"}}, + "required": ["location"] + } + } + }], + "tool_choice": "auto" + }' +``` + + + + + +Prism passes tools through to the provider without modification. All providers that support function calling (OpenAI, Anthropic, Gemini, etc.) work with the same tool definitions. 
+ +--- + +## Vision (multimodal inputs) + +Send images alongside text by using the content array format: + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}}, + ], + } + ], +) + +print(response.choices[0].message.content) +``` + + + + + +```python +import litellm + +response = litellm.completion( + model="openai/gpt-4o", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}}, + ], + } + ], + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com/v1", +) + +print(response.choices[0].message.content) +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o", + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": "What is in this image?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}} + ] + }] + }' +``` + + + + + + +Not all models support vision. Use a model with image understanding capabilities (gpt-4o, claude-sonnet-4-6, gemini-2.0-flash, etc.). + + +Both HTTPS URLs and base64 data URIs (`data:image/png;base64,...`) are supported. Prism translates the content format to each provider's native representation (Anthropic base64 blocks, Gemini inline parts, Bedrock image blocks). 
+ +--- + +## Structured outputs + +Force the model to return valid JSON matching a schema: + +```python +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "List 3 European capitals"}], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "capitals", + "schema": { + "type": "object", + "properties": { + "capitals": { + "type": "array", + "items": {"type": "string"}, + } + }, + "required": ["capitals"], + }, + }, + }, +) +``` + +Prism forwards `response_format` to the provider as-is. The provider handles constrained decoding. Use `"type": "json_object"` for simpler JSON without a schema. + +--- + +## Message format + +Each message in the `messages` array has: + +| Field | Type | Description | +|---|---|---| +| `role` | string | `"system"`, `"user"`, `"assistant"`, or `"tool"` | +| `content` | string or array | Text string, or array of content parts for multimodal inputs | +| `name` | string | Optional sender name | +| `tool_calls` | array | Tool calls made by the assistant (on assistant messages) | +| `tool_call_id` | string | ID of the tool call this message responds to (on tool messages) | + +--- + +## Response headers + +Prism adds these headers to every response (streaming and non-streaming): + +| Header | Description | +|---|---| +| `x-prism-request-id` | Unique request ID for log correlation | +| `x-prism-provider` | Which provider handled the request (e.g., `openai`) | +| `x-prism-latency-ms` | Total latency in milliseconds | +| `x-prism-model-used` | Actual model returned by the provider | +| `x-prism-cost` | Estimated cost in USD | +| `x-prism-cache` | `hit` or `miss` | +| `x-prism-guardrail-triggered` | `true` if a guardrail fired | +| `x-prism-fallback-used` | `true` if a fallback provider or model was used | +| `x-prism-routing-strategy` | Which routing strategy was applied | +| `x-prism-credits-remaining` | Remaining credit balance (managed keys) | +| `x-ratelimit-limit-requests` | 
Rate limit ceiling | +| `x-ratelimit-remaining-requests` | Remaining requests in current window | + +--- + +## Switching providers + +Change the model name to route to a different provider. The request format stays identical: + +```python +# OpenAI +response = client.chat.completions.create(model="gpt-4o", messages=messages) + +# Anthropic +response = client.chat.completions.create(model="claude-sonnet-4-6", messages=messages) + +# Gemini +response = client.chat.completions.create(model="gemini-2.0-flash", messages=messages) +``` + +Prism translates the request to each provider's native format. Your code doesn't change. + +--- + +## Next Steps + + + + Control which provider handles each request + + + Add safety checks to requests and responses + + + Cache responses to reduce latency and cost + + + See all available API endpoints + + diff --git a/src/pages/docs/prism/api/embeddings.mdx b/src/pages/docs/prism/api/embeddings.mdx new file mode 100644 index 00000000..333955cc --- /dev/null +++ b/src/pages/docs/prism/api/embeddings.mdx @@ -0,0 +1,420 @@ +--- +title: "Embeddings & reranking" +description: "Generate text embeddings and rerank documents through the Prism Gateway." +--- + +## About + +Prism proxies embedding and reranking requests to any configured provider. The API follows the OpenAI format for embeddings and a similar format for reranking. All gateway features (caching, cost tracking, rate limiting, failover) apply to these endpoints the same way they apply to chat completions. 
+ +--- + +## Endpoints + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/embeddings` | Generate vector embeddings for text | +| POST | `/v1/rerank` | Rerank documents by relevance to a query | + +--- + +## Embeddings + +### Basic usage + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +response = client.embeddings.create( + model="text-embedding-3-small", + input="The quick brown fox jumps over the lazy dog", +) + +vector = response.data[0].embedding +print(f"Dimensions: {len(vector)}") +print(f"Cost: {response.prism.cost}") +``` + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +response = client.embeddings.create( + model="text-embedding-3-small", + input="The quick brown fox jumps over the lazy dog", +) + +vector = response.data[0].embedding +print(f"Dimensions: {len(vector)}") +``` + + + + + +```python +import litellm + +response = litellm.embedding( + model="openai/text-embedding-3-small", + input=["The quick brown fox jumps over the lazy dog"], + api_base="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +vector = response.data[0].embedding +print(f"Dimensions: {len(vector)}") +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/embeddings \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "text-embedding-3-small", + "input": "The quick brown fox jumps over the lazy dog" + }' +``` + + + + + +### Batch embeddings + +Pass an array to embed multiple texts in a single request. Each item in the response includes an `index` field matching its position in the input array. 
+ +```python +response = client.embeddings.create( + model="text-embedding-3-small", + input=[ + "First document about machine learning", + "Second document about web development", + "Third document about database design", + ], +) + +for item in response.data: + print(f"Input {item.index}: {len(item.embedding)} dimensions") +``` + +### Reduced dimensions + +Some models support returning shorter vectors. Use the `dimensions` parameter to reduce the output size. Smaller vectors use less storage and are faster to compare, at the cost of some accuracy. + +```python +# Full dimensions (1536 for text-embedding-3-small) +full = client.embeddings.create( + model="text-embedding-3-small", + input="Hello world", +) +print(f"Full: {len(full.data[0].embedding)} dims") + +# Reduced to 512 dimensions +reduced = client.embeddings.create( + model="text-embedding-3-small", + input="Hello world", + dimensions=512, +) +print(f"Reduced: {len(reduced.data[0].embedding)} dims") +``` + + +The `dimensions` parameter is supported by OpenAI's `text-embedding-3-*` models and some Cohere models. Older models like `text-embedding-ada-002` do not support it. + + +### Encoding format + +By default, embeddings are returned as arrays of floats. For lower bandwidth, request `base64` encoding: + +```python +response = client.embeddings.create( + model="text-embedding-3-small", + input="Hello world", + encoding_format="base64", +) +# response.data[0].embedding is a base64 string +``` + +### Response format + +```json +{ + "object": "list", + "data": [ + { + "object": "embedding", + "index": 0, + "embedding": [0.0023, -0.0091, 0.0152, ...] + } + ], + "model": "text-embedding-3-small", + "usage": { + "prompt_tokens": 9, + "total_tokens": 9 + } +} +``` + +--- + +## Reranking + +Reranking takes a query and a list of documents, then returns the documents sorted by relevance. Use it after an initial retrieval step (vector search, BM25) to improve ranking quality before passing results to an LLM. 
+ +### Basic usage + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +documents = [ + "Machine learning is a branch of artificial intelligence.", + "Dogs are popular household pets.", + "Neural networks learn patterns from data.", + "The weather in Paris is mild in spring.", +] + +response = client.rerank.create( + model="rerank-v3.5", + query="What is machine learning?", + documents=documents, +) + +for result in response.results: + print(f"Index: {result.index}, Score: {result.relevance_score:.4f}") + print(f" {documents[result.index]}") +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/rerank \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "rerank-v3.5", + "query": "What is machine learning?", + "documents": [ + "Machine learning is a branch of artificial intelligence.", + "Dogs are popular household pets.", + "Neural networks learn patterns from data.", + "The weather in Paris is mild in spring." + ] + }' +``` + + + + + +### Parameters + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `model` | string | Yes | Reranking model to use | +| `query` | string | Yes | The search query to rank against | +| `documents` | string[] | Yes | List of text documents to rerank | +| `top_n` | integer | No | Return only the top N results. Defaults to all documents. | +| `return_documents` | boolean | No | Include the document text in the response. Default: `false`. 
| + +### Limiting results + +Use `top_n` to return only the most relevant documents: + +```python +response = client.rerank.create( + model="rerank-v3.5", + query="What is machine learning?", + documents=["doc1...", "doc2...", "doc3...", "doc4..."], + top_n=2, # only return the 2 most relevant +) +``` + +### Response format + +```json +{ + "results": [ + { + "index": 0, + "relevance_score": 0.9875, + "document": "Machine learning is a branch of artificial intelligence." + }, + { + "index": 2, + "relevance_score": 0.8432, + "document": "Neural networks learn patterns from data." + } + ], + "model": "rerank-v3.5", + "usage": { + "prompt_tokens": 42, + "total_tokens": 42 + } +} +``` + +The `document` field is only present when `return_documents=true`. + +--- + +## Supported models + +### Embedding models + +| Provider | Models | Dimensions | +|---|---|---| +| OpenAI | `text-embedding-3-small` | 1536 (or custom via `dimensions`) | +| OpenAI | `text-embedding-3-large` | 3072 (or custom via `dimensions`) | +| OpenAI | `text-embedding-ada-002` | 1536 | +| Google | `gemini-embedding-001` | 768 | +| Cohere | `embed-english-v3.0`, `embed-multilingual-v3.0` | 1024 | + +### Reranking models + +| Provider | Models | +|---|---| +| Cohere | `rerank-v3.5`, `rerank-english-v3.0`, `rerank-multilingual-v3.0` | + + +Available models depend on which providers are configured for your organization. Use `GET /v1/models` to see what's available on your key. + + +--- + +## RAG pipeline example + +A typical retrieval-augmented generation pipeline using embeddings for search and reranking for precision: + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +# Step 1: Embed the query +query = "How does photosynthesis work?" 
+query_embedding = client.embeddings.create( + model="text-embedding-3-small", + input=query, +).data[0].embedding + +# Step 2: Search your vector database (pseudo-code) +# candidates = vector_db.search(query_embedding, top_k=20) + +# Step 3: Rerank the candidates for better precision +candidates = [ + "Photosynthesis converts light energy into chemical energy in plants.", + "Plants use chlorophyll to absorb sunlight during photosynthesis.", + "The mitochondria is the powerhouse of the cell.", + "Carbon dioxide and water are inputs to the photosynthesis process.", +] + +reranked = client.rerank.create( + model="rerank-v3.5", + query=query, + documents=candidates, + top_n=3, +) + +# Step 4: Use the top results as context for the LLM +context = "\n".join( + candidates[r.index] for r in reranked.results +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": f"Answer based on this context:\n{context}"}, + {"role": "user", "content": query}, + ], +) + +print(response.choices[0].message.content) +``` + +--- + +## Caching embeddings + +The same input always produces the same vector, so embeddings are a good fit for exact-match caching. 
With caching enabled, repeated inputs return instantly without calling the provider: + +```python +from prism import Prism, GatewayConfig, CacheConfig + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + config=GatewayConfig( + cache=CacheConfig(enabled=True, strategy="exact", ttl=86400), + ), +) + +# First call: cache miss, calls the provider +response = client.embeddings.create( + model="text-embedding-3-small", + input="Hello world", +) +print(response.prism.cache_status) # None or "miss" + +# Second call with same input: cache hit, instant response +response = client.embeddings.create( + model="text-embedding-3-small", + input="Hello world", +) +print(response.prism.cache_status) # "hit_exact" +print(response.prism.cost) # 0 (no provider call) +``` + +--- + +## Next Steps + + + + Primary endpoint for text generation + + + Cache strategies and per-request cache control + + + See which providers are available + + + Full reference for x-prism-* headers + + diff --git a/src/pages/docs/prism/api/endpoints.mdx b/src/pages/docs/prism/api/endpoints.mdx new file mode 100644 index 00000000..8d696bee --- /dev/null +++ b/src/pages/docs/prism/api/endpoints.mdx @@ -0,0 +1,314 @@ +--- +title: "Endpoints overview" +description: "Complete list of all API endpoints available through the Prism Gateway." +--- + +## About + +Prism exposes 97 endpoints across 20+ categories. All inference endpoints live under `/v1/` and follow the OpenAI API format. Admin endpoints live under `/-/` and require an admin token. + +## Base URL + +All endpoints are relative to your Prism gateway URL: + +``` +https://gateway.futureagi.com +``` + +Inference endpoints use the `/v1/` prefix and accept your virtual API key (`sk-prism-...`) as a Bearer token. Admin endpoints use the `/-/` prefix and require the admin token. + +--- + +## Chat and completions + +The primary endpoints for generating text with LLMs. 
+ +| Method | Path | Description | +|---|---|---| +| POST | `/v1/chat/completions` | Chat completion (streaming and non-streaming) | +| POST | `/v1/completions` | Text completion (legacy) | +| POST | `/v1/count_tokens` | Count tokens for a set of messages | + +--- + +## Embeddings, reranking, and search + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/embeddings` | Generate text embeddings | +| POST | `/v1/rerank` | Rerank text passages by relevance | +| POST | `/v1/search` | Search API | +| POST | `/v1/ocr` | Optical character recognition | + +--- + +## Audio + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/audio/speech` | Text-to-speech | +| POST | `/v1/audio/speech/stream` | Streaming text-to-speech | +| POST | `/v1/audio/transcriptions` | Speech-to-text (Whisper) | +| POST | `/v1/audio/translations` | Translate audio to English | + +--- + +## Images and video + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/images/generations` | Generate images from prompts | +| POST | `/v1/videos` | Submit video generation job | +| GET | `/v1/videos` | List video jobs | +| GET | `/v1/videos/{video_id}` | Get video job status | +| DELETE | `/v1/videos/{video_id}` | Cancel video job | + +--- + +## Files + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/files` | Upload a file | +| GET | `/v1/files` | List files | +| GET | `/v1/files/{file_id}` | Get file metadata | +| GET | `/v1/files/{file_id}/content` | Download file content | +| DELETE | `/v1/files/{file_id}` | Delete a file | + +--- + +## Vector stores + +Used with the Assistants API for file-based retrieval. 
+ +| Method | Path | Description | +|---|---|---| +| POST | `/v1/vector_stores` | Create vector store | +| GET | `/v1/vector_stores` | List vector stores | +| GET | `/v1/vector_stores/{id}` | Get vector store | +| POST | `/v1/vector_stores/{id}` | Update vector store | +| DELETE | `/v1/vector_stores/{id}` | Delete vector store | +| POST | `/v1/vector_stores/{id}/search` | Search a vector store | +| POST | `/v1/vector_stores/{id}/files` | Add file to vector store | +| GET | `/v1/vector_stores/{id}/files` | List files in vector store | +| DELETE | `/v1/vector_stores/{id}/files/{file_id}` | Remove file from vector store | +| POST | `/v1/vector_stores/{id}/file_batches` | Batch add files | + +--- + +## Assistants API + +Full proxy for the OpenAI Assistants API. Create assistants, manage threads, send messages, and execute runs. + +### Assistants + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/assistants` | Create assistant | +| GET | `/v1/assistants` | List assistants | +| GET | `/v1/assistants/{id}` | Get assistant | +| POST | `/v1/assistants/{id}` | Update assistant | +| DELETE | `/v1/assistants/{id}` | Delete assistant | + +### Threads + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/threads` | Create thread | +| GET | `/v1/threads/{id}` | Get thread | +| POST | `/v1/threads/{id}` | Update thread | +| DELETE | `/v1/threads/{id}` | Delete thread | + +### Messages + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/threads/{id}/messages` | Add message | +| GET | `/v1/threads/{id}/messages` | List messages | +| GET | `/v1/threads/{id}/messages/{msg_id}` | Get message | +| POST | `/v1/threads/{id}/messages/{msg_id}` | Update message | +| DELETE | `/v1/threads/{id}/messages/{msg_id}` | Delete message | + +### Runs + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/threads/{id}/runs` | Create run | +| GET | `/v1/threads/{id}/runs` | List runs | +| GET | `/v1/threads/{id}/runs/{run_id}` | Get run | +| POST | 
`/v1/threads/{id}/runs/{run_id}` | Update run | +| POST | `/v1/threads/{id}/runs/{run_id}/cancel` | Cancel run | +| POST | `/v1/threads/{id}/runs/{run_id}/submit_tool_outputs` | Submit tool outputs | +| GET | `/v1/threads/{id}/runs/{run_id}/steps` | List run steps | +| GET | `/v1/threads/{id}/runs/{run_id}/steps/{step_id}` | Get run step | +| POST | `/v1/threads/runs` | Create thread and run in one call | + +--- + +## Responses API + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/responses` | Create response | +| GET | `/v1/responses/{id}` | Get response | +| DELETE | `/v1/responses/{id}` | Delete response | + +--- + +## Async inference + +| Method | Path | Description | +|---|---|---| +| GET | `/v1/async/{job_id}` | Get async job status and result | +| DELETE | `/v1/async/{job_id}` | Cancel async job | + +Async jobs are created by sending a regular chat completion request with async mode enabled. The batch API is available via admin endpoints below. + +--- + +## Scheduled completions + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/scheduled` | Schedule a completion for later | +| GET | `/v1/scheduled` | List scheduled jobs | +| GET | `/v1/scheduled/{job_id}` | Get scheduled job | +| DELETE | `/v1/scheduled/{job_id}` | Cancel scheduled job | + +--- + +## Realtime (WebSocket) + +| Method | Path | Description | +|---|---|---| +| GET | `/v1/realtime` | Upgrade to WebSocket for real-time audio/video streaming | + +--- + +## Native format passthrough + +For clients that prefer a provider's native API format instead of the OpenAI format. 
+ +| Method | Path | Description | +|---|---|---| +| POST | `/v1/messages` | Anthropic Messages API (native format) | +| POST | `/v1/messages/count_tokens` | Anthropic token counting | +| POST | `/v1beta/models/{model}:generateContent` | Google GenAI generate content | +| POST | `/v1beta/models/{model}:streamGenerateContent` | Google GenAI streaming | + +--- + +## Models + +| Method | Path | Description | +|---|---|---| +| GET | `/v1/models` | List all available models | +| GET | `/v1/models/{model}` | Get model details | + +--- + +## MCP (Model Context Protocol) + +Prism acts as an MCP server, aggregating tools from upstream MCP tool servers. + +| Method | Path | Description | +|---|---|---| +| POST | `/mcp` | MCP protocol endpoint | +| GET | `/mcp` | MCP SSE streaming endpoint | + +### Management + +| Method | Path | Description | +|---|---|---| +| GET | `/-/mcp/status` | MCP server status and stats | +| GET | `/-/mcp/tools` | List available tools | +| GET | `/-/mcp/resources` | List MCP resources | +| GET | `/-/mcp/prompts` | List MCP prompts | +| POST | `/-/mcp/test` | Test tool execution | + +--- + +## A2A (Agent-to-Agent) + +| Method | Path | Description | +|---|---|---| +| GET | `/.well-known/agent.json` | Agent capabilities card | +| POST | `/a2a` | A2A protocol messages | +| GET | `/v1/agents` | List registered A2A agents | + +--- + +## Admin: key management + +Requires admin token. 
+ +| Method | Path | Description | +|---|---|---| +| POST | `/-/keys` | Create API key | +| GET | `/-/keys` | List keys | +| GET | `/-/keys/{key_id}` | Get key details | +| PUT | `/-/keys/{key_id}` | Update key | +| DELETE | `/-/keys/{key_id}` | Revoke key | +| POST | `/-/keys/{key_id}/credits` | Add credits to key | + +--- + +## Admin: organization config + +| Method | Path | Description | +|---|---|---| +| GET | `/-/orgs/{org_id}/config` | Get org config | +| PUT | `/-/orgs/{org_id}/config` | Set org config | +| DELETE | `/-/orgs/{org_id}/config` | Delete org config | +| GET | `/-/orgs/configs` | List all org configs | +| POST | `/-/orgs/configs/bulk` | Bulk load configs | + +--- + +## Admin: operations + +| Method | Path | Description | +|---|---|---| +| GET | `/-/cluster/nodes` | List cluster nodes | +| POST | `/-/admin/providers/{id}/rotate` | Start key rotation | +| GET | `/-/admin/providers/{id}/rotation` | Get rotation status | +| POST | `/-/admin/providers/{id}/rotate/promote` | Promote rotated key | +| POST | `/-/admin/providers/{id}/rotate/rollback` | Rollback rotation | +| POST | `/-/batches` | Submit batch job | +| GET | `/-/batches/{batch_id}` | Get batch status | +| POST | `/-/batches/{batch_id}/cancel` | Cancel batch | +| GET | `/-/shadow/stats` | Shadow testing statistics | + +--- + +## Health and diagnostics + +| Method | Path | Description | +|---|---|---| +| GET | `/healthz` | Liveness probe | +| GET | `/livez` | Liveness probe (alias) | +| GET | `/readyz` | Readiness probe | +| POST | `/-/reload` | Reload config from file | +| GET | `/-/config` | Server config summary | +| GET | `/-/metrics` | Prometheus metrics | +| GET | `/-/health/providers` | Provider health status | +| GET | `/-/health/providers/{org_id}` | Org-specific provider health | + +--- + +## Next Steps + + + + Understand the request pipeline + + + Make your first request in 5 minutes + + + See all LLM providers and how to add them + + + Configure load balancing and failover + + 
diff --git a/src/pages/docs/prism/api/files.mdx b/src/pages/docs/prism/api/files.mdx new file mode 100644 index 00000000..df5e4cf6 --- /dev/null +++ b/src/pages/docs/prism/api/files.mdx @@ -0,0 +1,240 @@ +--- +title: "Files & vector stores" +description: "Upload files and manage vector stores for use with the Assistants API through Prism." +--- + +## About + +Prism proxies the OpenAI Files and Vector Stores APIs. Upload documents for assistant file search, fine-tuning data, or batch processing. Vector stores index uploaded files for semantic retrieval during assistant runs. + +Like the Assistants API, files and vector stores are stored on OpenAI's servers. Use the OpenAI SDK pointed at Prism. + +--- + +## Files + +### Endpoints + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/files` | Upload a file | +| GET | `/v1/files` | List files | +| GET | `/v1/files/{file_id}` | Get file metadata | +| GET | `/v1/files/{file_id}/content` | Download file content | +| DELETE | `/v1/files/{file_id}` | Delete a file | + +### Upload a file + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +# Upload for use with Assistants +file = client.files.create( + file=open("report.pdf", "rb"), + purpose="assistants", +) +print(f"File ID: {file.id}") +print(f"Size: {file.bytes} bytes") +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/files \ + -H "Authorization: Bearer sk-prism-your-key" \ + -F file=@report.pdf \ + -F purpose=assistants +``` + + + + + +### Purpose values + +| Purpose | Use case | +|---|---| +| `assistants` | Files for assistant file search and code interpreter | +| `fine-tune` | Training data for fine-tuning | +| `batch` | Input files for batch API calls | + +### List and manage files + +```python +# List all files +files = client.files.list() +for f in files.data: + print(f"{f.id}: {f.filename} ({f.bytes} bytes, purpose={f.purpose})") + +# 
Get file metadata +file = client.files.retrieve("file-abc123") + +# Download file content +content = client.files.content("file-abc123") +with open("downloaded.pdf", "wb") as f: + f.write(content.read()) + +# Delete a file +client.files.delete("file-abc123") +``` + +--- + +## Vector stores + +Vector stores index uploaded files for semantic search. They're used with the Assistants API `file_search` tool. + +### Endpoints + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/vector_stores` | Create vector store | +| GET | `/v1/vector_stores` | List vector stores | +| GET | `/v1/vector_stores/{id}` | Get vector store | +| POST | `/v1/vector_stores/{id}` | Update vector store | +| DELETE | `/v1/vector_stores/{id}` | Delete vector store | +| POST | `/v1/vector_stores/{id}/search` | Search a vector store | +| POST | `/v1/vector_stores/{id}/files` | Add file to vector store | +| GET | `/v1/vector_stores/{id}/files` | List files in vector store | +| DELETE | `/v1/vector_stores/{id}/files/{file_id}` | Remove file from vector store | +| POST | `/v1/vector_stores/{id}/file_batches` | Batch add files | + +### Create a vector store and add files + +```python +# Create a vector store +vector_store = client.beta.vector_stores.create( + name="Product Documentation", +) +print(f"Vector store: {vector_store.id}") + +# Upload and add a file +file = client.files.create( + file=open("docs.pdf", "rb"), + purpose="assistants", +) + +client.beta.vector_stores.files.create( + vector_store_id=vector_store.id, + file_id=file.id, +) +``` + +### Batch upload + +Add multiple files at once: + +```python +# Upload several files +file_ids = [] +for path in ["chapter1.pdf", "chapter2.pdf", "chapter3.pdf"]: + f = client.files.create(file=open(path, "rb"), purpose="assistants") + file_ids.append(f.id) + +# Batch add to vector store +batch = client.beta.vector_stores.file_batches.create( + vector_store_id=vector_store.id, + file_ids=file_ids, +) +print(f"Batch status: {batch.status}") +``` 
+ +### Search a vector store + +Search indexed files directly (outside of an assistant run): + +```python +results = client.beta.vector_stores.search( + vector_store_id=vector_store.id, + query="return policy", +) + +for result in results.data: + print(f"Score: {result.score:.4f}") + print(f"Content: {result.content[0].text[:200]}") + print() +``` + +### Use with an assistant + +Attach a vector store to an assistant for automatic file search during runs: + +```python +assistant = client.beta.assistants.create( + name="Support Agent", + instructions="Answer questions using the product documentation.", + model="gpt-4o", + tools=[{"type": "file_search"}], + tool_resources={ + "file_search": { + "vector_store_ids": [vector_store.id], + } + }, +) +``` + +See [Assistants API](/docs/prism/api/assistants) for the full assistant workflow. + +### Manage vector stores + +```python +# List vector stores +stores = client.beta.vector_stores.list() +for vs in stores.data: + print(f"{vs.id}: {vs.name} ({vs.file_counts.completed} files)") + +# List files in a vector store +files = client.beta.vector_stores.files.list(vector_store_id=vector_store.id) + +# Remove a file from a vector store +client.beta.vector_stores.files.delete( + vector_store_id=vector_store.id, + file_id="file-abc123", +) + +# Delete a vector store +client.beta.vector_stores.delete(vector_store.id) +``` + +--- + +## Supported file types + +| Category | Formats | +|---|---| +| Documents | `.pdf`, `.docx`, `.txt`, `.md`, `.html` | +| Code | `.py`, `.js`, `.ts`, `.java`, `.c`, `.cpp`, `.rb`, `.go`, `.rs` | +| Data | `.csv`, `.json`, `.jsonl` | +| Presentations | `.pptx` | + +Max file size: 512 MB. Max files per vector store: 10,000. 
+ +--- + +## Next Steps + + + + Use files with assistants for retrieval and code execution + + + Full list of all gateway endpoints + + + Monitor storage and retrieval costs + + + Full reference for x-prism-* headers + + diff --git a/src/pages/docs/prism/api/headers.mdx b/src/pages/docs/prism/api/headers.mdx new file mode 100644 index 00000000..45f0475a --- /dev/null +++ b/src/pages/docs/prism/api/headers.mdx @@ -0,0 +1,291 @@ +--- +title: "Request & response headers" +description: "Reference for all x-prism-* request headers and response headers returned by the Prism AI Gateway." +--- + +## About + +Prism reads `x-prism-*` request headers to control per-request behavior (caching, sessions, routing) and writes `x-prism-*` response headers to report what happened (which provider, latency, cost, cache status). + +The Prism SDK handles these automatically. If you're using the OpenAI SDK or cURL, set them manually or use `create_headers()` to generate them. + +--- + +## Request headers + +### Tracking and correlation + +| Header | Value | Description | +|---|---|---| +| `x-prism-trace-id` | string | Custom trace ID for distributed tracing. If omitted, the gateway generates one. | +| `x-prism-session-id` | string | Group related requests into a logical session for analytics. | +| `x-prism-session-name` | string | Human-readable label for the session (used alongside `session-id`). | +| `x-prism-session-path` | string | Hierarchical path within a session, e.g. `/search/rerank`. | +| `x-prism-request-id` | string | Client-generated request ID for idempotency and log correlation. | +| `x-prism-user-id` | string | User identifier for per-user tracking, budgets, and analytics. | + +### Metadata and properties + +| Header | Value | Description | +|---|---|---| +| `x-prism-metadata` | JSON string | Arbitrary key-value pairs for cost attribution and filtering. Example: `{"team":"ml","env":"prod"}` | +| `x-prism-property-{key}` | string | Individual key-value properties. 
`x-prism-property-env: prod` is equivalent to including `"env":"prod"` in metadata. | + +### Cache control + +| Header | Value | Description | +|---|---|---| +| `x-prism-cache-ttl` | integer (seconds) | Override the cache TTL for this request. | +| `x-prism-cache-namespace` | string | Route to a specific cache namespace for isolation (e.g. `prod`, `staging`). | +| `x-prism-cache-force-refresh` | `true` | Bypass cache, fetch a fresh response from the provider, and update the cache with the new result. | +| `Cache-Control` | `no-store` | Disable caching entirely for this request. The response is not read from or written to cache. | + +### Routing control + +| Header | Value | Description | +|---|---|---| +| `x-prism-provider-lock` | string | Force this request to a specific provider, bypassing the routing strategy. Example: `openai`. | +| `x-prism-complexity-override` | string | Override complexity-based routing tier. Pass the tier name (e.g. `simple`, `moderate`, `complex`). | + +### Guardrails + +| Header | Value | Description | +|---|---|---| +| `x-prism-guardrail-policy` | string | Comma-separated list of guardrail policy IDs to apply to this request. Overrides org-level guardrail config. | + +### Gateway config (full override) + +| Header | Value | Description | +|---|---|---| +| `x-prism-config` | JSON string | Full `GatewayConfig` serialized as JSON. Overrides all per-request settings (cache, retry, fallback, guardrails, routing, timeouts). The Prism SDK's `GatewayConfig.to_headers()` generates this automatically. | +| `x-prism-request-timeout` | integer (ms) | Total request timeout in milliseconds. Also set automatically when using `TimeoutConfig.total` in the SDK. The gateway echoes the applied timeout back as `x-prism-timeout-ms` in the response. | + +--- + +## Response headers + +### Always present + +| Header | Example | Description | +|---|---|---| +| `x-prism-request-id` | `req-a1b2c3` | Unique identifier for this request. 
Use this when filing support tickets or searching logs. | +| `x-prism-trace-id` | `trace-x7y8z9` | Trace ID for distributed tracing. Matches the request header if one was sent. | +| `x-prism-provider` | `openai` | Which provider served this request. | +| `x-prism-model-used` | `gpt-4o-2024-08-06` | Actual model returned by the provider. May differ from the requested model if routing redirected the request. | +| `x-prism-latency-ms` | `342` | Total gateway latency in milliseconds, including the provider call. | +| `x-prism-timeout-ms` | `30000` | Timeout that was applied to this request. | + +### Conditional + +| Header | Present when | Value | +|---|---|---| +| `x-prism-cost` | Model has pricing data | Estimated cost in USD (e.g. `0.00234`). Returns `0` on exact cache hits. | +| `x-prism-cache` | Caching is enabled | `hit`, `hit_exact`, `hit_semantic`, `miss`, or `skip` | +| `x-prism-guardrail-triggered` | A guardrail fired | `true` | +| `x-prism-fallback-used` | A provider fallback occurred | `true` | +| `x-prism-routing-strategy` | A routing policy is active | Strategy name: `round-robin`, `weighted`, `least-latency`, `cost-optimized`, `adaptive`, `fastest` | +| `x-prism-credits-remaining` | Managed key with credit balance | Remaining USD balance (e.g. `12.50`) | + +### Rate limit headers + +Present when rate limiting is enabled for the key or org. 
+ +| Header | Description | +|---|---| +| `x-ratelimit-limit-requests` | Maximum requests allowed per minute | +| `x-ratelimit-remaining-requests` | Requests remaining in the current window | +| `x-ratelimit-reset-requests` | Unix timestamp when the window resets | + +--- + +## Reading headers + +### Prism SDK + +Every response from the Prism SDK has a `.prism` attribute with typed access to all gateway metadata: + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], +) + +print(response.choices[0].message.content) +print(response.prism.provider) # openai +print(response.prism.latency_ms) # 342 +print(response.prism.cost) # 0.00015 +print(response.prism.cache_status) # miss +print(response.prism.model_used) # gpt-4o-2024-08-06 +print(response.prism.request_id) # req-a1b2c3 +print(response.prism.trace_id) # trace-x7y8z9 +print(response.prism.guardrail_triggered) # False +print(response.prism.fallback_used) # False +print(response.prism.routing_strategy) # None (or "weighted", etc.) + +# Rate limit info (when enabled) +if response.prism.ratelimit: + print(response.prism.ratelimit.limit) + print(response.prism.ratelimit.remaining) + print(response.prism.ratelimit.reset) +``` + +### OpenAI SDK + +The OpenAI SDK doesn't have `response.prism`. 
Use `with_raw_response` to read headers: + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +raw = client.chat.completions.with_raw_response.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], +) + +print(raw.headers.get("x-prism-provider")) +print(raw.headers.get("x-prism-cost")) + +response = raw.parse() +``` + +### cURL + +Use the `-i` flag to include response headers in the output: + +```bash +curl -i -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Hello"}] + }' +``` + +--- + +## Setting request headers + +### Prism SDK + +The SDK accepts tracking parameters directly on each `create()` call: + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + session_id="sess-abc", + trace_id="trace-123", + user_id="user-42", + request_metadata={"team": "ml", "feature": "search"}, + properties={"env": "prod"}, +) +``` + +For gateway config, pass a `GatewayConfig` to the client constructor (applies to all requests) or override per-request with `extra_headers`: + +```python +from prism import Prism, GatewayConfig, CacheConfig, RetryConfig + +# Client-level config (applies to all requests) +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + config=GatewayConfig( + cache=CacheConfig(ttl=300, namespace="prod"), + retry=RetryConfig(max_retries=3), + ), +) + +# Per-request override +override = GatewayConfig(cache=CacheConfig(force_refresh=True)) +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + 
extra_headers=override.to_headers(), +) +``` + +### OpenAI SDK with create_headers() + +Use `create_headers()` to generate all `x-prism-*` headers for the OpenAI SDK: + +```python +from openai import OpenAI +from prism import create_headers, GatewayConfig, CacheConfig + +headers = create_headers( + config=GatewayConfig(cache=CacheConfig(strategy="semantic", ttl=600)), + trace_id="trace-abc", + session_id="sess-123", + user_id="user-42", + metadata={"team": "ml", "env": "production"}, +) + +client = OpenAI( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com/v1", + default_headers=headers, +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], +) +``` + +### cURL + +Pass headers with `-H`: + +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "x-prism-session-id: sess-abc" \ + -H "x-prism-trace-id: trace-123" \ + -H "x-prism-user-id: user-42" \ + -H "x-prism-metadata: {\"team\":\"ml\",\"env\":\"prod\"}" \ + -H "x-prism-cache-ttl: 300" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Hello"}] + }' +``` + +--- + +## Next Steps + + + + Primary API endpoint with streaming and function calling + + + Configure cache strategies and per-request cache control + + + Full GatewayConfig reference and override hierarchy + + + Use metadata headers for cost attribution by team and feature + + diff --git a/src/pages/docs/prism/api/media.mdx b/src/pages/docs/prism/api/media.mdx new file mode 100644 index 00000000..9cf72885 --- /dev/null +++ b/src/pages/docs/prism/api/media.mdx @@ -0,0 +1,439 @@ +--- +title: "Media endpoints" +description: "Text-to-speech, speech-to-text, audio translation, and image generation through the Prism Gateway." +--- + +## About + +Prism proxies audio and image requests to any configured provider. The API follows the OpenAI format. 
All gateway features (caching, rate limiting, cost tracking, failover) apply to these endpoints. + +--- + +## Endpoints + +| Method | Path | Description | +|---|---|---| +| POST | `/v1/audio/speech` | Text-to-speech | +| POST | `/v1/audio/speech/stream` | Streaming text-to-speech | +| POST | `/v1/audio/transcriptions` | Speech-to-text | +| POST | `/v1/audio/translations` | Translate audio to English | +| POST | `/v1/images/generations` | Generate images from prompts | + +--- + +## Text-to-speech + +Convert text to spoken audio. The response is raw audio bytes in the requested format. + +### Basic usage + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +audio_bytes = client.audio.speech.create( + model="tts-1", + voice="alloy", + input="Hello! This is a test of text-to-speech through Prism.", +) + +with open("output.mp3", "wb") as f: + f.write(audio_bytes) +``` + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +response = client.audio.speech.create( + model="tts-1", + voice="alloy", + input="Hello! This is a test of text-to-speech through Prism.", +) + +response.stream_to_file("output.mp3") +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/audio/speech \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "tts-1", + "voice": "alloy", + "input": "Hello! This is a test of text-to-speech through Prism." 
+ }' \ + --output output.mp3 +``` + + + + + +### Parameters + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `model` | string | Yes | - | TTS model (`tts-1`, `tts-1-hd`, `gpt-4o-mini-tts`) | +| `input` | string | Yes | - | Text to convert to speech (max 4096 characters) | +| `voice` | string | Yes | - | Voice to use (`alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`) | +| `response_format` | string | No | `mp3` | Output format: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` | +| `speed` | float | No | `1.0` | Speed multiplier (0.25 to 4.0) | + +### HD quality + +Use `tts-1-hd` for higher quality audio at the cost of higher latency: + +```python +audio_bytes = client.audio.speech.create( + model="tts-1-hd", + voice="nova", + input="High quality audio output.", + response_format="flac", +) +``` + +--- + +## Speech-to-text (transcription) + +Transcribe audio files to text. Supports mp3, mp4, mpeg, mpga, m4a, wav, and webm formats. + +### Basic usage + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +with open("recording.mp3", "rb") as f: + transcription = client.audio.transcriptions.create( + model="whisper-1", + file=f, + ) + +print(transcription.text) +``` + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +with open("recording.mp3", "rb") as f: + transcription = client.audio.transcriptions.create( + model="whisper-1", + file=f, + ) + +print(transcription.text) +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/audio/transcriptions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -F file=@recording.mp3 \ + -F model=whisper-1 +``` + + + + + +### Parameters + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `file` | file | Yes | Audio file to transcribe | +| `model` | string | Yes | Transcription 
model (`whisper-1`, `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`) | +| `language` | string | No | ISO-639-1 language code (e.g. `en`, `fr`, `de`). Improves accuracy if you know the language. | +| `prompt` | string | No | Hint text to guide the model's style or continue a previous segment | +| `response_format` | string | No | Output format: `json`, `text`, `srt`, `verbose_json`, `vtt` | +| `temperature` | float | No | Sampling temperature (0 to 1). Lower values are more deterministic. | +| `timestamp_granularities` | string[] | No | `word` and/or `segment` level timestamps (requires `verbose_json` format) | + +### Timestamps + +Get word-level or segment-level timestamps with `verbose_json`: + +```python +with open("recording.mp3", "rb") as f: + transcription = client.audio.transcriptions.create( + model="whisper-1", + file=f, + response_format="verbose_json", + timestamp_granularities=["word", "segment"], + ) + +for word in transcription.words: + print(f"[{word.start:.2f}s - {word.end:.2f}s] {word.word}") +``` + +--- + +## Audio translation + +Translate audio from any supported language to English text. Same API as transcription but always outputs English. 
+ + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +with open("french_audio.mp3", "rb") as f: + translation = client.audio.translations.create( + model="whisper-1", + file=f, + ) + +print(translation.text) # English translation +``` + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +with open("french_audio.mp3", "rb") as f: + translation = client.audio.translations.create( + model="whisper-1", + file=f, + ) + +print(translation.text) +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/audio/translations \ + -H "Authorization: Bearer sk-prism-your-key" \ + -F file=@french_audio.mp3 \ + -F model=whisper-1 +``` + + + + + +--- + +## Image generation + +Generate images from text prompts. + +### Basic usage + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +response = client.images.generate( + model="dall-e-3", + prompt="A serene mountain lake at dawn, photorealistic", + n=1, + size="1024x1024", +) + +print(response.data[0].url) +``` + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +response = client.images.generate( + model="dall-e-3", + prompt="A serene mountain lake at dawn, photorealistic", + n=1, + size="1024x1024", +) + +print(response.data[0].url) +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/images/generations \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "dall-e-3", + "prompt": "A serene mountain lake at dawn, photorealistic", + "n": 1, + "size": "1024x1024" + }' +``` + + + + + +### Parameters + +| Parameter | Type | Required | Default | Description | 
+|---|---|---|---|---| +| `prompt` | string | Yes | - | Text description of the image to generate | +| `model` | string | No | `dall-e-3` | Image model (`dall-e-2`, `dall-e-3`, `gpt-image-1`) | +| `n` | integer | No | `1` | Number of images to generate (1 for DALL-E 3, 1-10 for DALL-E 2) | +| `size` | string | No | `1024x1024` | Image size. DALL-E 3: `1024x1024`, `1792x1024`, `1024x1792`. DALL-E 2: `256x256`, `512x512`, `1024x1024`. | +| `quality` | string | No | `standard` | `standard` or `hd` (DALL-E 3 and `gpt-image-1`) | +| `style` | string | No | `vivid` | `vivid` or `natural` (DALL-E 3 only) | +| `response_format` | string | No | `url` | `url` (temporary link) or `b64_json` (base64-encoded image data) | + +### Get base64 data instead of URL + +URLs expire after 1 hour. For persistent storage, request base64 data: + +```python +response = client.images.generate( + model="dall-e-3", + prompt="A watercolor painting of a cat", + response_format="b64_json", +) + +import base64 + +image_data = base64.b64decode(response.data[0].b64_json) +with open("cat.png", "wb") as f: + f.write(image_data) +``` + +### Response format + +```json +{ + "created": 1700000000, + "data": [ + { + "url": "https://oaidalleapiprodscus.blob.core.windows.net/...", + "revised_prompt": "A serene mountain lake at dawn..." + } + ] +} +``` + +DALL-E 3 returns a `revised_prompt` field showing the expanded prompt the model actually used. 
+ +--- + +## Supported models + +### Text-to-speech + +| Provider | Models | Notes | +|---|---|---| +| OpenAI | `tts-1`, `tts-1-hd` | 6 voices, mp3/opus/aac/flac/wav/pcm | +| OpenAI | `gpt-4o-mini-tts` | Newer model, same voice options | + +### Speech-to-text + +| Provider | Models | Notes | +|---|---|---| +| OpenAI | `whisper-1` | 57 languages, timestamps, translation | +| OpenAI | `gpt-4o-transcribe` | Newer model with improved accuracy | +| OpenAI | `gpt-4o-mini-transcribe` | Smaller, faster transcription model | + +### Image generation + +| Provider | Models | Notes | +|---|---|---| +| OpenAI | `dall-e-3` | 1024x1024, 1792x1024, 1024x1792 | +| OpenAI | `dall-e-2` | 256x256, 512x512, 1024x1024 | +| OpenAI | `gpt-image-1` | Latest model. Returns `b64_json` only (no URL). | + + +Available models depend on which providers are configured for your organization. Use `GET /v1/models` to see what's available on your key. + + +--- + +## Next Steps + + + + Text generation with streaming and function calling + + + Vector embeddings and document reranking + + + Cache responses to reduce cost and latency + + + Full reference for x-prism-* headers + + diff --git a/src/pages/docs/prism/concepts/api-reference.mdx b/src/pages/docs/prism/concepts/api-reference.mdx index 2486e044..853eacb5 100644 --- a/src/pages/docs/prism/concepts/api-reference.mdx +++ b/src/pages/docs/prism/concepts/api-reference.mdx @@ -357,7 +357,7 @@ curl -X POST https://gateway.futureagi.com/v1/audio/transcriptions \ --- -## Next steps +## Next Steps diff --git a/src/pages/docs/prism/concepts/configuration.mdx b/src/pages/docs/prism/concepts/configuration.mdx index a9327b02..8fdd8df1 100644 --- a/src/pages/docs/prism/concepts/configuration.mdx +++ b/src/pages/docs/prism/concepts/configuration.mdx @@ -1,97 +1,204 @@ --- title: "Configuration" -description: "How organization configuration works in Prism: sections, hierarchy, and real-time updates." 
+description: "How Prism configuration works: hierarchy, sections, SDK config objects, and model mapping." --- ## About -Prism is configured at the organization level. Each organization has its own set of providers, guardrails, routing rules, rate limits, and budgets. Configuration changes are pushed to the gateway in real time with no restart required. +Prism is configured at the organization level. Each organization has its own providers, guardrails, routing rules, rate limits, and budgets. Changes take effect in real time with no gateway restart required. ---- - -## Configuration Hierarchy - -When a setting is specified in multiple places, Prism applies the most specific one: +Configuration can be set in four places. When the same setting exists in multiple places, the most specific one wins: ``` -Request Headers > API Key Config > Organization Config > Global Config +Request headers > Virtual key config > Organization config > Global defaults ``` -For example, a cache TTL set via the `x-prism-cache-ttl` request header overrides the TTL set in the organization config. +- **Request headers**: Per-request overrides sent via `x-prism-*` headers or `GatewayConfig.to_headers()`. See [headers reference](/docs/prism/api/headers). +- **Virtual key config**: Settings attached to a specific [virtual key](/docs/prism/concepts/virtual-keys) (e.g. rate limits, allowed models, guardrails). +- **Organization config**: Org-level settings configured via the dashboard or admin API. +- **Global defaults**: Gateway-wide defaults. For self-hosted deployments, these come from `config.yaml`. For the cloud gateway, these are platform defaults. + +For example, if the org sets cache TTL to 60 seconds but a request sends `x-prism-cache-ttl: 300`, that request uses a 300-second TTL. 
--- -## Configuration Sections - -| Section | What it controls | -| --- | --- | -| `providers` | Which LLM services are available and their credentials | -| `guardrails` | Safety checks applied to requests and responses | -| `routing` | How requests are distributed across providers (strategy, failover, retries) | -| `cache` | Caching mode, TTL, and namespace settings | -| `rate_limiting` | Maximum request rate per API key or organization | -| `budgets` | Spending limits per period and alert thresholds | -| `cost_tracking` | Cost calculation and attribution settings | -| `ip_acl` | IP Access Control List. Which source IP addresses are permitted | -| `alerting` | Email or webhook alerts for budget events, errors, and guardrail triggers | -| `privacy` | Data retention periods and request logging policies | -| `tool_policy` | Which tool and function calls are permitted | -| `mcp` | Model Context Protocol integration settings | -| `model_map` | Custom model name aliases. Map a friendly name like "my-gpt" to an actual model | -| `audit` | Audit log configuration and retention settings | +## Configuration sections + +| Section | What it controls | Feature page | +|---|---|---| +| `providers` | Which LLM services are available and their credentials | [Supported providers](/docs/prism/features/providers) | +| `routing` | How requests are distributed across providers | [Routing](/docs/prism/features/routing) | +| `cache` | Caching mode, TTL, and namespace settings | [Caching](/docs/prism/features/caching) | +| `rate_limiting` | Maximum request rate per key or organization | [Rate limiting](/docs/prism/features/rate-limiting) | +| `budgets` | Spending limits per period and alert thresholds | [Rate limiting & budgets](/docs/prism/features/rate-limiting) | +| `guardrails` | Safety checks on requests and responses | [Guardrails](/docs/prism/features/guardrails) | +| `cost_tracking` | Cost calculation and attribution settings | [Cost tracking](/docs/prism/features/cost-tracking) 
| +| `tool_policy` | Which tool and function calls are permitted | [Virtual keys](/docs/prism/concepts/virtual-keys) | +| `ip_acl` | Which source IP addresses are allowed | [Virtual keys](/docs/prism/concepts/virtual-keys) | +| `model_map` | Custom model name aliases (see [below](#model-mapping)) | - | +| `alerting` | Email or webhook alerts for budget events and errors | Coming soon | +| `privacy` | Data retention periods and request logging policies | Coming soon | +| `mcp` | Model Context Protocol integration settings | Coming soon | +| `audit` | Audit log configuration and retention | Coming soon | + +Each section has its own page with full configuration options. The rest of this page covers the config hierarchy and how to set config from code. --- -## Example Configuration +## Example configuration -A minimal organization configuration that sets up two providers with weighted routing, caching, and a monthly budget: +A minimal organization configuration with two providers, weighted routing, caching, and a monthly budget: -```json -{ - "providers": { - "openai": { - "api_key": "sk-...", - "models": ["gpt-4o", "gpt-4o-mini"] - }, - "anthropic": { - "api_key": "sk-ant-...", - "models": ["claude-sonnet-4-6", "claude-haiku-4-5"] - } - }, - "routing": { - "strategy": "weighted", - "weights": { "openai": 70, "anthropic": 30 }, - "failover": { - "enabled": true, - "providers": ["openai", "anthropic"] + + + + +Go to **Prism > Settings** in the Future AGI dashboard. Each section (providers, routing, caching, etc.) has its own tab. Changes save immediately and push to the gateway in real time. 
+ + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + control_plane_url="https://api.futureagi.com", +) + +client.org_configs.create( + org_id="your-org-id", + config={ + "providers": { + "openai": { + "api_key": "sk-...", + "models": ["gpt-4o", "gpt-4o-mini"], + }, + "anthropic": { + "api_key": "sk-ant-...", + "models": ["claude-sonnet-4-6", "claude-haiku-4-5"], + }, + }, + "routing": { + "strategy": "weighted", + "weights": {"openai": 70, "anthropic": 30}, + "failover": { + "enabled": True, + "providers": ["openai", "anthropic"], + }, + }, + "cache": { + "enabled": True, + "mode": "exact", + "ttl_seconds": 3600, + }, + "budgets": { + "limit": 500.00, + "period": "monthly", + "alert_threshold_percent": 80, + }, } - }, - "cache": { - "enabled": true, - "mode": "exact", - "ttl_seconds": 3600 - }, - "budgets": { - "limit": 500.00, - "period": "monthly", - "alert_threshold_percent": 80 - } -} +) +``` + + + + + +```typescript +import { Prism } from "@futureagi/prism"; + +const client = new Prism({ + apiKey: "sk-prism-your-key", + baseUrl: "https://gateway.futureagi.com", + controlPlaneUrl: "https://api.futureagi.com", +}); + +await client.orgConfigs.create({ + orgId: "your-org-id", + config: { + providers: { + openai: { + api_key: "sk-...", + models: ["gpt-4o", "gpt-4o-mini"], + }, + anthropic: { + api_key: "sk-ant-...", + models: ["claude-sonnet-4-6", "claude-haiku-4-5"], + }, + }, + routing: { + strategy: "weighted", + weights: { openai: 70, anthropic: 30 }, + failover: { + enabled: true, + providers: ["openai", "anthropic"], + }, + }, + cache: { + enabled: true, + mode: "exact", + ttl_seconds: 3600, + }, + budgets: { + limit: 500.0, + period: "monthly", + alert_threshold_percent: 80, + }, + }, +}); +``` + + + + + +**Self-hosted config.yaml:** + +```yaml +providers: + openai: + api_key: "${OPENAI_API_KEY}" + models: ["gpt-4o", "gpt-4o-mini"] + anthropic: + api_key: 
"${ANTHROPIC_API_KEY}" + models: ["claude-sonnet-4-6", "claude-haiku-4-5"] + +routing: + strategy: weighted + weights: + openai: 70 + anthropic: 30 + failover: + enabled: true + providers: ["openai", "anthropic"] + +cache: + enabled: true + mode: exact + ttl_seconds: 3600 + +budgets: + limit: 500.00 + period: monthly + alert_threshold_percent: 80 ``` -Changes to organization configuration are pushed to the gateway in real time. No restart or redeployment needed. +Changes to organization configuration push to the gateway in real time. No restart or redeployment needed. Self-hosted deployments watch the config file for changes. --- ## SDK configuration -The Prism SDK lets you apply configuration at two levels: **client-level** (affects all requests) and **per-request** (overrides for a single call). +The Prism SDK lets you set config at two levels: **client-level** (applies to every request) and **per-request** (overrides for a single call). ### Client-level config -Pass a `GatewayConfig` to the client constructor. It applies to every request made with that client: +Pass a `GatewayConfig` to the client constructor: ```python Python @@ -109,7 +216,7 @@ client = Prism( ), ) -# All requests through this client use the cache, retry, and fallback settings +# All requests through this client use these settings response = client.chat.completions.create( model="gpt-4o", messages=[{"role": "user", "content": "Hello"}], @@ -124,7 +231,7 @@ const client = new Prism({ baseUrl: "https://gateway.futureagi.com", config: { cache: { strategy: "exact", ttl: 300, namespace: "prod" }, - retry: { max_retries: 3, on_status_codes: [429, 500, 502, 503] }, + retry: { maxRetries: 3, onStatusCodes: [429, 500, 502, 503] }, fallback: { targets: [{ model: "gpt-4o-mini" }], }, @@ -140,62 +247,162 @@ const response = await client.chat.completions.create({ ### Per-request overrides -Override config for a single request using `extra_headers`. 
The `GatewayConfig.to_headers()` method serialises the config to `x-prism-config`: +Override config for a single request using `GatewayConfig.to_headers()`: ```python from prism import GatewayConfig, CacheConfig -# Force a cache refresh for this specific request override = GatewayConfig(cache=CacheConfig(force_refresh=True)) -headers = override.to_headers() response = client.chat.completions.create( model="gpt-4o", messages=[{"role": "user", "content": "What time is it?"}], - extra_headers=headers, + extra_headers=override.to_headers(), +) +``` + +You can also set individual headers directly: + +```python +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + extra_headers={ + "x-prism-cache-force-refresh": "true", + "x-prism-cache-namespace": "staging", + }, ) ``` -### Using with the OpenAI SDK +### Using with other clients -If you're not using the Prism SDK, use `create_headers()` to generate `x-prism-*` headers for any OpenAI-compatible client: +If you're not using the Prism SDK, use `create_headers()` to generate `x-prism-*` headers for any OpenAI-compatible client (OpenAI SDK, LiteLLM, LangChain, cURL, etc.): ```python from openai import OpenAI from prism import create_headers, GatewayConfig, CacheConfig headers = create_headers( - api_key="sk-prism-your-key", config=GatewayConfig(cache=CacheConfig(strategy="semantic", ttl=600)), trace_id="trace-abc", metadata={"team": "ml", "env": "production"}, ) client = OpenAI( + api_key="sk-prism-your-key", base_url="https://gateway.futureagi.com/v1", default_headers=headers, ) ``` -### Override precedence +See [Request & response headers](/docs/prism/api/headers) for the full list of `x-prism-*` headers. + +--- + +## Model mapping + +Model mapping creates aliases for model names. Send `my-fast-model` in API requests and the gateway resolves it to `gpt-4o-mini` (or whatever you mapped it to). Swap the underlying model any time without touching application code. 
+ + + + + +Go to **Prism > Settings > Model Mapping** and add alias-to-model pairs. + + + + + +```python +client.org_configs.update( + org_id="your-org-id", + config={ + "model_map": { + "my-fast-model": "gpt-4o-mini", + "my-smart-model": "claude-sonnet-4-6", + "my-cheap-model": "gemini-2.0-flash", + } + } +) +``` + + + + + +```typescript +await client.orgConfigs.update({ + orgId: "your-org-id", + config: { + model_map: { + "my-fast-model": "gpt-4o-mini", + "my-smart-model": "claude-sonnet-4-6", + "my-cheap-model": "gemini-2.0-flash", + }, + }, +}); +``` + + + + + +**Self-hosted config.yaml:** + +```yaml +model_map: + my-fast-model: gpt-4o-mini + my-smart-model: claude-sonnet-4-6 + my-cheap-model: gemini-2.0-flash +``` + +Then use the alias in requests: + +```python +response = client.chat.completions.create( + model="my-fast-model", # resolves to gpt-4o-mini + messages=[{"role": "user", "content": "Hello"}], +) +``` + + +If you send a model name that doesn't match any configured provider or model map entry, the gateway returns a 404 with the message: `model "X" not found in any configured provider. 
Configure model_map or use 'provider/model' format.` + + +--- + +## GatewayConfig reference + +The `GatewayConfig` dataclass groups all per-request config overrides: + +| Field | Type | Description | +|---|---|---| +| `cache` | `CacheConfig` | Cache strategy, TTL, namespace, force refresh | +| `retry` | `RetryConfig` | Max retries, backoff settings, status codes | +| `fallback` | `FallbackConfig` | Fallback model targets and trigger conditions | +| `load_balance` | `LoadBalanceConfig` | Load balancing strategy and targets | +| `guardrails` | `GuardrailConfig` | Input/output guardrail policies and settings | +| `routing` | `ConditionalRoutingConfig` | Conditional routing rules | +| `mirror` | `TrafficMirrorConfig` | Shadow traffic configuration | +| `timeout` | `TimeoutConfig` | Connect, read, write, and total timeouts | -Per-request headers override client-level config, which overrides org config. See [Configuration Hierarchy](#configuration-hierarchy) above. +`GatewayConfig.to_headers()` serializes the entire config to `x-prism-config` as a JSON header, plus individual backward-compatible headers for cache, guardrail, and timeout settings. --- ## Next Steps - - Understand the key building blocks of Prism + + Full reference for all x-prism-* headers - - Explore the full Prism API reference + + Key types, RBAC, and access control - - Add and configure LLM providers + + Routing strategies and failover configuration - - Apply safety checks to requests and responses + + Plugin pipeline and request lifecycle diff --git a/src/pages/docs/prism/concepts/core.mdx b/src/pages/docs/prism/concepts/core.mdx index 09586f3a..c67ca176 100644 --- a/src/pages/docs/prism/concepts/core.mdx +++ b/src/pages/docs/prism/concepts/core.mdx @@ -1,176 +1,190 @@ --- -title: "Core Concepts" -description: "Understand the key building blocks of Prism: gateways, virtual API keys, organizations, providers, and configurations." 
+title: "How it works" +description: "Understand Prism's request pipeline, plugin architecture, virtual keys, multi-tenancy, and configuration hierarchy." --- -## Overview +## About -Prism is built on a small set of core building blocks. Understanding these helps you configure and operate the gateway effectively. You do not need to understand all of them to get started. The [Quickstart](/docs/prism/quickstart) covers the minimum, but this page explains how everything fits together. +Every request flows through a pipeline of plugins in a fixed order: authentication, caching, budget checks, guardrails, rate limiting, then the provider call, followed by cost tracking and logging. Cache hits skip the provider entirely. Per-org configuration keeps tenants isolated. ---- +## The request pipeline -## Gateway +Prism is a proxy that sits between your application and your LLM providers. Every request passes through a chain of plugins before reaching the provider, and the response passes through another chain on the way back. -The gateway is Prism's core engine. It is a high-performance proxy that receives every LLM request and passes it through a series of checks before forwarding it to a provider. +The plugins run in a fixed priority order. Lower numbers run first: -Each step in the pipeline runs in a fixed order: +### Pre-request plugins (run before the provider call) -``` -Request → IP ACL → Auth → RBAC → Cache Lookup - │ - [hit] ←────┤────→ [miss] - │ │ - Return cached Budget → Guardrails (Pre) - → Rate Limit → Provider Call - │ - Guardrails (Post) → Cost → Logging → Response -``` +| Priority | Plugin | What it does | +|---|---|---| +| 10 | **IP ACL** | Blocks requests from denied IP addresses or CIDR ranges | +| 20 | **Auth** | Validates the virtual API key, identifies the organization | +| 30 | **RBAC** | Checks role-based permissions (can this key call this model?) | +| 35 | **Cache** | Checks for an exact or semantic cache match. 
On a hit, skips everything below and returns instantly. | +| 40 | **Budget** | Checks org/key/user spend against configured limits | +| 50 | **Guardrails** | Runs safety checks on the incoming request (PII, injection, blocklist, etc.) | +| 60 | **Tool policy** | Filters or rejects tool/function calls based on allow/deny lists | +| 70 | **Validation** | Validates the model name against the model database | +| 80 | **Rate limit** | Enforces RPM/TPM limits per org, key, user, or model | -**What each step does:** +### Provider call -- **IP ACL (IP Access Control List):** Checks whether the request's source IP address is permitted. Blocks requests from IPs not on the allowlist. -- **Auth:** Validates the virtual API key in the `Authorization` header. -- **RBAC (Role-Based Access Control):** Checks whether this key has permission to make this type of request (e.g., is it allowed to call this model or endpoint?). -- **Cache Lookup:** Checks whether an identical or semantically similar request has been answered before. Cache hits skip everything below and return instantly. -- **Budget:** Verifies the organization's spending limit has not been exceeded. -- **Guardrails (Pre):** Runs safety checks on the incoming request before it reaches the provider. -- **Rate Limit:** Enforces per-key or per-org request rate limits. -- **Provider Call:** Forwards the request to the selected LLM provider. -- **Guardrails (Post):** Runs safety checks on the provider's response before it reaches your application. -- **Cost:** Calculates the request cost based on token usage. -- **Logging:** Records the request, response, and metadata for observability. +After all pre-request plugins pass, Prism forwards the request to the selected LLM provider. The routing layer picks the provider based on your configured strategy (round-robin, weighted, least-latency, etc.) and handles failover if the primary provider is down. 
-Cache hits skip guardrails, rate limiting, the provider call, and cost calculation entirely, returning the stored response immediately with zero provider cost. +### Post-response plugins (run after the provider responds) ---- +Some post-plugins run sequentially because they depend on each other. The rest run in parallel for performance. -## Virtual API Keys +**Sequential (order matters):** -Prism uses virtual API keys (prefixed `sk-prism-`) to authenticate requests. These are Prism-specific keys, not the keys for OpenAI, Anthropic, or any other provider. +| Priority | Plugin | What it does | +|---|---|---| +| 35 | **Cache** | Writes the fresh response to cache for future requests | +| 40 | **Budget** | Updates spend counters | +| 80 | **Rate limit** | Updates rate counters | +| 500 | **Cost** | Calculates the request cost from token usage and model pricing | +| 510 | **Credits** | Deducts cost from the key's credit balance (managed keys only) | -When a request arrives, Prism: +**Parallel (independent observers, run concurrently):** -1. Validates the virtual key -2. Identifies which organization the key belongs to -3. Loads that organization's provider credentials, guardrails, routing rules, and rate limits -4. Routes the request to the appropriate LLM provider using the organization's stored provider credentials +| Priority | Plugin | What it does | +|---|---|---| +| 900 | **Logging** | Buffers the request trace for the control plane | +| 900 | **Audit** | Emits structured audit events to configured sinks | +| 997 | **Alerting** | Checks alert rule conditions (error rate, cost, latency) | +| 998 | **Prometheus** | Increments counters and histograms | +| 999 | **OpenTelemetry** | Exports a span to your OTLP endpoint | + + +Post-plugin failures are non-fatal. If logging or metrics fail, the response has already been sent to your application. Errors are logged as warnings but never block the response. 
+ + +--- + +## Cache hits and short-circuiting + +When the cache plugin finds an exact match at priority 35, it short-circuits the pipeline. The provider is never called, and the cached response is returned immediately. + +On an exact cache hit: +- Budget, guardrails, tool policy, validation, and rate limiting are all skipped +- Cost and credits are skipped (no tokens were consumed) +- Logging, audit, metrics, and alerting still run (so cache hits appear in your dashboards) -Virtual keys keep your provider API keys secure. Your application code never sees or stores raw provider credentials, only the Prism virtual key. +Semantic cache hits (similar but not identical requests) also short-circuit the provider call. Cost and credits plugins still run on semantic hits, unlike exact hits where they're skipped entirely. --- -## Organizations and Multi-Tenancy +## Virtual API keys -Multi-tenancy means multiple independent users, teams, or customers share the same gateway infrastructure while remaining completely isolated from one another. Each organization has its own set of providers, guardrails, routing rules, rate limits, and budgets. One organization's configuration cannot affect another's. +Prism uses virtual keys (prefixed `sk-prism-`) to authenticate requests. These are not your provider API keys - they're Prism-specific keys that map to an organization and its configuration. + +When a request arrives with a virtual key, Prism: + +1. Validates the key and checks it hasn't expired or been revoked +2. Identifies which organization the key belongs to +3. Loads that organization's providers, guardrails, routing rules, rate limits, and budgets +4. Routes the request using the org's stored provider credentials -This is useful in several scenarios: +Your application never sees or stores raw provider API keys. Rotate a provider key in Prism and every application using that org's virtual keys picks up the change automatically. 
-- **SaaS products:** Give each of your customers their own isolated gateway environment with separate provider keys and guardrails. -- **Team separation:** Track spend and enforce policies per team without shared limits affecting each other. -- **Staging vs. production:** Run production and staging on the same gateway with different configurations. -- **Resellers:** Provision isolated environments for downstream customers. +Each virtual key can have its own restrictions: -Each organization gets its own isolated: +- **Model restrictions** - limit which models this key can call +- **Provider restrictions** - limit which providers this key can use +- **RPM/TPM limits** - per-key rate limits (independent of org limits) +- **Expiration date** - auto-expires the key +- **Allowed IPs** - restrict which IPs can use this key +- **Tool allow/deny lists** - control which function calls are permitted +- **Guardrail overrides** - change enforcement mode per key +- **BYOK (Bring Your Own Key)** - let the caller supply their own provider key +- **Credit balance** - managed keys with a USD budget that auto-deducts per request -- Providers (and their encrypted API keys) +--- + +## Multi-tenancy + +Multiple organizations share the same gateway but are completely isolated. Each organization has its own: + +- Providers and their encrypted API keys - Guardrails and safety policies - Routing rules and strategies -- Rate limits -- Budgets and spend tracking +- Rate limits and budgets - Cache namespace +- Tool policies +- MCP tool server registrations +- Audit and alerting configuration -**Configuration hierarchy.** When a setting is specified in multiple places, Prism applies the most specific one. Request headers override API key config, which overrides organization config, which overrides global defaults. +One organization's configuration never affects another's. 
-``` -Request Headers > API Key Config > Organization Config > Global Config -``` +**Common use cases:** +- **SaaS products** - each customer gets an isolated gateway environment +- **Team separation** - track spend and enforce policies per team +- **Staging vs production** - different configs on the same gateway +- **Resellers** - provision isolated environments for downstream customers --- -## Providers - -A provider is an LLM service that Prism routes requests to, for example, OpenAI, Anthropic, or Google Gemini. Each provider has its own API format, authentication method, and model catalog. +## Configuration hierarchy -You configure each provider once (supplying its API key and any required settings), and Prism handles all communication with it from that point. When you make a request, you specify which model to use; Prism determines which provider hosts that model and routes the request accordingly. +When a setting is defined in multiple places, the most specific one wins: -Prism translates between its unified OpenAI-format API and each provider's native format. Providers like Anthropic and Google Gemini have different native APIs, but Prism handles the translation transparently. Your client code stays the same regardless of which provider handles the request. +``` +Request headers > API key config > Organization config > Global config +``` -Provider configuration includes: +For example, if the org sets cache TTL to 5 minutes but a request sends `x-prism-cache-ttl: 60`, that request uses a 60-second TTL. If a key has a guardrail override that sets PII detection to "log only," it overrides the org's "enforce" setting for requests using that key. -- **Name:** The identifier used when configuring routing, failover order, or routing strategies. -- **API format:** How the provider's native API works. Prism normalizes all providers to the OpenAI format. -- **Base URL:** The endpoint Prism calls when routing to this provider. 
-- **API key:** Your credential for this provider, stored encrypted. Never exposed in API responses. -- **Models:** Which models are available through this provider. +This lets you set sensible defaults at the org level and override them for specific keys or individual requests without changing the org config. --- -## Organization Configuration - -Organization configuration controls all gateway behavior for a given organization. It is versioned, and changes are applied to the gateway in real time with no restart required. - -| Section | What it controls | -|---------|-----------------| -| `providers` | Which LLM services are available and their credentials | -| `guardrails` | Safety checks applied to requests and responses | -| `routing` | How requests are distributed across providers (strategy, failover, retries) | -| `cache` | Caching mode, TTL, and namespace settings | -| `rate_limiting` | Maximum request rate per API key or organization | -| `budgets` | Spending limits per period and alert thresholds | -| `cost_tracking` | Cost calculation and attribution settings | -| `ip_acl` | IP Access Control List: which source IP addresses are permitted | -| `alerting` | Email or webhook alerts for budget events, errors, and guardrail triggers | -| `privacy` | Data retention periods and request logging policies | -| `tool_policy` | Which tool and function calls are permitted | -| `mcp` | Model Context Protocol integration settings | -| `model_map` | Custom model name aliases. Map a friendly name like `my-gpt` to an actual model | -| `audit` | Audit log configuration and retention settings | - -Changes to organization configuration are pushed to the gateway in real time. No restart or redeployment needed. - ---- +## Hot-reload and sync -## Guardrails +Configuration changes take effect without restarting the gateway. -Guardrails are safety checks that run on every request and response. Prism includes 18+ built-in types. 
+**Control plane sync:** Every 15 seconds (configurable), the gateway pulls the latest org configs and API keys from the control plane. Only orgs whose config actually changed (detected via SHA-256 hash comparison) trigger updates. Unchanged orgs are skipped. -Each guardrail operates in one of three enforcement modes: +**What happens on a config change:** +- Provider clients are rebuilt with new credentials +- Dynamic guardrail configs are refreshed +- Budget counters are recalculated +- Cache namespaces are isolated per org, so one org's cache change doesn't affect others -| Mode | HTTP Status | What happens | -|------|------------|--------------| -| Enforce | 403 Forbidden | The request is blocked. Prism returns an error to the client. The LLM is never called and no cost is incurred. | -| Monitor | 200 OK | The request proceeds normally, but a warning is logged. Use this to observe traffic patterns before enforcing. | -| Log | 200 OK | The request proceeds. The potential violation is recorded silently for later analysis. | +**Key revocation:** When a key is revoked via the admin API, the revocation is broadcast to all gateway replicas via Redis pub/sub immediately - no waiting for the next 15-second sync. -Start with Monitor mode to understand your traffic before switching to Enforce. This prevents unexpected request blocking while you tune thresholds. +**Model database:** The model pricing and capability database is swapped atomically via an atomic pointer. No locking, no downtime. --- -## Sessions +## Sessions and metadata + +**Sessions:** Group related requests using the `x-prism-session-id` header. Sessions are for grouping and analytics only. Prism does not maintain conversation state between requests. -Group related requests together using the `x-prism-session-id` header. Sessions are used for grouping and analytics only. Prism does not maintain conversation state or memory between requests. 
+**Custom metadata:** Attach arbitrary key-value pairs using the `x-prism-metadata` header. Metadata appears in logs and analytics for cost attribution and tracking by team, feature, user, or any custom dimension. --- -## Custom Metadata +## Streaming + +For streaming requests, pre-request plugins run normally before the stream starts. The stream then flows directly to your application chunk by chunk. Post-response plugins run after the final chunk, once the full response (including token usage) is available. -Attach arbitrary JSON metadata to requests using the `x-prism-metadata` header. Metadata appears in logs and analytics for cost attribution and tracking by team, feature, user, or any custom dimension. +Streaming requests bypass the cache entirely - both on read and write. This is because streaming responses arrive in chunks and caching partial streams creates consistency problems. --- -## What you can do next +## Next Steps - - Get up and running with Prism in minutes + + Get your first request through Prism in 5 minutes - - Explore supported endpoints, request headers, and response headers + + SDK config, per-request overrides, and the configuration hierarchy - - Learn how to configure organizations, routing, and policies + + See all supported LLM providers and how to add them - - Configure LLM providers and credentials + + Set up safety checks on requests and responses diff --git a/src/pages/docs/prism/concepts/platform-integration.mdx b/src/pages/docs/prism/concepts/platform-integration.mdx index dc888b54..082c3cf0 100644 --- a/src/pages/docs/prism/concepts/platform-integration.mdx +++ b/src/pages/docs/prism/concepts/platform-integration.mdx @@ -131,7 +131,7 @@ The `x-prism-metadata` header (or `metadata=` parameter in the SDK) is how you c --- -## Next steps +## Next Steps diff --git a/src/pages/docs/prism/concepts/virtual-keys.mdx b/src/pages/docs/prism/concepts/virtual-keys.mdx new file mode 100644 index 00000000..ed3b8d05 --- /dev/null +++ 
b/src/pages/docs/prism/concepts/virtual-keys.mdx @@ -0,0 +1,292 @@ +--- +title: "Virtual keys & access control" +description: "Manage API keys, control model access with RBAC, restrict IPs, and enforce per-key policies." +--- + +## About + +Virtual keys (`sk-prism-...`) authenticate requests and control what each caller can do. You can restrict models, providers, IPs, tools, and rate limits per key, and layer RBAC roles on top for team-level governance. Prism provides three levels of IP control: global, per-org, and per-key. + +## Virtual API keys + +Every request to Prism uses a virtual key (`sk-prism-...`). These are not provider keys - they're Prism-specific credentials that map to an organization and its policies. + +When a request arrives, Prism validates the key and loads the caller's permissions, restrictions, and configuration. The actual provider API key is stored separately in the org config and never exposed. + +### Key properties + +Each virtual key can have the following restrictions: + +| Property | Type | Description | +|---|---|---| +| `name` | string | Display name for the key | +| `owner` | string | User ID or email of the key owner | +| `key_type` | string | `byok` (default) or `managed` (credit-based billing) | +| `models` | string[] | Models this key can call. Empty = all models. | +| `providers` | string[] | Providers this key can use. Empty = all providers. | +| `allowed_ips` | string[] | IPs or CIDRs allowed to use this key. Empty = no restriction. | +| `allowed_tools` | string[] | Function/tool names this key can invoke. Empty = all tools. | +| `denied_tools` | string[] | Tools blocked for this key, regardless of allow list. | +| `rate_limit_rpm` | int | Requests per minute limit for this key. 0 = no limit. | +| `rate_limit_tpm` | int | Tokens per minute limit for this key. 0 = no limit. | +| `expires_at` | datetime | When the key expires. Null = no expiry. 
| +| `metadata` | object | Arbitrary key-value pairs for tracking (team, env, feature, etc.) | +| `credit_balance` | float | USD balance for managed keys. Auto-deducted per request. | +| `guardrails` | object | Per-key guardrail overrides (disable, change action or threshold). | + +### Key types + +**BYOK (Bring Your Own Key)** - the default. The virtual key controls access and policies. Provider billing flows through the org's own provider account. The provider API key is stored in the org config, not on the virtual key. + +**Managed** - same access control as BYOK, plus a USD credit balance. Each request deducts the actual cost from the balance. When credits run out, requests are blocked. Use this for reseller scenarios or per-team budget enforcement. + +--- + +## Creating and managing keys + + + + + +Go to **Settings > API Keys** in the Future AGI dashboard to create, view, and revoke keys. + + + + + +All key operations require the admin token in the `Authorization` header. + +**Create a key:** + +```bash +curl -X POST https://gateway.futureagi.com/-/keys \ + -H "Authorization: Bearer your-admin-token" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "production-backend", + "owner": "alice@example.com", + "models": ["gpt-4o", "claude-sonnet-4-6"], + "providers": ["openai", "anthropic"], + "rate_limit_rpm": 100, + "rate_limit_tpm": 50000, + "allowed_ips": ["10.0.0.0/8"], + "metadata": {"team": "ml", "env": "production"}, + "expires_at": "2026-12-31T23:59:59Z" + }' +``` + +The response includes the raw key value. This is the only time it's shown - store it securely. + +**List keys:** + +```bash +curl https://gateway.futureagi.com/-/keys \ + -H "Authorization: Bearer your-admin-token" +``` + +**Revoke a key:** + +```bash +curl -X DELETE "https://gateway.futureagi.com/-/keys/key_123" \ + -H "Authorization: Bearer your-admin-token" +``` + +Revocations are broadcast to all gateway replicas via Redis pub/sub immediately. 
+ +**Add credits (managed keys):** + +```bash +curl -X POST "https://gateway.futureagi.com/-/keys/key_123/credits" \ + -H "Authorization: Bearer your-admin-token" \ + -H "Content-Type: application/json" \ + -d '{"amount": 50.00}' +``` + + + + + +--- + +## Per-key guardrail overrides + +Each key can override the org's guardrail settings. Useful when certain keys need different safety policies - for example, an internal testing key that logs PII detections instead of blocking them. + +```yaml +# In config.yaml +auth: + keys: + - name: "internal-testing" + key: "sk-prism-test-key-value" + guardrails: + overrides: + - name: "pii-detection" + action: "log" # override org's "block" to "log" + - name: "prompt-injection" + disabled: true # disable entirely for this key + - name: "content-moderation" + threshold: 0.9 # raise threshold (less sensitive) +``` + +--- + +## RBAC (Role-Based Access Control) + +Layer team-level permissions on top of individual key restrictions. RBAC runs at pipeline priority 30, after authentication. 
+ +### Roles and permissions + +Define roles with permission patterns: + +```yaml +rbac: + enabled: true + default_role: member + roles: + admin: + permissions: ["*"] # full access + member: + permissions: ["models:gpt-4o", "models:claude-*", "providers:openai"] + readonly: + permissions: ["models:gpt-4o-mini"] # cheapest model only +``` + +Permission patterns support wildcards: +- `*` - all permissions +- `models:*` - all models +- `models:gpt-*` - all models starting with "gpt-" +- `providers:openai` - exact provider match +- `guardrails:override` - allows per-request guardrail policy header + +### Teams + +Group users into teams with shared permissions: + +```yaml +rbac: + teams: + ml-team: + role: member + models: ["gpt-4o", "claude-sonnet-4-6", "gemini-2.0-flash"] + members: + alice@example.com: + role: admin # Alice has admin role within this team + bob@example.com: {} # Bob inherits the team's "member" role +``` + +### Role resolution order + +When determining a user's role, Prism checks in this order (first match wins): + +1. **User-level** - role set on the user within their team +2. **Key-level** - `role` in the key's metadata +3. **Team-level** - the team's default role +4. **Global default** - `default_role` in RBAC config + +The team is determined from `team` in the key's metadata. Set it when creating the key: + +```json +{ + "name": "alice-key", + "owner": "alice@example.com", + "metadata": {"team": "ml-team", "role": "admin"} +} +``` + +If no team is set in metadata, only the global default role applies. + +--- + +## IP access control + +Three layers of IP control, checked in order. Any deny at any layer blocks the request. + +### Layer 1: Global ACL (pipeline priority 10) + +Runs before authentication. Blocks IPs at the network level. + +```yaml +ip_acl: + enabled: true + allow: + - "10.0.0.0/8" + - "192.168.1.100" + deny: + - "203.0.113.0/24" +``` + +Deny list is checked first. 
If the IP matches a deny rule, it's blocked regardless of the allow list. If an allow list is configured, only IPs matching it are permitted. + +### Layer 2: Per-org ACL + +Set via the org config admin API. Runs even if global ACL is disabled. + +```bash +curl -X PUT "https://gateway.futureagi.com/-/orgs/org_123/config" \ + -H "Authorization: Bearer your-admin-token" \ + -H "Content-Type: application/json" \ + -d '{ + "ip_acl": { + "enabled": true, + "allow": ["10.0.0.0/8"], + "deny": ["1.2.3.4"] + } + }' +``` + +### Layer 3: Per-key IP restriction + +Set on the virtual key's `allowed_ips` field. This is checked inside the auth plugin (priority 20), not as a separate pipeline stage. + +```bash +curl -X POST https://gateway.futureagi.com/-/keys \ + -H "Authorization: Bearer your-admin-token" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "restricted-key", + "allowed_ips": ["10.0.1.0/24", "192.168.1.50"] + }' +``` + +All three layers accept both bare IPs (`192.168.1.1`) and CIDR notation (`10.0.0.0/8`). + +--- + +## Access groups + +Group models under a logical name for easier policy management: + +```yaml +routing: + access_groups: + fast-models: + description: "Low-latency models for real-time use" + models: ["gpt-4o-mini", "claude-haiku-4-5", "gemini-2.0-flash"] + premium-models: + description: "High-quality models for complex tasks" + models: ["gpt-4o", "claude-sonnet-4-6", "gemini-2.0-pro"] + aliases: + best: "gpt-4o" + cheap: "gpt-4o-mini" +``` + +Instead of listing individual models on each key, assign access group names. Aliases let users request `model: "best"` and Prism resolves it to the actual model name. 
+ +--- + +## Next Steps + + + + See where keys and RBAC fit in the request pipeline + + + Configure the safety checks that keys can override + + + Set per-key and per-org rate limits and spend caps + + + Configure how requests are distributed across providers + + diff --git a/src/pages/docs/prism/deployment/self-hosted.mdx b/src/pages/docs/prism/deployment/self-hosted.mdx index b74fb8b4..57d63db4 100644 --- a/src/pages/docs/prism/deployment/self-hosted.mdx +++ b/src/pages/docs/prism/deployment/self-hosted.mdx @@ -1,5 +1,5 @@ --- -title: "Self-Hosted Deployment" +title: "Self-hosted deployment" description: "Deploy Prism AI Gateway on your own infrastructure using Docker or a Go binary." --- @@ -124,7 +124,7 @@ providers: api_key: "${ANTHROPIC_API_KEY}" api_format: "anthropic" models: - - claude-sonnet-4-20250514 + - claude-sonnet-4-6 ollama: base_url: "http://localhost:11434" @@ -244,7 +244,7 @@ Each provider in the `providers:` section supports: | Setting | Required | Description | |---------|----------|-------------| -| `api_key` | Yes | API key (can use `${ENV_VAR}` syntax) | +| `api_key` | Cloud only | API key (can use `${ENV_VAR}` syntax). Not needed for self-hosted providers like Ollama. | | `api_format` | Yes | Format: `openai`, `anthropic`, `gemini`, `bedrock`, `cohere`, `azure` | | `base_url` | No | Custom endpoint (auto-filled for known providers) | | `type` | No | Provider shorthand: `groq`, `mistral`, `ollama`, `vllm`, etc. | @@ -322,15 +322,18 @@ For production, use a public endpoint (e.g., behind a reverse proxy with TLS). R ## Building from source -If you prefer to build the binary yourself: +If you have access to the source repository, build the binary directly: ```bash -git clone https://github.com/futureagi/core-backend.git -cd core-backend/prism-gateway +cd prism-gateway go build -o prism-gateway ./cmd/prism ./prism-gateway --config config.yaml ``` + +The source repository is private. Contact support for access. 
+ + ## Environment variables All values in `config.yaml` that use `${VAR_NAME}` syntax are resolved from environment variables at startup. For example: @@ -363,13 +366,19 @@ View logs from the container: docker logs -f prism-gateway ``` -## Next steps +## Next Steps - - Deep dive into all configuration options + + Configuration hierarchy and SDK config objects + + + Configure LLM providers + + + Error codes and retry strategies - - Add, configure, and manage LLM providers + + Debug common deployment issues diff --git a/src/pages/docs/prism/features/caching.mdx b/src/pages/docs/prism/features/caching.mdx index 82b764f3..843d52ff 100644 --- a/src/pages/docs/prism/features/caching.mdx +++ b/src/pages/docs/prism/features/caching.mdx @@ -1,37 +1,13 @@ --- title: "Caching" -description: "Reduce costs and latency with Prism's exact match and semantic caching." +description: "Reduce costs and latency with exact match and semantic caching at the gateway level." --- ## About -Prism caches LLM responses at the gateway level. Entirely server-side — no client cache logic needed. A cache hit returns an instant response without calling the provider. The `X-Prism-Cache` header shows cache status (`miss`/`hit`/`skip`), and `X-Prism-Cost` returns `0` on cache hits, eliminating provider charges for repeated queries. +Prism caches LLM responses server-side. A cache hit returns an instant response without calling the provider. The `X-Prism-Cache` response header shows cache status (`hit` or `miss`), and `X-Prism-Cost` returns `0` on exact cache hits since no provider tokens were consumed. ---- - -## Exact vs semantic caching - -| | Exact Match | Semantic Cache | -|---|---|---| -| **How it matches** | Identical request parameters (same messages, model, temperature) | Semantically similar queries via vector embeddings | -| **Example** | Same prompt, character for character | "What's the weather today?" 
matches "Tell me today's weather" | -| **Latency** | Fastest — hash lookup | Slightly higher — embedding computation on each miss | -| **Use case** | Deterministic queries, templates, FAQ bots | Paraphrased questions, conversational variations | -| **Consistency** | Exact same response every time | Nearest cached response | - -**Exact Match** — Fastest and most predictable. Use for deterministic queries where exact repetition is common. - -**Semantic Cache** — Uses vector embeddings to match semantically equivalent queries, even when worded differently. Catches paraphrased questions that exact match would miss. - - -Streaming requests bypass cache entirely. Cache only applies to non-streaming completions. - - ---- - -## About - -Prism caches LLM responses at the gateway level. Entirely server-side, no client cache logic needed. A cache hit returns an instant response without calling the provider. The X-Prism-Cache header shows cache status (miss/hit/skip), and X-Prism-Cost returns 0 on cache hits, eliminating provider charges for repeated queries. +No client-side cache logic needed. Caching works for all providers through the same configuration. --- @@ -43,32 +19,75 @@ Prism caches LLM responses at the gateway level. Entirely server-side, no client --- -## Cache modes - -Prism supports two caching strategies: +## Exact match vs semantic cache -**Exact Match**: Caches responses for identical request parameters. Fastest and most predictable. Use for deterministic queries where exact repetition is common. - -**Semantic Cache**: Uses vector embeddings (numerical representations of text meaning generated by a language model) to match semantically equivalent queries, even when worded differently. For example, "What's the weather like today?" and "Tell me today's weather" would match the same cached response. Even though the words differ. Catches paraphrased questions and variations that exact match would miss. 
Slightly higher latency than exact match due to the embedding computation. +| | Exact match | Semantic cache | +|---|---|---| +| **How it matches** | Identical request parameters (same messages, model, temperature) | Similar queries via vector embeddings | +| **Example** | Same prompt, character for character | "What's the weather today?" matches "Tell me today's weather" | +| **Latency** | Fastest - hash lookup | Slightly higher - embedding computation | +| **Use case** | Deterministic queries, templates | Paraphrased questions, conversational variations | +| **Cost on hit** | Zero (skips cost/credits plugins) | Cost plugins still run (embedding lookup has overhead) | -Streaming requests bypass cache entirely. Cache only applies to non-streaming completions. +Streaming requests bypass cache entirely - both on read and write. Cache only applies to non-streaming completions. --- ## Configuration -Configure caching with enabled flag, strategy, TTL, and maximum entries: - | Setting | Description | Default | -|---------|-------------|---------| +|---|---|---| | `enabled` | Enable or disable caching | `false` | -| `strategy` | Cache strategy: `"exact"` or `"semantic"` | `"exact"` | +| `strategy` | `"exact"` or `"semantic"` | `"exact"` | | `default_ttl` | Time-to-live for cached entries (e.g. `5m`, `1h`) | `5m` | | `max_entries` | Maximum number of cached entries (LRU eviction) | `10000` | -**config.yaml:** + + + + +Go to **Prism > Caching** in the Future AGI dashboard to enable caching, choose a strategy, and set TTL. 
+ + + + + +```python +from prism import Prism, GatewayConfig, CacheConfig + +# Set cache config at the client level +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + config=GatewayConfig( + cache=CacheConfig(enabled=True, strategy="exact", ttl=300, namespace="prod"), + ), +) +``` + + + + + +```typescript +import { Prism } from '@futureagi/prism'; + +const client = new Prism({ + apiKey: 'sk-prism-your-key', + baseUrl: 'https://gateway.futureagi.com', + config: { + cache: { enabled: true, strategy: 'exact', ttl: 300, namespace: 'prod' }, + }, +}); +``` + + + + + +**Self-hosted config.yaml:** ```yaml cache: @@ -81,128 +100,126 @@ cache: ## Cache namespaces -Partition cache into isolated buckets using namespaces. Each namespace maintains its own cache entries, preventing cross-contamination. +Partition cache into isolated buckets. Each namespace maintains its own entries, so entries from one environment don't leak into another. -Use cases for namespaces: -- **Environment isolation**: Separate prod, staging, dev caches -- **Multi-tenant isolation**: Each tenant gets its own cache namespace -- **A/B testing**: Different cache namespaces for different experiment variants +Use the `x-prism-cache-namespace` request header or set it in the SDK config: ---- +```python +# Per-request namespace +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + extra_headers={"x-prism-cache-namespace": "staging"}, +) +``` -## Configuring cache via SDK - - - - ```python - from prism import Prism, GatewayConfig, CacheConfig - - client = Prism( - api_key="sk-prism-your-key", - base_url="https://gateway.futureagi.com", - config=GatewayConfig( - cache=CacheConfig(enabled=True, strategy="exact", ttl=300, namespace="prod"), - ), - ) - ``` - - - ```typescript - import { Prism } from '@futureagi/prism'; - - const client = new Prism({ - apiKey: 'sk-prism-your-key', - baseUrl: 
'https://gateway.futureagi.com', - config: { cache: { enabled: true, strategy: 'exact', ttl: 300, namespace: 'prod' } }, - }); - ``` - - +Common namespace patterns: +- **Environment isolation**: `prod`, `staging`, `dev` +- **Multi-tenant isolation**: one namespace per customer +- **A/B testing**: different namespaces per experiment variant --- ## Per-request cache control -Override cache behavior on a per-request basis using headers: +Override cache behavior on individual requests using headers: | Header | Value | Effect | -|--------|-------|--------| -| x-prism-cache-force-refresh | true | Bypass cache, fetch fresh response | -| Cache-Control | no-store | Disable caching for this request | -| x-prism-cache-ttl | seconds | Override TTL for this response | +|---|---|---| +| `x-prism-cache-force-refresh` | `true` | Bypass cache, fetch fresh response, update cache | +| `Cache-Control` | `no-store` | Disable caching for this request entirely | +| `x-prism-cache-ttl` | seconds | Override TTL for this specific response | +| `x-prism-cache-namespace` | string | Route to a specific cache namespace | -**cURL example:** + -```bash -curl -X POST https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-your-key" \ - -H "x-prism-cache-force-refresh: true" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "messages": [{"role": "user", "content": "What is AI?"}] - }' + + +```python +# Force a fresh response (bypass cache) +response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "What is AI?"}], + extra_headers={"x-prism-cache-force-refresh": "true"}, +) ``` -**Python SDK example:** + + + ```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + response = client.chat.completions.create( model="gpt-4o-mini", messages=[{"role": "user", "content": "What is AI?"}], - 
extra_headers={"x-prism-cache-force-refresh": "true"} + extra_headers={"x-prism-cache-force-refresh": "true"}, ) ``` -**TypeScript SDK example:** + -```typescript -const response = await client.chat.completions.create({ - model: "gpt-4o-mini", - messages: [{ role: "user", content: "What is AI?" }], - headers: { "x-prism-cache-force-refresh": "true" } -}); + + +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "x-prism-cache-force-refresh: true" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "What is AI?"}] + }' ``` + + + + --- ## Cache backends -Prism supports multiple storage backends for cached responses: - **Exact match backends:** | Backend | Use case | -|---------|----------| +|---|---| | In-memory (default) | Single-instance deployments, development | | Redis | Multi-instance deployments, shared cache across replicas | -| Disk | Large cache volumes with persistence | -| Cloud storage (S3, GCS, Azure Blob) | High-durability, long-TTL caching | -**Semantic cache backends** (vector stores for embedding similarity search): +**Semantic cache backends** (vector stores): | Backend | Notes | -|---------|-------| +|---|---| | In-memory | Development and small-scale deployments | | Qdrant | Production-grade self-hosted vector search | | Pinecone | Managed vector database | -| Weaviate | Open-source vector search engine | - -The backend is configured at the gateway level in `config.yaml`. The default is in-memory, which requires no additional setup. -Backend configuration is set at the self-hosted gateway level. If you're using the Prism cloud gateway at `https://gateway.futureagi.com`, the backend is managed for you. +Backend configuration is set at the gateway level in `config.yaml`. If you're using the cloud gateway at `gateway.futureagi.com`, the backend is managed for you. 
--- -## Next steps +## Next Steps - + Configure load balancing and failover - - Monitor and optimize spending + + Monitor spending per provider and model + + + Set per-key and per-org rate limits + + + See where caching fits in the request pipeline diff --git a/src/pages/docs/prism/features/cost-tracking.mdx b/src/pages/docs/prism/features/cost-tracking.mdx index 510cbf79..08ce33ed 100644 --- a/src/pages/docs/prism/features/cost-tracking.mdx +++ b/src/pages/docs/prism/features/cost-tracking.mdx @@ -1,46 +1,28 @@ --- -title: "Cost Tracking & Budgets" -description: "Track LLM costs per request, set budget limits, and configure spend alerts." +title: "Cost tracking" +description: "Track LLM costs per request, attribute spend by team and feature, and configure budget alerts." --- ## About -Prism tracks the cost of every LLM request automatically, giving full visibility into AI spend. Each request returns its cost in the X-Prism-Cost header. Budget limits prevent runaway costs. Cost attribution by team, feature, or user is available via metadata headers. +Prism calculates the cost of every request automatically based on token usage and model pricing. The cost appears in the `x-prism-cost` response header and in the `response.prism.cost` SDK accessor. No setup required. ---- +Cost is calculated as: -## When to use +``` +cost = (input_tokens * input_price_per_token) + (output_tokens * output_price_per_token) +``` -- **Spend monitoring**: Track exactly how much each request, model, and provider costs in real time -- **Budget enforcement**: Prevent runaway costs with configurable spending limits per org -- **Cost attribution**: Break down spend by team, feature, or user with custom metadata headers -- **Threshold alerts**: Receive email notifications when spend crosses defined thresholds +Exact cache hits return `x-prism-cost: 0` since no provider call was made. 
--- -## Per-Request Cost Tracking +## Reading cost per request -Every request through Prism includes cost information in the response headers. + - - -```bash -curl https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-your-key" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "messages": [{"role": "user", "content": "Hello"}] - }' -``` - -Response includes: -``` -X-Prism-Cost: 0.00015 -``` - + - ```python from prism import Prism @@ -50,134 +32,208 @@ client = Prism( ) response = client.chat.completions.create( - model="gpt-4o-mini", + model="gpt-4o", messages=[{"role": "user", "content": "Hello"}], ) print(f"Cost: ${response.prism.cost}") -print(f"Total spend so far: ${client.current_cost}") +print(f"Provider: {response.prism.provider}") +print(f"Model: {response.prism.model_used}") ``` - - -Cost is calculated as: +The Prism SDK also tracks cumulative cost across all requests made with a client: + +```python +# After several requests... +print(f"Total session cost: ${client.current_cost:.4f}") + +# Reset the counter +client.reset_cost() +``` + + + + + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +raw = client.chat.completions.with_raw_response.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], +) +print(f"Cost: ${raw.headers.get('x-prism-cost')}") +print(f"Provider: {raw.headers.get('x-prism-provider')}") ``` -cost = (input_tokens × input_price_per_token) + (output_tokens × output_price_per_token) + + + + + +```bash +curl -i https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Hello"}] + }' +# Look for: x-prism-cost: 0.00015 ``` -Cache hits return `X-Prism-Cost: 0`. 
+ + + --- -## Cost Analytics +## Cost attribution -View detailed cost breakdowns and trends across your organization. +Tag requests with metadata to break down costs by team, feature, user, or any custom dimension. Metadata is indexed and queryable in the analytics dashboard. - - - Access the analytics dashboard at https://app.futureagi.com/dashboard/gateway/analytics + - Available breakdowns: - - Total spend (current period) - - Cost by model - - Cost by provider - - Cost by API key - - Cost timeseries - + - ```python -from prism import Prism +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + request_metadata={"team": "data-science", "feature": "recommendations", "user": "alice"}, +) +``` -client = Prism( - api_key="sk-prism-your-key", - base_url="https://gateway.futureagi.com", - control_plane_url="https://api.futureagi.com", + + + + +```python +import json + +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + extra_headers={ + "x-prism-metadata": json.dumps({"team": "data-science", "feature": "recommendations", "user": "alice"}), + }, ) +``` -overview = client.analytics.overview(start_date="2025-01-01", end_date="2025-01-31") -costs = client.analytics.cost_breakdown(group_by="model") -latency = client.analytics.latency_stats(percentiles=[50, 95, 99]) -model_cmp = client.analytics.model_comparison(models=["gpt-4o", "claude-sonnet-4-6"]) + + + + +```bash +curl https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -H 'x-prism-metadata: {"team":"data-science","feature":"recommendations","user":"alice"}' \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Hello"}] + }' ``` - + + + --- -## Budgets +## Analytics dashboard -Set spending limits to prevent runaway costs. 
When a budget is exceeded, new requests are blocked until the next period begins. +The Future AGI dashboard shows cost breakdowns and trends across your organization. - - - Navigate to Settings → Budgets at https://app.futureagi.com/dashboard/gateway/settings +Available views: +- Total spend for the current period +- Cost by model +- Cost by provider +- Cost by API key +- Cost timeseries (daily/weekly/monthly) +- Cost by metadata dimension (team, feature, user) - Configure: - - Budget limit (USD) - - Budget period (daily, weekly, monthly) - - Alert threshold percentage - +### SDK analytics - ```python -config = client.org_configs.create( - org_id="your-org-id", - config={ - "budgets": { - "limit": 100.00, - "period": "monthly", - "alert_threshold_percent": 80 - } - }, +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + control_plane_url="https://api.futureagi.com", ) -``` - - -| Setting | Values | Description | -|---------|--------|-------------| -| `budget_limit` | USD amount | Maximum spend allowed per period | -| `budget_period` | daily, weekly, monthly | Reset frequency | -| `alert_threshold_percent` | 0-100 | Percentage of budget before alert fires | +# Spending overview +overview = client.analytics.overview( + start_date="2026-01-01", + end_date="2026-01-31", +) -When budget is exceeded, new requests receive a 429 error until the next period. Email alert is sent when threshold is crossed. +# Cost breakdown by model +costs = client.analytics.cost_breakdown(group_by="model") + +# Compare models +comparison = client.analytics.model_comparison( + models=["gpt-4o", "claude-sonnet-4-6"], +) +``` --- -## Email Alerts +## Budget alerts + +Get notified when spending crosses a threshold. Alerts are configured per organization. -Configure alerts for budget overages, errors, latency spikes, and guardrail triggers. 
+ - - - Navigate to Settings → Alerts at https://app.futureagi.com/dashboard/gateway/email-alerts + - Create a new alert: - 1. Name the alert - 2. Select event type - 3. Set recipients - 4. Configure severity - +Go to **Prism > Settings > Alerts** in the Future AGI dashboard. Create a new alert by selecting the event type, setting recipients, and configuring severity. + + + + - ```python alert = client.alerts.create( - name="Budget warning", + name="Budget warning at 80%", condition="cost > 80", recipients=["team@example.com"], severity="high", ) ``` - + + + + + +```typescript +const alert = await client.alerts.create({ + name: "Budget warning at 80%", + condition: "cost > 80", + recipients: ["team@example.com"], + severity: "high", +}); +``` + + + -| Event Type | Trigger | -|------------|---------| +### Alert types + +| Event | Trigger | +|---|---| | `budget_exceeded` | Spend crosses the budget limit | -| `error_spike` | Error rate exceeds threshold | -| `latency_spike` | P95 latency exceeds threshold (P95 means 95% of requests are faster than this value. A spike means the slowest 5% got significantly slower) | +| `budget_threshold` | Spend crosses a percentage threshold (e.g. 80%) | +| `error_spike` | Error rate exceeds configured threshold | +| `latency_spike` | P95 latency exceeds configured threshold | | `guardrail_triggered` | A guardrail blocks or flags a request | @@ -186,51 +242,25 @@ Configure a cooldown period to prevent alert flooding when thresholds are repeat --- -## Cost Attribution with Metadata +## Budget enforcement -Tag requests with custom metadata to break down costs by team, feature, user, or any custom dimension. 
- - - -```bash -curl https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-your-key" \ - -H "x-prism-metadata: team=data-science,feature=recommendations,user=alice" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "messages": [{"role": "user", "content": "Hello"}] - }' -``` - - - -```python -response = client.chat.completions.create( - model="gpt-4o-mini", - messages=[{"role": "user", "content": "Hello"}], - extra_headers={ - "x-prism-metadata": "team=data-science,feature=recommendations,user=alice" - }, -) -``` - - - -Metadata is indexed and queryable in the analytics dashboard for cost attribution. +Budgets are configured on the [Rate limiting & budgets](/docs/prism/features/rate-limiting) page. When a budget is exceeded with `action: block`, new requests return a 429 error until the next period. See that page for configuration details. --- -## Next steps +## Next Steps - - Route requests intelligently across models and providers + + Configure spending limits and rate controls + + + Full reference for cost and metadata headers - - Set up safety checks for your LLM traffic + + Cost-optimized routing across providers - - Connect and manage LLM providers + + Reduce costs with response caching diff --git a/src/pages/docs/prism/features/guardrails.mdx b/src/pages/docs/prism/features/guardrails.mdx index 46eff519..25c1846d 100644 --- a/src/pages/docs/prism/features/guardrails.mdx +++ b/src/pages/docs/prism/features/guardrails.mdx @@ -77,9 +77,16 @@ Choose how Prism handles guardrail violations. Start with Monitor mode to understand traffic patterns before switching to Enforce. +### Fail-open vs fail-closed + +What happens when a guardrail service itself errors (timeout, crash)? + +- **Fail-open** (default): the request proceeds. Use this when availability matters more than safety enforcement. +- **Fail-closed** (`fail_open: false`): the request is blocked. 
Use this when safety is non-negotiable, even at the cost of occasional false rejections during outages. + --- -## Score Thresholds +## Score thresholds Guardrails return confidence scores from 0.0 (safe) to 1.0 (maximum violation). Set thresholds to control sensitivity. @@ -107,12 +114,11 @@ Example response with score: Configure guardrails via the dashboard or SDK. - + - ![Guardrails dashboard](/screenshot/product/prism/guardrails-dashboard.png) - 1. Navigate to Guardrails at https://app.futureagi.com/dashboard/gateway/guardrails - 2. Click Add Guardrail Policy + 1. Go to **Prism > Guardrails** in the Future AGI dashboard + 2. Click **Add Guardrail Policy** 3. Select guardrail type (e.g., PII Detection) 4. Choose enforcement mode: Enforce or Monitor 5. Configure type-specific settings (entities, thresholds, etc.) @@ -120,7 +126,7 @@ Configure guardrails via the dashboard or SDK. 7. Click Save - + ```python from prism import Prism @@ -172,7 +178,7 @@ policy = client.guardrails.policies.create( ``` - + ```typescript import { Prism } from "@futureagi/prism"; @@ -230,7 +236,7 @@ const policy = await client.guardrails.policies.create({ ### PII Detection - + ```python response = client.chat.completions.create( model="gpt-4o-mini", @@ -274,7 +280,7 @@ curl https://gateway.futureagi.com/v1/chat/completions \ ### Prompt Injection - + ```python response = client.chat.completions.create( model="gpt-4o-mini", @@ -318,7 +324,7 @@ curl https://gateway.futureagi.com/v1/chat/completions \ ### Clean Request - + ```python response = client.chat.completions.create( model="gpt-4o-mini", @@ -534,7 +540,7 @@ Dashboard steps: Python SDK: - + ```python blocklist = client.guardrails.blocklists.create( name="Restricted Topics", @@ -557,7 +563,7 @@ config = client.guardrails.configs.create( ) ``` - + ```typescript const blocklist = await client.guardrails.blocklists.create({ name: "Restricted Topics", @@ -608,16 +614,19 @@ client.feedback.create( --- -## Next steps +## Next Steps - - Route 
requests intelligently across models and providers + + Configure load balancing and failover + + + Per-key guardrail overrides and access control - - Connect and manage LLM providers + + See where guardrails fit in the request pipeline - - Track and control LLM spending + + Control request throughput and spending diff --git a/src/pages/docs/prism/features/mcp-a2a.mdx b/src/pages/docs/prism/features/mcp-a2a.mdx index 50d97dfa..f6314cf0 100644 --- a/src/pages/docs/prism/features/mcp-a2a.mdx +++ b/src/pages/docs/prism/features/mcp-a2a.mdx @@ -395,13 +395,19 @@ Prism's agent card specifies which schemes it supports. When routing to downstre --- -## Next steps +## Next Steps - - Validate and control tool execution with pre and post-call guardrails + + Validate and control tool calls with guardrails - - Route requests intelligently and ensure reliable agent communication + + Per-key tool access control and RBAC + + + Route agent requests across providers + + + Full list of MCP and A2A endpoints diff --git a/src/pages/docs/prism/features/observability.mdx b/src/pages/docs/prism/features/observability.mdx new file mode 100644 index 00000000..d2201d66 --- /dev/null +++ b/src/pages/docs/prism/features/observability.mdx @@ -0,0 +1,227 @@ +--- +title: "Observability" +description: "Monitor Prism Gateway with logs, metrics, and distributed tracing." +--- + +## About + +Prism logs every request and response, exports metrics to Prometheus and OpenTelemetry, and propagates trace IDs for distributed tracing. No additional setup needed for basic logging - it's on by default. + +--- + +## Request logging + +Every request through Prism is logged with: + +- Request ID, trace ID, session ID +- Model requested and model actually used +- Provider that handled the request +- Input/output token counts +- Cost +- Latency +- Cache status (hit/miss/skip) +- Guardrail results +- Any errors or fallback events + +Logs sync to the Future AGI dashboard automatically. View them in **Prism > Logs**. 
+ +### Log levels + +| Level | What's logged | +|---|---| +| `error` | Failed requests, provider errors, guardrail blocks | +| `warn` | Fallbacks, slow requests, budget warnings | +| `info` | Every request (default) | +| `debug` | Full request/response bodies, header details | + +For self-hosted deployments, set the log level in `config.yaml`: + +```yaml +logging: + level: info +``` + +--- + +## Distributed tracing + +Prism propagates trace IDs across the request lifecycle. Set `x-prism-trace-id` on incoming requests and the same ID appears in all downstream provider calls and logs. + + + + + +```python +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + trace_id="trace-from-my-app-abc123", + user_id="user-42", +) + +print(response.prism.trace_id) # trace-from-my-app-abc123 +print(response.prism.provider) # openai +print(response.prism.latency_ms) # 342 +print(response.prism.cost) # 0.00015 +``` + + + + + +```python +raw = client.chat.completions.with_raw_response.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + extra_headers={ + "x-prism-trace-id": "trace-from-my-app-abc123", + "x-prism-user-id": "user-42", + }, +) +print(raw.headers.get("x-prism-trace-id")) +print(raw.headers.get("x-prism-cost")) +``` + + + + + +```bash +curl -i https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "x-prism-trace-id: trace-from-my-app-abc123" \ + -H "x-prism-user-id: user-42" \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "Hello"}]}' +# Look for x-prism-trace-id in response headers +``` + + + + + +If you don't set a trace ID, Prism generates one automatically. Use it for correlating gateway logs with your application logs. 
+ +### OpenTelemetry integration + +Self-hosted deployments can export traces to any OpenTelemetry-compatible backend: + +```yaml +telemetry: + traces: + enabled: true + exporter: otlp + endpoint: "http://otel-collector:4317" + service_name: "prism-gateway" +``` + +--- + +## Metrics + +Prism exports Prometheus metrics on the `/-/metrics` endpoint. + +### Available metrics + +| Metric | Type | Description | +|---|---|---| +| `prism_requests_total` | Counter | Total requests by model, provider, status code | +| `prism_request_duration_seconds` | Histogram | Request latency distribution | +| `prism_tokens_total` | Counter | Total tokens (input + output) by model | +| `prism_cost_total` | Counter | Total cost in USD by model and provider | +| `prism_cache_hits_total` | Counter | Cache hits by strategy (exact/semantic) | +| `prism_cache_misses_total` | Counter | Cache misses | +| `prism_provider_errors_total` | Counter | Provider errors by provider and error code | +| `prism_circuit_breaker_state` | Gauge | Circuit breaker state (0=closed, 1=open, 2=half-open) | +| `prism_rate_limit_exceeded_total` | Counter | Rate limit rejections by key | +| `prism_guardrail_triggered_total` | Counter | Guardrail triggers by guardrail name and action | + +### Scrape configuration + +```yaml +# prometheus.yml +scrape_configs: + - job_name: "prism-gateway" + scrape_interval: 15s + metrics_path: "/-/metrics" + static_configs: + - targets: ["prism-gateway:8080"] +``` + +### Self-hosted metrics config + +```yaml +telemetry: + metrics: + enabled: true + prometheus: + enabled: true + path: "/-/metrics" +``` + +--- + +## Session tracking + +Group related requests into sessions for conversation-level analytics. 
Set the `x-prism-session-id` header — or the `session_id` parameter when using the Prism SDK, as shown below — on each request in a conversation: + +```python +session_id = "user-123-conversation-456" +messages = [] + +# Each turn in the conversation shares the same session_id +messages.append({"role": "user", "content": "What's the capital of France?"}) +response = client.chat.completions.create( + model="gpt-4o", + messages=messages, + session_id=session_id, + user_id="user-123", +) +messages.append({"role": "assistant", "content": response.choices[0].message.content}) + +messages.append({"role": "user", "content": "What's its population?"}) +response = client.chat.completions.create( + model="gpt-4o", + messages=messages, + session_id=session_id, + user_id="user-123", +) +``` + +Sessions appear in the dashboard under **Prism > Sessions** and show: +- Total requests in the session +- Cumulative cost +- Models and providers used +- Timeline of requests + +--- + +## Alerting + +Configure alerts to get notified about issues. See [Cost tracking > Budget alerts](/docs/prism/features/cost-tracking#budget-alerts) for alert configuration. + +| Event | When it fires | +|---|---| +| Budget threshold crossed | Spend exceeds configured percentage | +| Error rate spike | Error rate exceeds threshold over a time window | +| Latency spike | P95 latency exceeds threshold | +| Guardrail triggered | A guardrail blocks or flags a request | + +--- + +## Next Steps + + + + Cost attribution and budget management + + + All headers for request correlation + + + Deploy with metrics and logging + + + A/B test models on production traffic + + diff --git a/src/pages/docs/prism/features/providers.mdx b/src/pages/docs/prism/features/providers.mdx index 8b95414e..0021fd20 100644 --- a/src/pages/docs/prism/features/providers.mdx +++ b/src/pages/docs/prism/features/providers.mdx @@ -1,323 +1,345 @@ --- -title: "Manage Providers" -description: "Add, configure, and manage LLM providers in Prism." 
+title: "Supported providers" +description: "All LLM providers Prism supports, how to add them, and how to switch providers at request time." --- ## About -Providers are the LLM services Prism routes requests to. Each has its own API format, authentication, and model catalog. Prism translates between its unified OpenAI-compatible API and each provider's native format, allowing you to switch providers without changing client code. +Prism supports 20+ cloud and self-hosted LLM providers through a unified OpenAI-compatible API. Add a provider once with its API key, then switch between providers by changing the model name in your request. + +## Cloud providers + +| Provider | Type | `api_format` | Auth | Notes | +|---|---|---|---|---| +| OpenAI | `openai` | `openai` | API key | Native format | +| Anthropic | `anthropic` | `anthropic` | API key | Auto-translated to OpenAI format | +| Google Gemini | `gemini` | `gemini` | API key | Auto-translated to OpenAI format | +| Google Vertex AI | `vertexai` | `gemini` | Bearer token | Uses GCP project/location headers | +| AWS Bedrock | `bedrock` | `bedrock` | SigV4 | Requires AWS region, cross-region failover supported | +| Azure OpenAI | `azure` | `azure` | API key | Requires `api_version`, supports Azure AD bearer auth | +| Cohere | `cohere` | `cohere` | API key | Auto-translated to OpenAI format | +| Groq | `groq` | `openai` | API key | OpenAI-compatible | +| Mistral AI | `mistral` | `openai` | API key | OpenAI-compatible | +| Together AI | `together` | `openai` | API key | OpenAI-compatible | +| Fireworks AI | `fireworks` | `openai` | API key | OpenAI-compatible | +| DeepInfra | `deepinfra` | `openai` | API key | OpenAI-compatible | +| Perplexity | `perplexity` | `openai` | API key | OpenAI-compatible | +| Cerebras | `cerebras` | `openai` | API key | OpenAI-compatible | +| xAI (Grok) | `xai` | `openai` | API key | OpenAI-compatible | +| OpenRouter | `openrouter` | `openai` | API key | OpenAI-compatible | +| Hugging Face | 
`huggingface` | `openai` | API key | Inference API | +| Anyscale | `anyscale` | `openai` | API key | OpenAI-compatible | +| Replicate | `replicate` | `openai` | API key | OpenAI-compatible | + +Providers marked "OpenAI-compatible" use the same wire format as OpenAI. No translation needed. Providers with native formats (Anthropic, Gemini, Bedrock, Cohere) are automatically translated by Prism - your code stays identical regardless of which provider handles the request. ---- - -## When to use - -- **Multi-provider redundancy**: Configure backup providers for automatic failover -- **Cost optimization**: Route to cheaper providers for appropriate workloads -- **Self-hosted models**: Connect Ollama, vLLM, or LM Studio running on your infrastructure -- **Provider evaluation**: Test multiple providers with the same prompts to compare quality + +Prism supports all models from each provider, including new releases. Use any model name your provider supports. + ---- +## Self-hosted providers -## Supported providers - -### Cloud providers - -| Provider | `api_format` | Native API translated | -|----------|-------------|----------------------| -| OpenAI | `openai` | Native | -| Anthropic | `anthropic` | Yes — Prism translates to OpenAI format | -| Google Gemini | `gemini` | Yes — Prism translates to OpenAI format | -| Google Vertex AI | `gemini` | Yes — uses Bearer token auth | -| AWS Bedrock | `bedrock` | Yes — uses AWS SigV4 signing | -| Azure OpenAI | `azure` | Compatible | -| Cohere | `cohere` | Yes — Prism translates to OpenAI format | -| Groq | `openai` | Compatible | -| Mistral AI | `openai` | Compatible | -| Together AI | `openai` | Compatible | -| Fireworks AI | `openai` | Compatible | -| DeepInfra | `openai` | Compatible | -| Perplexity | `openai` | Compatible | -| Cerebras | `openai` | Compatible | -| xAI (Grok) | `openai` | Compatible | -| OpenRouter | `openai` | Compatible | - -### Self-hosted providers - -| Provider | `api_format` | Notes | 
-|----------|-------------|-------| -| Ollama | `openai` | Auto-discovers models from `/v1/models` | -| vLLM | `openai` | Auto-discovers models from `/v1/models` | -| LM Studio | `openai` | Compatible | -| HuggingFace TGI | `openai` | Compatible | -| Any OpenAI-compatible | `openai` | Works with any server implementing the OpenAI API | +| Provider | Type | Notes | +|---|---|---| +| Ollama | `ollama` | Auto-discovers models from `/v1/models` | +| vLLM | `vllm` | Auto-discovers models from `/v1/models` | +| LM Studio | `lmstudio` | OpenAI-compatible | +| HuggingFace TGI | `tgi` | OpenAI-compatible | +| LocalAI | `localai` | OpenAI-compatible | +| Any OpenAI-compatible server | - | Works with any server implementing the OpenAI API | -Providers with a native API format different from OpenAI (Anthropic, Gemini, Bedrock, Cohere) are automatically translated by Prism. Your client code stays identical regardless of which provider handles the request. +Your self-hosted endpoint must be reachable from the Prism gateway. Use a tunnel (ngrok, Cloudflare Tunnel), a cloud VM with a public IP, or deploy behind a reverse proxy. - -Prism supports all models from each provider, including new releases. Configure any model name your provider supports. - - --- ## Adding a provider - - - ![Providers dashboard](/screenshot/product/prism/providers-dashboard.png) - - 1. Open Prism dashboard at https://app.futureagi.com/dashboard/gateway/providers - 2. Click Add Provider - 3. Select provider from list - 4. Enter API key and optional configuration - 5. 
Click Save - - - ```python - from prism import Prism - - client = Prism( - api_key="sk-prism-your-key", - base_url="https://gateway.futureagi.com", - control_plane_url="https://api.futureagi.com", - ) - - # Create or update the org config with provider credentials - org_config = client.org_configs.create( - org_id="your-org-id", - config={ - "providers": { - "openai": { - "api_key": "sk-your-openai-key", - "api_format": "openai", - "models": ["gpt-4o", "gpt-4o-mini"], - } - } - } - ) - - # List all org configs - configs = client.org_configs.list() - - # Update an existing config - client.org_configs.update( - org_config["id"], - config={ - "providers": { - "openai": {"api_key": "sk-your-new-key"}, - "anthropic": {"api_key": "sk-ant-your-key", "api_format": "anthropic"}, - } + + + + +1. Go to **Prism > Providers** in the Future AGI dashboard +2. Click **Add Provider** +3. Select the provider from the list +4. Enter your API key and any required settings +5. Click **Save** + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + control_plane_url="https://api.futureagi.com", +) + +client.org_configs.create( + org_id="your-org-id", + config={ + "providers": { + "openai": { + "api_key": "sk-your-openai-key", + "api_format": "openai", + "models": ["gpt-4o", "gpt-4o-mini"], + }, + "anthropic": { + "api_key": "sk-ant-your-key", + "api_format": "anthropic", + }, } - ) - ``` - - - ```typescript - import { Prism } from "@futureagi/prism"; - - const client = new Prism({ - apiKey: "sk-prism-your-key", - baseUrl: "https://gateway.futureagi.com", - controlPlaneUrl: "https://api.futureagi.com", - }); - - // Create or update the org config with provider credentials - const orgConfig = await client.orgConfigs.create({ - orgId: "your-org-id", - config: { - providers: { - openai: { - api_key: "sk-your-openai-key", - api_format: "openai", - models: ["gpt-4o", "gpt-4o-mini"], - }, - }, - }, - }); + } 
+) +``` - // List all org configs - const configs = await client.orgConfigs.list(); + - // Update an existing config - await client.orgConfigs.update(orgConfig.id, { - config: { + + +```typescript +import { Prism } from "@futureagi/prism"; + +const client = new Prism({ + apiKey: "sk-prism-your-key", + baseUrl: "https://gateway.futureagi.com", + controlPlaneUrl: "https://api.futureagi.com", +}); + +await client.orgConfigs.create({ + orgId: "your-org-id", + config: { providers: { - openai: { api_key: "sk-your-new-key" }, - anthropic: { api_key: "sk-ant-your-key", api_format: "anthropic" }, + openai: { + api_key: "sk-your-openai-key", + api_format: "openai", + models: ["gpt-4o", "gpt-4o-mini"], + }, + anthropic: { + api_key: "sk-ant-your-key", + api_format: "anthropic", + }, }, - }, - }); - ``` - + }, +}); +``` + + + Provider API keys are stored encrypted and never exposed in API responses. - -Custom models can be configured via the Custom & Self-Hosted section or through org config. - - --- ## Switching providers at request time -Use the same code with different model names to switch providers: +Change the model name to route to a different provider. Same code, same API, different LLM. 
-**Python SDK:** + + + ```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + # OpenAI response = client.chat.completions.create( - model="gpt-4o-mini", + model="gpt-4o", messages=[{"role": "user", "content": "Hello"}] ) -# Anthropic +# Anthropic - same code, different model response = client.chat.completions.create( - model="anthropic/claude-haiku-4-5", + model="claude-sonnet-4-6", messages=[{"role": "user", "content": "Hello"}] ) # Google Gemini response = client.chat.completions.create( - model="gemini/gemini-2.0-flash", + model="gemini-2.0-flash", messages=[{"role": "user", "content": "Hello"}] ) ``` -**TypeScript SDK:** + -```typescript -// OpenAI -const response = await client.chat.completions.create({ - model: "gpt-4o-mini", - messages: [{ role: "user", content: "Hello" }] -}); + -// Anthropic -const response = await client.chat.completions.create({ - model: "anthropic/claude-haiku-4-5", - messages: [{ role: "user", content: "Hello" }] -}); +```python +from openai import OpenAI -// Google Gemini -const response = await client.chat.completions.create({ - model: "gemini/gemini-2.0-flash", - messages: [{ role: "user", content: "Hello" }] -}); +# Works with the OpenAI SDK - just swap base_url and api_key +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}] +) +``` + + + + + +```python +import litellm + +response = litellm.completion( + model="openai/gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com/v1", +) ``` + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", 
"content": "Hello"}] + }' +``` + + + + + --- -## Custom & Self-Hosted providers - -Connect self-hosted models running on your infrastructure using Ollama, vLLM, or LM Studio. - - - - ![Custom providers dashboard](/screenshot/product/prism/custom-providers-dashboard.png) - - 1. Open Prism dashboard at https://app.futureagi.com/dashboard/gateway/providers - 2. Click Add Provider - 3. Enter your model's public endpoint URL - 4. Enter model name - 5. Click Save - - - ```python - from prism import Prism - - client = Prism( - api_key="sk-prism-your-key", - base_url="https://gateway.futureagi.com", - control_plane_url="https://api.futureagi.com", - ) - - # Add Ollama and vLLM via org config - client.org_configs.create( - org_id="your-org-id", - config={ - "providers": { - "ollama": { - "base_url": "https://your-ollama-endpoint.example.com", - "api_format": "openai", - "type": "ollama", - # models auto-discovered from /v1/models - }, - "vllm": { - "base_url": "https://your-vllm-endpoint.example.com", - "api_format": "openai", - "type": "vllm", - "models": ["meta-llama/Llama-3.1-8B-Instruct"], - }, - } +## Self-hosted setup + +Connect models running on your own infrastructure. + + + + + +1. Go to **Prism > Providers** +2. Click **Add Provider** +3. Enter your model's public endpoint URL +4. Enter the model name +5. 
Click **Save** + + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + control_plane_url="https://api.futureagi.com", +) + +client.org_configs.create( + org_id="your-org-id", + config={ + "providers": { + "ollama": { + "base_url": "https://your-ollama.example.com", + "api_format": "openai", + "type": "ollama", + # models auto-discovered from /v1/models + }, + "vllm": { + "base_url": "https://your-vllm.example.com", + "api_format": "openai", + "type": "vllm", + "models": ["meta-llama/Llama-3.1-8B-Instruct"], + }, } - ) - ``` - - - ```typescript - import { Prism } from "@futureagi/prism"; - - const client = new Prism({ - apiKey: "sk-prism-your-key", - baseUrl: "https://gateway.futureagi.com", - controlPlaneUrl: "https://api.futureagi.com", - }); - - // Add Ollama and vLLM via org config - await client.orgConfigs.create({ - orgId: "your-org-id", - config: { + } +) +``` + + + + + +```typescript +import { Prism } from "@futureagi/prism"; + +const client = new Prism({ + apiKey: "sk-prism-your-key", + baseUrl: "https://gateway.futureagi.com", + controlPlaneUrl: "https://api.futureagi.com", +}); + +await client.orgConfigs.create({ + orgId: "your-org-id", + config: { providers: { - ollama: { - base_url: "https://your-ollama-endpoint.example.com", - api_format: "openai", - type: "ollama", - // models auto-discovered from /v1/models - }, - vllm: { - base_url: "https://your-vllm-endpoint.example.com", - api_format: "openai", - type: "vllm", - models: ["meta-llama/Llama-3.1-8B-Instruct"], - }, + ollama: { + base_url: "https://your-ollama.example.com", + api_format: "openai", + type: "ollama", + }, + vllm: { + base_url: "https://your-vllm.example.com", + api_format: "openai", + type: "vllm", + models: ["meta-llama/Llama-3.1-8B-Instruct"], + }, }, - }, - }); - ``` - - + }, +}); +``` - -Your self-hosted endpoint must be publicly accessible from Prism's gateway. 
Use a tunnel service (ngrok, Cloudflare Tunnel), a cloud VM with a public IP, or deploy behind a reverse proxy with a public domain. - + + + --- ## Provider health -Prism automatically monitors provider health and availability. Unhealthy providers are automatically excluded from routing until they recover. The system tracks: +Prism monitors provider health automatically. It tracks response times, error rates, and availability. When a provider becomes unhealthy: -- Response times and latency -- Error rates and failure patterns -- Rate limit status -- Availability and uptime +1. The circuit breaker opens to stop sending requests to the failing provider +2. Traffic fails over to healthy alternatives +3. After a cooldown period, Prism sends probe requests to check recovery +4. Once the provider responds successfully, it's added back to the rotation -When a provider becomes unhealthy, Prism triggers failover to healthy alternatives and opens the circuit breaker to prevent cascading failures. Alerts notify you of provider issues so you can investigate and resolve problems. +See [Failover & circuit breaking](/docs/prism/features/routing) for configuration details. --- -## Next steps +## Next Steps - - Configure load balancing and failover + + Configure load balancing across providers + + + Monitor spending per provider and model + + + Understand the full request pipeline - - Monitor and optimize spending + + Add safety checks before requests reach providers diff --git a/src/pages/docs/prism/features/rate-limiting.mdx b/src/pages/docs/prism/features/rate-limiting.mdx index 806462e0..e929396f 100644 --- a/src/pages/docs/prism/features/rate-limiting.mdx +++ b/src/pages/docs/prism/features/rate-limiting.mdx @@ -1,43 +1,130 @@ --- -title: "Rate Limiting" -description: "Control request throughput to the Prism AI Gateway with configurable rate limits." 
+title: "Rate limiting & budgets" +description: "Control request throughput and spending with per-key rate limits, org budgets, and managed key credits." --- ## About -Rate limiting protects your gateway from traffic spikes and controls API consumption. Prism enforces rate limits at the gateway level, returning 429 responses when thresholds are exceeded. When rate limited, the gateway includes `X-Ratelimit-*` headers so clients know when to retry. +Rate limiting controls how many requests a key or org can make per minute. Budgets control how much money can be spent per period. Credits give individual keys a prepaid USD balance. All three work together to prevent runaway costs and protect provider quotas. -## Configuration +--- + +## When to use + +- **Prevent abuse**: Cap RPM per key so one user can't monopolize gateway capacity +- **Control spending**: Set monthly budgets per org so teams can't exceed their allocation +- **Reseller billing**: Give each customer key a credit balance that auto-deducts per request +- **Protect provider quotas**: Global RPM limits prevent hitting provider rate limits + +--- + +## Rate limiting + +Prism supports rate limits at three levels: global, per-org, and per-key. + +| Level | Scope | How to set | +|---|---|---| +| **Global** | All requests to the gateway | `config.yaml` | +| **Per-org** | All requests from one organization | Org config via admin API | +| **Per-key** | Requests using a specific API key | Key config (RPM and TPM) | + +The most restrictive limit applies. If the global limit is 1000 RPM and a key's limit is 100 RPM, that key is capped at 100 RPM. -Rate limiting is configured in your `config.yaml` file at the gateway level: +### Configuration + + + + + +Go to **Prism > Rate Limits** in the Future AGI dashboard to set global and per-org limits. + +Per-key limits are set when creating or editing a key in **Settings > API Keys**. 
+ + + + + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + control_plane_url="https://api.futureagi.com", +) + +# Set per-org rate limits +client.org_configs.create( + org_id="your-org-id", + config={ + "rate_limiting": { + "enabled": True, + "rpm": 500, # requests per minute for this org + "tpm": 100000, # tokens per minute for this org + } + } +) +``` + + + + + +```typescript +import { Prism } from "@futureagi/prism"; + +const client = new Prism({ + apiKey: "sk-prism-your-key", + baseUrl: "https://gateway.futureagi.com", + controlPlaneUrl: "https://api.futureagi.com", +}); + +await client.orgConfigs.create({ + orgId: "your-org-id", + config: { + rate_limiting: { + enabled: true, + rpm: 500, + tpm: 100000, + }, + }, +}); +``` + + + + + +**Self-hosted config.yaml:** ```yaml +# Global rate limit (all requests) rate_limiting: enabled: true - global_rpm: 1000 # Maximum requests per minute (0 = unlimited) -``` + global_rpm: 1000 -The `global_rpm` setting applies to all incoming requests through this gateway instance. Set it to `0` for unlimited requests, or specify a positive integer to enforce a per-minute ceiling. - - -Rate limiting is enforced globally across all requests to the gateway. Per-user, per-key, or per-model rate limiting is not currently supported. - +# Per-key limits are set on the key itself +auth: + keys: + - name: "limited-key" + key: "sk-prism-..." 
+ rate_limit_rpm: 100 + rate_limit_tpm: 50000 +``` -## Response headers +### Response headers -When rate limiting is enabled, every response includes headers that tell you about your current quota: +Every response includes rate limit headers: | Header | Description | -|--------|-------------| -| `X-Ratelimit-Limit-Requests` | The maximum number of requests allowed per minute | -| `X-Ratelimit-Remaining-Requests` | How many requests you have left in the current minute | -| `X-Ratelimit-Reset-Requests` | Unix timestamp (seconds) when the rate limit window resets | +|---|---| +| `X-Ratelimit-Limit-Requests` | Maximum requests allowed per minute | +| `X-Ratelimit-Remaining-Requests` | Requests remaining in the current window | +| `X-Ratelimit-Reset-Requests` | Unix timestamp when the window resets | -Use these headers to monitor your usage and back off before hitting the limit. +### Error response (429) -## Error response (429) - -When you exceed the rate limit, the gateway returns a 429 status code with this error body: +When a rate limit is exceeded: ```json { @@ -49,35 +136,48 @@ When you exceed the rate limit, the gateway returns a 429 status code with this } ``` -## Reading rate limit headers +### Retry logic -Here's how to inspect rate limit headers using cURL: + -```bash -curl -i -X POST https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-..." 
\ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o", - "messages": [{"role": "user", "content": "Hello"}] - }' -``` + + +```python +import time +from prism import Prism, RateLimitError + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +def call_with_retry(max_retries=3): + for attempt in range(max_retries): + try: + return client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + ) + except RateLimitError: + if attempt < max_retries - 1: + time.sleep(2 ** attempt) # 1s, 2s, 4s + continue + raise -The `-i` flag includes response headers in the output. Look for the `X-Ratelimit-*` headers in the response. +result = call_with_retry() +``` -## Handling rate limit errors + -When you receive a 429 response, wait until the `X-Ratelimit-Reset-Requests` timestamp before retrying. Here's how to implement retry logic in your application: + - -```python Python +```python import time -from prism import Prism -from prism._exceptions import PrismError +from openai import OpenAI, RateLimitError -client = Prism( - api_key="sk-prism-...", - base_url="https://gateway.futureagi.com" +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", ) def call_with_retry(max_retries=3): @@ -87,61 +187,165 @@ def call_with_retry(max_retries=3): model="gpt-4o", messages=[{"role": "user", "content": "Hello"}], ) - except PrismError as e: - if "rate_limit" in str(e).lower() and attempt < max_retries - 1: - # Exponential backoff: 1s, 2s, 4s + except RateLimitError: + if attempt < max_retries - 1: time.sleep(2 ** attempt) continue raise result = call_with_retry() -print(result) ``` -```typescript TypeScript -import { Prism, RateLimitError } from "@futureagi/prism"; + -const client = new Prism({ - apiKey: "sk-prism-...", - baseUrl: "https://gateway.futureagi.com", -}); + + +```bash +# Check rate limit headers with -i flag +curl -i -X POST 
https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Hello"}] + }' +# Look for X-Ratelimit-Remaining-Requests in the response headers +``` + + + + + +--- + +## Budgets + +Set spending limits per org, per key, per user, or per model. Budgets can be daily, weekly, monthly, or total. + +| Setting | Description | +|---|---| +| `period` | `daily`, `weekly`, `monthly`, or `total` | +| `limit` | USD amount | +| `action` | `block` (hard limit, reject requests) or `warn` (soft limit, log warning) | + + + + + +Go to **Prism > Budgets** in the Future AGI dashboard to set org-level budgets and alerts. + + -async function callWithRetry(maxRetries = 3) { - for (let attempt = 0; attempt < maxRetries; attempt++) { - try { - return await client.chat.completions.create({ - model: "gpt-4o", - messages: [{ role: "user", content: "Hello" }], - }); - } catch (error) { - if (error instanceof RateLimitError && attempt < maxRetries - 1) { - // Exponential backoff: 1s, 2s, 4s - await new Promise((resolve) => - setTimeout(resolve, Math.pow(2, attempt) * 1000) - ); - continue; - } - throw error; + + +```python +client.org_configs.create( + org_id="your-org-id", + config={ + "budgets": { + "enabled": True, + "org_budget": { + "period": "monthly", + "limit": 500.00, + "action": "block", + } + } } +) +``` + + + + + +```typescript +await client.orgConfigs.create({ + orgId: "your-org-id", + config: { + budgets: { + enabled: true, + org_budget: { + period: "monthly", + limit: 500.00, + action: "block", + }, + }, + }, +}); +``` + + + + + +**Self-hosted config.yaml:** + +```yaml +budgets: + enabled: true + org_budget: + period: monthly + limit: 500.00 + action: block +``` + +When a budget is exceeded with `action: block`, new requests return: + +```json +{ + "error": { + "type": "budget_exceeded", + "code": "rate_limit_exceeded", + "message": 
"Organization monthly budget of $500.00 exceeded" } } +``` + +--- -const result = await callWithRetry(); -console.log(result); +## Managed key credits + +Managed keys have a USD credit balance that auto-deducts the cost of each request. When credits run out, requests are blocked. + +**Create a managed key with credits:** + +```bash +curl -X POST https://gateway.futureagi.com/-/keys \ + -H "Authorization: Bearer your-admin-token" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "customer-key", + "key_type": "managed", + "credit_balance": 25.00 + }' +``` + +**Add more credits:** + +```bash +curl -X POST "https://gateway.futureagi.com/-/keys/key_123/credits" \ + -H "Authorization: Bearer your-admin-token" \ + -H "Content-Type: application/json" \ + -d '{"amount": 50.00}' ``` - - -Use exponential backoff when retrying. Start with a short delay (1 second) and double it with each retry. This prevents overwhelming the gateway when multiple clients hit the limit simultaneously. - +The remaining balance is returned in the `x-prism-credits-remaining` response header on every request made with a managed key. -## Next steps +--- + +## Next Steps - - Monitor API costs and usage across your gateway + + See per-request cost breakdown and attribution + + + Configure per-key restrictions and RBAC + + + Control which provider handles each request - - Configure intelligent routing and failover strategies + + See where rate limiting fits in the pipeline diff --git a/src/pages/docs/prism/features/routing.mdx b/src/pages/docs/prism/features/routing.mdx index 2827bdeb..ae399b1a 100644 --- a/src/pages/docs/prism/features/routing.mdx +++ b/src/pages/docs/prism/features/routing.mdx @@ -5,7 +5,7 @@ description: "Configure load balancing, failover, retries, and circuit breaking ## About -Prism's routing layer distributes requests across multiple providers and models to maximize reliability and performance. 
If one provider is down or slow, traffic automatically shifts to healthy alternatives. This ensures your application stays responsive even when individual providers experience outages or rate limiting. +Prism's routing layer distributes requests across multiple providers and models for reliability and performance. If one provider is down or slow, traffic automatically shifts to healthy alternatives. This ensures your application stays responsive even when individual providers experience outages or rate limiting. --- @@ -77,25 +77,22 @@ These parameters appear in the JSON configuration blocks throughout this page. | Least Latency | `least-latency` | Routes to the fastest provider based on recent response times | | Cost Optimized | `cost-optimized` | Cheapest provider that supports the requested model | | Adaptive | `adaptive` | Dynamically adjusts weights based on real-time performance | -| Race | `fastest` | Sends to all providers simultaneously, returns the first response — you are billed for every call made, including those whose responses are discarded | +| Race | `fastest` | Sends to all providers simultaneously, returns the first response. You are billed for every call made, including those whose responses are discarded | --- ## Configuring a routing policy - + - ![Routing dashboard](/screenshot/product/prism/routing-dashboard.png) - - 1. Open Prism dashboard at https://app.futureagi.com/dashboard/gateway/routing - 2. Navigate to Routing - 3. Click Create Policy - 4. Enter name and optional description - 5. Select strategy - 6. Configure strategy-specific settings - 7. Click Save + + 1. Go to **Prism > Routing** in the Future AGI dashboard + 2. Click **Create Policy** + 3. Select a strategy and configure provider weights, failover, retries, etc. + 4. Click **Save** + - + ```python from prism import Prism @@ -124,7 +121,7 @@ These parameters appear in the JSON configuration blocks throughout this page. 
) ``` - + ```typescript import { Prism } from "@futureagi/prism"; @@ -199,9 +196,9 @@ Prism uses exponential backoff for retries. This means it waits progressively lo ## Circuit breaking -Think of a circuit breaker like a fuse box. When a provider starts failing repeatedly, the circuit "trips". Prism stops sending requests to that provider entirely and routes to healthy alternatives instead. After a recovery window, Prism tests the provider with a few trial requests. If those succeed, the circuit "closes" and normal routing resumes. This prevents a single failing provider from degrading your entire application. +Circuit breaking stops sending requests to a provider that is failing repeatedly. After a cooldown, Prism tests the provider with a few trial requests. If those succeed, normal routing resumes. This prevents a single failing provider from degrading your entire application. -Circuit breaking prevents cascading failures by stopping requests to a provider that is experiencing issues. The circuit breaker has three states: Closed (normal operation), Open (rejecting requests), and Half-Open (testing recovery). +The circuit breaker has three states: | State | Behavior | |-------|----------| @@ -244,7 +241,7 @@ Configure per-request and per-provider timeouts to prevent hanging requests. 
## Example: High-availability setup -This configuration combines weighted routing, failover, retries, and circuit breaking for a production-grade setup: +This configuration combines weighted routing, failover, retries, and circuit breaking for a production setup: ```json { @@ -375,7 +372,7 @@ routing: condition: field: "model" op: "$in" - value: ["gpt-4o", "claude-opus-4-20250514"] + value: ["gpt-4o", "claude-opus-4-6"] action: provider: "gemini" # Lower cost for long-context tasks @@ -420,24 +417,149 @@ Configure per-model fallback chains for automatic failover when a specific model routing: model_fallbacks: gpt-4o: - - claude-sonnet-4-20250514 + - claude-sonnet-4-6 - gemini-2.0-pro - claude-sonnet-4-20250514: + claude-sonnet-4-6: - gpt-4o - gemini-2.0-pro ``` -When `gpt-4o` fails, Prism automatically tries `claude-sonnet-4-20250514`, then `gemini-2.0-pro`. +When `gpt-4o` fails, Prism automatically tries `claude-sonnet-4-6`, then `gemini-2.0-pro`. + +--- + +## Complexity-based routing + +Route requests to different models based on prompt complexity. Prism scores each request on 8 signals and maps it to a tier. + +**Scoring signals:** + +| Signal | Default weight | What it measures | +|---|---|---| +| `token_count` | 0.15 | Total input tokens | +| `message_count` | 0.10 | Number of messages in the conversation | +| `system_prompt_length` | 0.10 | Length of the system prompt | +| `tool_count` | 0.15 | Number of tools/functions provided | +| `multimodal` | 0.15 | Whether the request contains images or audio | +| `keyword_heuristics` | 0.15 | Presence of reasoning keywords ("analyze", "step by step", "compare", etc.) | +| `structured_output` | 0.10 | Whether `response_format` is set | +| `max_tokens` | 0.10 | Requested output length | + +Each signal produces a 0-100 score. 
The weighted sum maps to a tier: + +```yaml +routing: + complexity: + enabled: true + default_tier: "moderate" + tiers: + simple: + max_score: 30 + model: "gpt-4o-mini" + provider: "openai" + moderate: + max_score: 70 + model: "gpt-4o" + provider: "openai" + complex: + max_score: 100 + model: "claude-sonnet-4-6" + provider: "anthropic" +``` + +A simple classification request scores low and routes to `gpt-4o-mini`. A multi-tool reasoning task scores high and routes to `claude-sonnet-4-6`. + +You can override the tier per request with the `x-prism-complexity-override` header. Pass the tier name (e.g., `simple`, `moderate`, `complex` - matching your configured tier names). + +--- + +## Provider lock (sticky routing) + +Force a request to a specific provider, bypassing the routing strategy. Useful for stateful workflows where you need consistency across multiple calls. + +Set it via the `x-prism-provider-lock` header or `provider_lock` in request metadata: + +```python +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + extra_headers={"x-prism-provider-lock": "openai"}, +) +``` + +Configure which providers can be locked to: + +```yaml +routing: + provider_lock: + enabled: true + allowed_providers: ["openai", "anthropic"] + deny_providers: ["groq"] # never lock to Groq +``` + +If `allowed_providers` is empty, all providers are allowed (except those in `deny_providers`). --- -## Next steps +## Adaptive strategy details + +The adaptive strategy learns from real traffic and adjusts weights over time: + +1. **Learning phase**: For the first N requests (default: 100), uses round-robin to gather baseline latency and error data from all providers. +2. **Active phase**: Computes per-provider weights every 30 seconds using latency (lower is better) and error rate (fewer errors is better). +3. **Weight smoothing**: New weights are blended with old weights using a smoothing factor (default: 0.3) to prevent wild swings. 
+4. **Minimum weight**: No provider drops below 5% weight, ensuring all providers stay in rotation. + +```yaml +routing: + default_strategy: "adaptive" + adaptive: + enabled: true + learning_requests: 100 + update_interval: 30s + smoothing_factor: 0.3 + min_weight: 0.05 + signal_weights: + latency: 0.5 + error_rate: 0.4 + # cost: 0.1 (parsed but not yet used in weight calculation) +``` + +--- + +## Race (fastest response) details + +The `fastest` strategy sends the same request to all eligible providers simultaneously and returns whichever responds first. The rest are cancelled. + +```yaml +routing: + default_strategy: "fastest" + fastest: + max_concurrent: 3 # limit parallel calls + cancel_delay: 50ms # wait before cancelling losers + excluded_providers: # skip these in the race + - "groq" +``` + + +You are billed by every provider that receives the request, not just the winner. Use this for latency-critical requests where cost is secondary. + + +--- + +## Next Steps - + Add and configure LLM providers for routing - - Reduce costs with response caching + + Reduce latency and cost with response caching + + + See where routing fits in the request pipeline + + + Add safety checks before and after routing diff --git a/src/pages/docs/prism/features/self-hosted-models.mdx b/src/pages/docs/prism/features/self-hosted-models.mdx new file mode 100644 index 00000000..0a950041 --- /dev/null +++ b/src/pages/docs/prism/features/self-hosted-models.mdx @@ -0,0 +1,195 @@ +--- +title: "Self-hosted models" +description: "Connect Prism to locally-running models via Ollama, vLLM, LM Studio, and other OpenAI-compatible servers." +--- + +## About + +Prism can route requests to models running on your own hardware alongside cloud providers. Self-hosted models are configured as providers with a `base_url` pointing to your local inference server. All gateway features (routing, caching, failover, guardrails) work the same way. 
+ +--- + +## Supported inference servers + +| Server | `type` value | Notes | +|---|---|---| +| [Ollama](https://ollama.com) | `ollama` | Auto-discovers models. No model list needed. | +| [vLLM](https://docs.vllm.ai) | `vllm` | OpenAI-compatible server for production inference | +| [LM Studio](https://lmstudio.ai) | `lm_studio` | Desktop app with local server mode | +| Any OpenAI-compatible server | (omit type) | Set `api_format: "openai"` and `base_url` | + +--- + +## Configuration + +### Ollama + +```yaml +providers: + ollama: + base_url: "http://localhost:11434" + type: "ollama" + # Models are auto-discovered from Ollama's /v1/models endpoint +``` + +Ollama auto-discovers all pulled models. After pulling a model (`ollama pull llama3.1`), it's immediately available through Prism. + +### vLLM + +```yaml +providers: + vllm: + base_url: "http://gpu-server:8000" + type: "vllm" + api_format: "openai" + models: + - "meta-llama/Llama-3.1-70B-Instruct" +``` + +### LM Studio + +```yaml +providers: + lm-studio: + base_url: "http://localhost:1234" + type: "lm_studio" + api_format: "openai" +``` + +### Generic OpenAI-compatible server + +Any server that implements the `/v1/chat/completions` endpoint: + +```yaml +providers: + my-server: + base_url: "http://inference.internal:8080" + api_format: "openai" + models: + - "my-custom-model" +``` + +--- + +## Hybrid routing + +The main value of self-hosted models through Prism is hybrid routing: use cheap local models for simple requests and fall back to cloud providers for complex ones. 
+ +### Cost-based routing + +Route to the cheapest option first: + +```yaml +routing: + default_strategy: "cost-optimized" + +providers: + ollama: + base_url: "http://localhost:11434" + type: "ollama" + + openai: + api_key: "${OPENAI_API_KEY}" + api_format: "openai" + models: ["gpt-4o", "gpt-4o-mini"] +``` + +### Failover from local to cloud + +Use local models as the primary, with cloud as a backup: + +```yaml +routing: + failover: + enabled: true + providers: ["ollama", "openai"] + failover_on: [429, 500, 502, 503, 504] + +providers: + ollama: + base_url: "http://localhost:11434" + type: "ollama" + + openai: + api_key: "${OPENAI_API_KEY}" + api_format: "openai" + models: ["gpt-4o"] +``` + +If Ollama is down or overloaded, requests automatically route to OpenAI. + +### Complexity-based routing + +Route simple queries to a local model and complex queries to a cloud model: + +```yaml +routing: + complexity: + enabled: true + tiers: + simple: + max_score: 30 + model: "llama3.1" + provider: "ollama" + complex: + max_score: 100 + model: "gpt-4o" + provider: "openai" +``` + +See [Routing > Complexity-based routing](/docs/prism/features/routing#complexity-based-routing) for the full scoring system. 
+ +--- + +## Using self-hosted models from code + +Once configured, self-hosted models are used the same way as cloud models: + +```python +from prism import Prism + +client = Prism( + api_key="sk-prism-your-key", + base_url="http://localhost:8080", # your self-hosted Prism gateway +) + +# Route to Ollama +response = client.chat.completions.create( + model="llama3.1", + messages=[{"role": "user", "content": "Hello"}], +) + +# Or pin to a specific provider +response = client.chat.completions.create( + model="llama3.1", + messages=[{"role": "user", "content": "Hello"}], + extra_headers={"x-prism-provider-lock": "ollama"}, +) +``` + +--- + +## Limitations + +- Self-hosted models don't support the Assistants API (threads are stored on OpenAI's servers) +- Embedding endpoints require the inference server to implement `/v1/embeddings` +- Cost tracking uses configured pricing. Set custom pricing for self-hosted models in the provider config, or costs will show as $0. + +--- + +## Next Steps + + + + Deploy the Prism gateway on your infrastructure + + + Configure hybrid routing strategies + + + Cloud and self-hosted provider list + + + Full config reference + + diff --git a/src/pages/docs/prism/features/shadow-experiments.mdx b/src/pages/docs/prism/features/shadow-experiments.mdx index 4d9c49d6..1959ff4a 100644 --- a/src/pages/docs/prism/features/shadow-experiments.mdx +++ b/src/pages/docs/prism/features/shadow-experiments.mdx @@ -1,5 +1,5 @@ --- -title: "Shadow Experiments" +title: "Shadow experiments" description: "Mirror a percentage of production LLM traffic to alternative models for zero-risk evaluation." --- @@ -29,9 +29,11 @@ When you enable shadow experiments: The user never waits for the shadow model. If the shadow call fails or times out, it doesn't affect the primary response. 
-## Configuring via SDK (per-request) +## Configuration -You can enable shadow experiments on a per-request basis by passing a `GatewayConfig` with `TrafficMirrorConfig` to the Prism client. +### Per-request (SDK) + +Pass a `GatewayConfig` with `TrafficMirrorConfig` to enable mirroring: ```python Python @@ -42,7 +44,7 @@ client = Prism( base_url="https://gateway.futureagi.com", config=GatewayConfig( mirror=TrafficMirrorConfig( - target_model="claude-sonnet-4-20250514", + target_model="claude-sonnet-4-6", target_provider="anthropic", sample_rate=0.1, # Mirror 10% of traffic ) @@ -65,7 +67,7 @@ const client = new Prism({ baseUrl: "https://gateway.futureagi.com", config: { mirror: { - target_model: "claude-sonnet-4-20250514", + target_model: "claude-sonnet-4-6", target_provider: "anthropic", sample_rate: 0.1, // Mirror 10% of traffic }, @@ -82,14 +84,14 @@ console.log(response.choices[0].message.content); ### Configuration options -- **`target_model`**: The model to mirror traffic to (e.g., `"claude-sonnet-4-20250514"`) +- **`target_model`**: The model to mirror traffic to (e.g., `"claude-sonnet-4-6"`) - **`target_provider`**: The provider of the shadow model (e.g., `"anthropic"`, `"openai"`) - **`sample_rate`**: Float between 0.0 and 1.0. 
`0.1` mirrors 10% of traffic, `1.0` mirrors 100% - **`enabled`**: Set to `false` to disable mirroring (defaults to `true`) -## Configuring in config.yaml (gateway-level) +### Gateway-level (config.yaml) -For persistent gateway-level configuration, add a `routing.mirror` section to your `config.yaml`: +For persistent configuration, add a `routing.mirror` section to `config.yaml`: ```yaml routing: @@ -98,12 +100,12 @@ routing: rules: - source_model: "gpt-4o" target_provider: "anthropic" - target_model: "claude-sonnet-4-20250514" + target_model: "claude-sonnet-4-6" sample_rate: 0.1 # Mirror 10% of gpt-4o traffic - source_model: "gpt-4-turbo" target_provider: "anthropic" - target_model: "claude-opus-4-20250514" + target_model: "claude-opus-4-6" sample_rate: 0.05 # Mirror 5% of gpt-4-turbo traffic - source_model: "*" # Wildcard: mirror ALL models @@ -113,6 +115,8 @@ routing: Use `"*"` as the `source_model` to mirror all requests regardless of the primary model. Rules are evaluated in order, so place more specific rules before wildcard rules. +--- + ## Collected data Each mirrored request produces a shadow result with the following fields: @@ -122,7 +126,7 @@ Each mirrored request produces a shadow result with the following fields: "request_id": "req_abc123", "experiment_id": "exp_xyz", "source_model": "gpt-4o", - "shadow_model": "claude-sonnet-4-20250514", + "shadow_model": "claude-sonnet-4-6", "source_response": "The capital of France is Paris.", "shadow_response": "Paris is the capital of France.", "source_latency_ms": 450, @@ -161,21 +165,23 @@ Each mirrored request produces a shadow result with the following fields: Shadow results appear in the Future AGI dashboard after periodic sync. Direct API access to results is not currently available. 
-## Important notes +--- + +## Limitations -- **Non-streaming mirrors**: Shadow copies are always sent as non-streaming requests, even if the original request was streaming -- **Billing**: You are billed for shadow calls to the target provider at standard rates -- **Sample rate format**: `sample_rate` is a float from 0.0 to 1.0 (not a percentage). Use `0.1` for 10%, `0.5` for 50%, `1.0` for 100% -- **Timeout**: Shadow calls have a 30-second timeout. If the shadow model doesn't respond within this window, the call is abandoned and an error is recorded -- **No user impact**: Shadow failures never affect the primary response or user experience +- Shadow copies are always non-streaming, even if the original request was streaming +- You are billed for shadow calls at standard provider rates +- `sample_rate` is a float from 0.0 to 1.0 (not a percentage). `0.1` = 10%, `1.0` = 100% +- Shadow calls have a 30-second timeout. Timeouts are recorded as errors but don't affect the primary response +- Shadow failures never affect the user-facing response -## Next steps +## Next Steps - - Learn how to configure request routing and failover strategies + + Routing strategies and failover configuration - - Monitor and analyze costs across multiple models and providers + + Monitor costs across models and providers diff --git a/src/pages/docs/prism/features/streaming.mdx b/src/pages/docs/prism/features/streaming.mdx index 425d55d4..27bf846f 100644 --- a/src/pages/docs/prism/features/streaming.mdx +++ b/src/pages/docs/prism/features/streaming.mdx @@ -216,7 +216,7 @@ Your application receives identical SSE events regardless of the underlying prov --- -## Next steps +## Next Steps diff --git a/src/pages/docs/prism/guides/errors.mdx b/src/pages/docs/prism/guides/errors.mdx new file mode 100644 index 00000000..d2e7bd7e --- /dev/null +++ b/src/pages/docs/prism/guides/errors.mdx @@ -0,0 +1,332 @@ +--- +title: "Error handling" +description: "Error response format, HTTP status codes, and retry 
strategies for the Prism Gateway." +--- + +## About + +All Prism errors follow a consistent JSON format with machine-readable codes. This page covers the error structure, HTTP status codes, and retry strategies. + +--- + +## Error format + +All errors from Prism follow the same JSON structure: + +```json +{ + "error": { + "message": "Human-readable description of what went wrong", + "type": "error_category", + "param": null, + "code": "machine_readable_code" + } +} +``` + +The `type` field groups errors into categories. The `code` field identifies the specific error. Use `code` for programmatic error handling. + +--- + +## HTTP status codes + +### Client errors (4xx) + +| Status | Code | Meaning | +|---|---|---| +| 400 | `invalid_json` | Request body is not valid JSON | +| 400 | `missing_model` | `model` field is missing from the request | +| 400 | `missing_messages` | `messages` field is missing or empty | +| 400 | `invalid_request_error` | Other request validation failures | +| 401 | `unauthorized` | API key is missing or invalid | +| 403 | `content_blocked` | A guardrail blocked the request (enforce mode) | +| 404 | `model_not_found` | Model not configured for any provider. Check `model_map` or use `provider/model` format. | +| 429 | `rate_limit_exceeded` | Per-key or per-org rate limit exceeded | +| 429 | `budget_exceeded` | Organization budget limit reached | + +### Server errors (5xx) + +| Status | Code | Meaning | +|---|---|---| +| 500 | `internal_error` | Unexpected gateway error | +| 501 | `not_supported` | Provider doesn't support this endpoint (e.g. 
embeddings on a chat-only provider) |
+| 502 | `provider_error` | Provider returned an error |
+| 502 | `provider_404` | Provider returned 404 (usually wrong API key or model access) |
+| 502 | `upstream_error` | Generic upstream provider failure |
+| 503 | `service_unavailable` | Gateway is overloaded or shutting down |
+| 504 | `timeout` | Request timed out waiting for provider response |
+
+---
+
+## Common errors and fixes
+
+### Model not found (404)
+
+```json
+{
+  "error": {
+    "message": "model \"gpt-4o\" not found in any configured provider. Configure model_map or use 'provider/model' format.",
+    "type": "not_found",
+    "code": "model_not_found"
+  }
+}
+```
+
+**Causes:**
+- The model isn't enabled for your organization's providers
+- Typo in the model name
+- Using a model alias without configuring `model_map`
+
+**Fixes:**
+- Check available models: `GET /v1/models`
+- Configure a [model map](/docs/prism/concepts/configuration#model-mapping)
+- Use the `provider/model` format: `"openai/gpt-4o"`
+
+### Rate limit exceeded (429)
+
+```json
+{
+  "error": {
+    "message": "Rate limit exceeded. Please retry after the window resets.",
+    "type": "rate_limit_error",
+    "param": null,
+    "code": "rate_limit_exceeded"
+  }
+}
+```
+
+Check the `x-ratelimit-remaining-requests` and `x-ratelimit-reset-requests` response headers to know when to retry. See [retry strategies](#retry-strategies) below.
+
+### Budget exceeded (429)
+
+```json
+{
+  "error": {
+    "message": "Organization monthly budget of $500.00 exceeded",
+    "type": "budget_error",
+    "param": null,
+    "code": "budget_exceeded"
+  }
+}
+```
+
+Budget resets at the start of the next period (daily/weekly/monthly). Increase the budget in [Rate limiting & budgets](/docs/prism/features/rate-limiting) or wait for the reset. 
+ +### Guardrail blocked (403) + +```json +{ + "error": { + "type": "guardrail_triggered", + "code": "content_blocked", + "message": "Request blocked by guardrail: pii-detector" + } +} +``` + +The request or response triggered a guardrail in enforce mode. Check the `x-prism-guardrail-triggered` response header. See [Guardrails](/docs/prism/features/guardrails) for configuration. + +### Provider error (502) + +```json +{ + "error": { + "message": "provider error (HTTP 404): ", + "type": "upstream_error", + "code": "provider_404" + } +} +``` + +The gateway reached the provider but got an error back. Common causes: +- Provider API key is invalid or expired +- Project-scoped key doesn't have model access enabled +- Provider is experiencing an outage + +Configure [failover](/docs/prism/features/routing#failover) to automatically route to backup providers when this happens. + +--- + +## Retry strategies + +### Exponential backoff + +The standard pattern for handling transient errors (429, 5xx): + + + + + +The Prism SDK retries automatically when you configure `RetryConfig`: + +```python +from prism import Prism, GatewayConfig, RetryConfig + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + config=GatewayConfig( + retry=RetryConfig( + max_retries=3, + on_status_codes=[429, 500, 502, 503, 504], + backoff_factor=0.5, + ), + ), +) + +# Retries happen automatically on configured status codes +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], +) +``` + + + + + +The OpenAI SDK has built-in retry logic with exponential backoff: + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", + max_retries=3, # built-in retry with backoff +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], +) +``` + + + + + +```python +import time +import 
requests + +def call_with_retry(max_attempts=4): + for attempt in range(max_attempts): + response = requests.post( + "https://gateway.futureagi.com/v1/chat/completions", + headers={ + "Authorization": "Bearer sk-prism-your-key", + "Content-Type": "application/json", + }, + json={ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Hello"}], + }, + ) + + if response.status_code == 200: + return response.json() + + if response.status_code in (429, 500, 502, 503, 504): + if attempt < max_attempts - 1: + wait = min(2 ** attempt, 30) # 1s, 2s, 4s, capped at 30s + print(f"Attempt {attempt + 1} failed ({response.status_code}), retrying in {wait}s") + time.sleep(wait) + continue + + # Non-retryable error or final attempt + response.raise_for_status() + + raise Exception(f"Failed after {max_attempts} attempts") +``` + + + + + +### What to retry + +| Status | Retry? | Why | +|---|---|---| +| 400 | No | Bad request, fix the input | +| 401 | No | Bad credentials, fix the API key | +| 403 | No | Blocked by guardrail or RBAC | +| 404 | No | Model not found, fix the model name | +| 429 | Yes | Rate limit, back off and retry | +| 500 | Yes | Internal error, may be transient | +| 502 | Yes | Provider error, may recover | +| 503 | Yes | Service unavailable, may recover | +| 504 | Yes | Timeout, may succeed on retry | + +### Using failover instead of retry + +For production systems, configure [routing with failover](/docs/prism/features/routing#failover) instead of client-side retries. Prism automatically routes to the next provider on failure, which is faster than waiting and retrying the same provider. 
+ +--- + +## Error handling in SDKs + +### Prism SDK exceptions + +```python +from prism import Prism, APIStatusError, RateLimitError, AuthenticationError + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", +) + +try: + response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + ) +except RateLimitError: + print("Rate limited, back off and retry") +except AuthenticationError: + print("Bad API key") +except APIStatusError as e: + print(f"API error {e.status_code}: {e.message}") +``` + +### OpenAI SDK exceptions + +```python +from openai import OpenAI, RateLimitError, AuthenticationError, APIError + +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-key", +) + +try: + response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}], + ) +except RateLimitError: + print("Rate limited") +except AuthenticationError: + print("Bad API key") +except APIError as e: + print(f"API error {e.status_code}: {e.message}") +``` + +--- + +## Next Steps + + + + Debug common issues step by step + + + Automatic provider failover on errors + + + Configure rate limits and budgets + + + Debug headers for request correlation + + diff --git a/src/pages/docs/prism/guides/troubleshooting.mdx b/src/pages/docs/prism/guides/troubleshooting.mdx new file mode 100644 index 00000000..e6f6aa7d --- /dev/null +++ b/src/pages/docs/prism/guides/troubleshooting.mdx @@ -0,0 +1,196 @@ +--- +title: "Troubleshooting" +description: "Step-by-step solutions for common Prism Gateway issues." +--- + +## About + +Common issues and how to diagnose them when requests through Prism fail. + +--- + +## Debug checklist + +When something isn't working, start here: + +1. Check the `x-prism-request-id` response header and search for it in your logs +2. Check `x-prism-provider` to confirm which provider handled the request +3. 
Check `x-prism-model-used` to confirm the actual model (may differ from requested if routing changed it) +4. Compare `x-prism-latency-ms` against your expected latency +5. Check `x-prism-cost` to verify pricing is as expected + +Use `curl -i` to see all response headers: + +```bash +curl -i https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]}' +``` + +--- + +## Common issues + +### "model not found" but the model exists + +**Symptom:** 404 with `model_not_found` even though the model appears in `GET /v1/models`. + +**Quick fix:** Try the `provider/model` format to bypass model resolution: + +```bash +# Check available models +curl https://gateway.futureagi.com/v1/models \ + -H "Authorization: Bearer sk-prism-your-key" | jq '.data[].id' + +# Use explicit provider prefix +curl https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-key" \ + -H "Content-Type: application/json" \ + -d '{"model": "openai/gpt-4o", "messages": [{"role": "user", "content": "hi"}]}' +``` + +If that works, set up a [model map](/docs/prism/concepts/configuration#model-mapping). See [Error handling](/docs/prism/guides/errors#model-not-found-404) for all causes. + +### Provider returns 404 upstream + +**Symptom:** 502 with `provider_404`. + +The gateway reached the provider, but the provider rejected the request. Most common cause: the provider API key is invalid or doesn't have access to the model. For OpenAI project-scoped keys (`sk-proj-...`), enable models in Project Settings > Model access. + +See [Error handling](/docs/prism/guides/errors#provider-error-502) for details. + +### Responses are slow + +**Symptom:** High `x-prism-latency-ms` values. + +**Possible causes:** +1. **Provider latency**: Check if the provider itself is slow. Compare `x-prism-latency-ms` with direct provider calls. +2. 
**No caching**: Repeated identical requests hit the provider every time. Enable [caching](/docs/prism/features/caching). +3. **Wrong routing strategy**: `least-latency` routing picks the fastest provider automatically. See [routing](/docs/prism/features/routing). +4. **Large prompts**: Token count affects latency. Check `usage.prompt_tokens` in the response. +5. **Guardrail overhead**: Pre-request guardrails add latency. Check if guardrails are processing-heavy. + +### Cache isn't working + +**Symptom:** `x-prism-cache` always shows `miss` or doesn't appear. + +**Checklist:** +- Is caching enabled? Check your org config or `GatewayConfig`. +- Are you sending streaming requests? Streaming bypasses cache entirely. +- Are the requests identical? Exact-match cache requires identical model, messages, temperature, and all parameters. +- Is the TTL too short? Requests may expire before the next identical request arrives. +- Are you using different cache namespaces? Each namespace is isolated. + +```python +# Force a cache test: send the same non-streaming request twice +from prism import Prism, GatewayConfig, CacheConfig + +client = Prism( + api_key="sk-prism-your-key", + base_url="https://gateway.futureagi.com", + config=GatewayConfig(cache=CacheConfig(enabled=True, strategy="exact", ttl=300)), +) + +# First call +r1 = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "What is 2+2?"}], +) +print(f"Call 1 cache: {r1.prism.cache_status}") # miss or None + +# Second call (same input) +r2 = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "What is 2+2?"}], +) +print(f"Call 2 cache: {r2.prism.cache_status}") # hit_exact +``` + +### Guardrails blocking legitimate requests + +**Symptom:** 403 with `content_blocked` on requests that should be allowed. 
+
+**Diagnosis:**
+- Check which guardrail fired: the error message includes the guardrail name
+- Check `x-prism-guardrail-triggered: true` in the response headers
+- Switch the guardrail from `enforce` to `log` mode temporarily to see what's being flagged without blocking
+
+See [Guardrails](/docs/prism/features/guardrails) for configuration options including fail-open behavior.
+
+### Rate limits hit unexpectedly
+
+**Symptom:** 429 errors before you expect to hit limits.
+
+**Check the response headers:**
+```
+x-ratelimit-limit-requests: 100
+x-ratelimit-remaining-requests: 0
+x-ratelimit-reset-requests: 1714000000
+```
+
+**Common causes:**
+- Per-key limits are lower than per-org limits. The most restrictive limit applies.
+- Multiple services share the same API key
+- Burst traffic from retries (each retry counts against the limit)
+
+**Fix:** Increase limits in [Rate limiting](/docs/prism/features/rate-limiting), use separate keys per service, or add backoff to retry logic.
+
+### Cost is higher than expected
+
+**Diagnosis:**
+1. Check `x-prism-cost` on individual requests to find expensive calls
+2. Use metadata tagging to identify which team/feature is driving costs:
+   ```python
+   response = client.chat.completions.create(
+       model="gpt-4o",
+       messages=[{"role": "user", "content": "Hello"}],
+       request_metadata={"team": "search", "feature": "autocomplete"},
+   )
+   ```
+3. Check the analytics dashboard for cost-by-model breakdown
+4. Look for missing cache hits on repeated queries
+5. Check if the `fastest` (race) routing strategy is enabled (bills all providers, not just the winner)
+
+See [Cost tracking](/docs/prism/features/cost-tracking) for attribution and budgets.
+
+### Failover isn't working
+
+**Symptom:** Requests fail with provider errors but don't route to backup providers.
+
+**Checklist:**
+- Is failover enabled in your routing config?
+- Does `failover_on` include the status code you're seeing? 
(Default: `[429, 500, 502, 503, 504]`) +- Are backup providers configured with valid credentials? +- Check `x-prism-fallback-used: true` to confirm failover happened (or didn't) +- Check `x-prism-provider` to see which provider ultimately handled the request + +--- + +## Getting help + +If you can't resolve the issue: + +1. Collect the `x-prism-request-id` from the failing request +2. Note the timestamp and error message +3. Check the [Error handling](/docs/prism/guides/errors) guide for the specific error code +4. Contact support with the request ID - it links to the full request/response log on our end + +--- + +## Next Steps + + + + Error codes, retry strategies, and SDK exceptions + + + All debug headers for request correlation + + + Configure automatic failover + + + Configuration hierarchy and overrides + + diff --git a/src/pages/docs/prism/quickstart.mdx b/src/pages/docs/prism/quickstart.mdx index a38a599f..5992ac1a 100644 --- a/src/pages/docs/prism/quickstart.mdx +++ b/src/pages/docs/prism/quickstart.mdx @@ -1,364 +1,286 @@ --- title: "Quickstart" -description: "Make your first LLM request through Prism in under 5 minutes" +description: "Make your first LLM request through Prism in under 5 minutes." --- ## About -Get your first LLM request through Prism in under 5 minutes. Prism is Future AGI's AI Gateway, a proxy layer between your application and LLM providers. It provides a single API that handles routing across 100+ providers, enforces safety guardrails, caches responses, tracks costs, and delivers full observability. +Point your existing OpenAI SDK at Prism by changing two lines: `base_url` and `api_key`. All providers work through the same API. No new SDK required. - -**Already using the OpenAI SDK?** You can skip the Prism SDK installation. Point your existing client's `base_url` at `https://gateway.futureagi.com` and swap your API key. All providers work through the same OpenAI-format API. - +## Prerequisites + +1. 
**Future AGI account** - sign up at [app.futureagi.com](https://app.futureagi.com) +2. **Prism API key** - found in your dashboard under **Settings > API Keys**. Keys start with `sk-prism-`. +3. **At least one provider configured** - add a provider (OpenAI, Anthropic, Google, etc.) in [Prism > Providers](/docs/prism/features/providers) --- -## Prerequisites + -Before you begin: + -1. **Future AGI account**: Sign up at [app.futureagi.com](https://app.futureagi.com) if you do not have one. -2. **Prism API key**: Available in your dashboard under **Settings → API Keys**. Prism keys start with `sk-prism-`. -3. **At least one provider configured**: If you have not added a provider yet, see [Manage Providers](/docs/prism/features/providers). You will need your own API key for the provider you want to use (e.g., an OpenAI key). +If you already use the OpenAI SDK, change two lines and you're done: ---- + - - - Install the Prism SDK for your language: - - - ```bash - pip install prism-ai - ``` - - ```bash - npm install @futureagi/prism - ``` - - - - - Set your API key and gateway URL as environment variables: - - ```bash - export PRISM_API_KEY=sk-prism-your-api-key-here - export PRISM_BASE_URL=https://gateway.futureagi.com - ``` - - Your API key starts with `sk-prism-` and is available in your Prism dashboard under **Settings → API Keys**. 
- - - - Send a chat completion request to any supported provider: - - - ```python - from prism import Prism - - client = Prism( - api_key="sk-prism-your-api-key-here", - base_url="https://gateway.futureagi.com" - ) - - response = client.chat.completions.create( - model="gpt-4o-mini", - messages=[ - {"role": "user", "content": "What is the capital of France?"} - ] - ) - - print(response.choices[0].message.content) - # Output: Paris - ``` - - ```typescript - import { Prism } from '@futureagi/prism'; - - const client = new Prism({ - apiKey: 'sk-prism-your-api-key-here', - baseUrl: 'https://gateway.futureagi.com' - }); - - const response = await client.chat.completions.create({ - model: 'gpt-4o-mini', - messages: [ - { role: 'user', content: 'What is the capital of France?' } - ] - }); - - console.log(response.choices[0].message.content); - // Output: Paris - ``` - - ```bash - curl -X POST https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-your-api-key-here" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "messages": [ - {"role": "user", "content": "What is the capital of France?"} - ] - }' - ``` - - - For the cURL request, you should see a JSON response like: - - ```json - { - "choices": [{ - "message": { "role": "assistant", "content": "Paris" }, - "finish_reason": "stop" - }], - "model": "gpt-4o-mini" - } - ``` - - - - Every Prism response includes metadata headers that tell you what happened: which provider handled the request, how long it took, what it cost, and whether the cache was used. This is useful for debugging and cost monitoring. 
- - - ```python - response = client.chat.completions.create( - model="gpt-4o-mini", - messages=[ - {"role": "user", "content": "What is the capital of France?"} - ] - ) - - # Access response metadata - print(f"Request ID: {response.headers.get('X-Prism-Request-Id')}") - print(f"Provider: {response.headers.get('X-Prism-Provider')}") - print(f"Latency: {response.headers.get('X-Prism-Latency-Ms')}ms") - print(f"Cost: ${response.headers.get('X-Prism-Cost')}") - print(f"Cache: {response.headers.get('X-Prism-Cache')}") - ``` - - ```typescript - const response = await client.chat.completions.create({ - model: 'gpt-4o-mini', - messages: [ - { role: 'user', content: 'What is the capital of France?' } - ] - }); - - // Access response metadata - console.log(`Request ID: ${response.headers.get('X-Prism-Request-Id')}`); - console.log(`Provider: ${response.headers.get('X-Prism-Provider')}`); - console.log(`Latency: ${response.headers.get('X-Prism-Latency-Ms')}ms`); - console.log(`Cost: $${response.headers.get('X-Prism-Cost')}`); - console.log(`Cache: ${response.headers.get('X-Prism-Cache')}`); - ``` - - ```bash - curl -X POST https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-your-api-key-here" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "messages": [ - {"role": "user", "content": "What is the capital of France?"} - ] - }' -i | grep -i "x-prism" - ``` - - - A typical response looks like: - - ``` - X-Prism-Request-Id: 01HX4K9QZJP7YXMN3T8WVFR2C - X-Prism-Provider: openai - X-Prism-Model-Used: gpt-4o-mini - X-Prism-Latency-Ms: 423 - X-Prism-Cost: 0.000045 - X-Prism-Cache: miss - ``` - - - - Stream responses in real time for better user experience: - - - ```python - with client.chat.completions.stream( - model="gpt-4o-mini", - messages=[ - {"role": "user", "content": "Write a short poem about AI"} - ] - ) as stream: - for text in stream.text_stream: - print(text, end="", flush=True) - ``` - - ```typescript - 
const stream = await client.chat.completions.stream({ - model: 'gpt-4o-mini', - messages: [ - { role: 'user', content: 'Write a short poem about AI' } - ] - }); - - for await (const chunk of stream) { - process.stdout.write(chunk.choices[0]?.delta?.content || ''); - } - ``` - - ```bash - curl -X POST https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-your-api-key-here" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "messages": [ - {"role": "user", "content": "Write a short poem about AI"} - ], - "stream": true - }' - ``` - - - - - Use the same API with different providers by changing the model name. Prism handles translation. Your code stays identical. - - - ```python - # OpenAI - response = client.chat.completions.create( - model="gpt-4o-mini", - messages=[{"role": "user", "content": "Hello"}] - ) - - # Anthropic - response = client.chat.completions.create( - model="anthropic/claude-haiku-4-5", - messages=[{"role": "user", "content": "Hello"}] - ) - - # Google Gemini - response = client.chat.completions.create( - model="gemini/gemini-2.0-flash", - messages=[{"role": "user", "content": "Hello"}] - ) - ``` - - ```typescript - // OpenAI - const response1 = await client.chat.completions.create({ - model: 'gpt-4o-mini', - messages: [{ role: 'user', content: 'Hello' }] - }); - - // Anthropic - const response2 = await client.chat.completions.create({ - model: 'anthropic/claude-haiku-4-5', - messages: [{ role: 'user', content: 'Hello' }] - }); - - // Google Gemini - const response3 = await client.chat.completions.create({ - model: 'gemini/gemini-2.0-flash', - messages: [{ role: 'user', content: 'Hello' }] - }); - ``` - - ```bash - # OpenAI - curl -X POST https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-your-api-key-here" \ - -H "Content-Type: application/json" \ - -d '{"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "Hello"}]}' - - # Anthropic - 
curl -X POST https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-your-api-key-here" \ - -H "Content-Type: application/json" \ - -d '{"model": "anthropic/claude-haiku-4-5", "messages": [{"role": "user", "content": "Hello"}]}' - - # Google Gemini - curl -X POST https://gateway.futureagi.com/v1/chat/completions \ - -H "Authorization: Bearer sk-prism-your-api-key-here" \ - -H "Content-Type: application/json" \ - -d '{"model": "gemini/gemini-2.0-flash", "messages": [{"role": "user", "content": "Hello"}]}' - ``` - - - + ---- +```bash +pip install prism-ai +``` -## Error responses +```python +from prism import Prism -Understanding common errors helps you handle them in your application. +client = Prism( + api_key="sk-prism-your-api-key-here", + base_url="https://gateway.futureagi.com", +) -**Guardrail blocked (403)** +response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "What is the capital of France?"}], +) -When a guardrail is set to Enforce mode and triggers on a request, Prism returns 403 before the LLM is ever called: +print(response.choices[0].message.content) +# Output: Paris +``` + + + + + +```python +from openai import OpenAI + +# Already using OpenAI? 
Just swap base_url and api_key +client = OpenAI( + base_url="https://gateway.futureagi.com/v1", + api_key="sk-prism-your-api-key-here", +) -```json -{ - "error": { - "type": "guardrail_triggered", - "code": "forbidden", - "message": "Request blocked by guardrail: pii-detector", - "guardrail": "pii-detector" - } -} +response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "What is the capital of France?"}], +) + +print(response.choices[0].message.content) +# Output: Paris ``` -**Budget exceeded (429)** + + + -When your organization's spending limit is reached, new requests are blocked until the next billing period: +```python +import litellm -```json -{ - "error": { - "type": "budget_exceeded", - "code": "rate_limit_exceeded", - "message": "Organization monthly budget of $100.00 exceeded" - } -} +response = litellm.completion( + model="openai/gpt-4o-mini", + messages=[{"role": "user", "content": "What is the capital of France?"}], + api_key="sk-prism-your-api-key-here", + base_url="https://gateway.futureagi.com/v1", +) + +print(response.choices[0].message.content) +# Output: Paris ``` -**Provider unavailable (502)** + -When the selected provider is down or unreachable and no failover is configured: + -```json -{ - "error": { - "type": "provider_error", - "code": "bad_gateway", - "message": "Provider openai returned 503: Service Unavailable" - } -} +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-api-key-here" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] + }' ``` - -To avoid provider failures affecting your users, configure [routing with failover](/docs/prism/features/routing) so Prism automatically retries with a backup provider. - + + + + +That's it. Your existing code works with Prism. 
Every request now gets routing, caching, guardrails, and cost tracking automatically. + + + + + +Prism adds metadata to every response so you can see what happened. Using the client from Step 1: + +```python +# Using the OpenAI SDK client from Step 1 +response = client.chat.completions.with_raw_response.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Hello"}], +) + +print(f"Provider: {response.headers.get('x-prism-provider')}") +print(f"Latency: {response.headers.get('x-prism-latency-ms')}ms") +print(f"Cost: ${response.headers.get('x-prism-cost')}") +print(f"Cache: {response.headers.get('x-prism-cache')}") +print(f"Model: {response.headers.get('x-prism-model-used')}") + +# Parse the actual response +completion = response.parse() +print(f"Response: {completion.choices[0].message.content}") +``` + +Example output: + +``` +Provider: openai +Latency: 423ms +Cost: $0.000045 +Cache: miss +Model: gpt-4o-mini +Response: Hello! How can I help you today? +``` + + + + + +Change the model name to route to a different provider. Using the same client from Step 1: + +```python +# OpenAI +response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Hello"}] +) + +# Anthropic +response = client.chat.completions.create( + model="claude-sonnet-4-6", + messages=[{"role": "user", "content": "Hello"}] +) + +# Google Gemini +response = client.chat.completions.create( + model="gemini-2.0-flash", + messages=[{"role": "user", "content": "Hello"}] +) +``` + +Prism translates the request to each provider's native format. Your code doesn't change. 
+ + + + + +Stream responses to show output as it arrives: + + + + + +```python +stream = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Write a short poem about AI"}], + stream=True, +) + +for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + + + + + +```python +stream = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Write a short poem about AI"}], + stream=True, +) + +for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + + + + + +```python +import litellm + +stream = litellm.completion( + model="openai/gpt-4o-mini", + messages=[{"role": "user", "content": "Write a short poem about AI"}], + api_key="sk-prism-your-api-key-here", + base_url="https://gateway.futureagi.com/v1", + stream=True, +) + +for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + + + + + +```bash +curl -X POST https://gateway.futureagi.com/v1/chat/completions \ + -H "Authorization: Bearer sk-prism-your-api-key-here" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Write a short poem about AI"} + ], + "stream": true + }' +``` + + + + + + + + + +--- + +## Using a framework? + +Prism works with any OpenAI-compatible client. If you use LangChain, LlamaIndex, or any other framework that supports custom base URLs, just point it at `https://gateway.futureagi.com/v1` with your Prism key. 
--- -## Next steps +## Next steps - - Learn the fundamental concepts behind Prism + + Understand the request pipeline and plugin architecture + + + Add and configure LLM providers - - Configure and route across multiple LLM providers + + Add safety checks to requests and responses - - Add safety policies and content moderation + + Set up load balancing and failover - - Set up load balancing and failover strategies + + Full endpoint reference with function calling and vision - - Request headers, response headers, and supported endpoints + + See every API endpoint available