From 3e152cfac863faf08808a55c410857bcfa5e5441 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 05:16:04 +0000 Subject: [PATCH 01/22] docs(openai-passthrough): add implementation plan 16 tasks covering feature flag, auth middleware extension, usage extraction, httpx passthrough client, /chat/completions and /responses endpoints with streaming, full Responses CRUD, /models, guardrail header forwarding, and documentation updates. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../plans/2026-05-25-openai-passthrough.md | 1912 +++++++++++++++++ 1 file changed, 1912 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-25-openai-passthrough.md diff --git a/docs/superpowers/plans/2026-05-25-openai-passthrough.md b/docs/superpowers/plans/2026-05-25-openai-passthrough.md new file mode 100644 index 0000000..b6bcd29 --- /dev/null +++ b/docs/superpowers/plans/2026-05-25-openai-passthrough.md @@ -0,0 +1,1912 @@ +# OpenAI Passthrough Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add `/openai/v1/*` endpoints that accept OpenAI-native Chat Completions and Responses API calls and forward them to AWS bedrock-mantle, while reusing the proxy's API key auth, rate limits, budgets, and usage tracking. + +**Architecture:** Raw httpx passthrough (no Pydantic schemas for OpenAI types) on a new APIRouter mounted at `/openai/v1`. Existing auth middleware extended to accept `Authorization: Bearer` in addition to `x-api-key`. Usage extracted from response bodies (non-streaming) or final SSE event (streaming) and normalized into the existing Anthropic-shaped DDB schema with new `api_surface` and `reasoning_tokens` columns. Independent of existing `ENABLE_OPENAI_COMPAT`. 
+ +**Tech Stack:** Python 3.12, FastAPI, httpx (async), pytest + respx for HTTP mocking, AWS DynamoDB (boto3), uv for package management. + +**Reference design:** `docs/plans/2026-05-25-openai-passthrough-design.md` + +--- + +## File Structure + +| File | Action | Purpose | +|---|---|---| +| `app/core/config.py` | Modify | Add `enable_openai_passthrough` flag | +| `app/middleware/auth.py` | Modify | Accept `Authorization: Bearer` header | +| `app/api/openai_passthrough/__init__.py` | Create | Export `router` | +| `app/api/openai_passthrough/client.py` | Create | httpx singleton client | +| `app/api/openai_passthrough/usage_extractor.py` | Create | Normalize OpenAI usage → Anthropic-shaped dict | +| `app/api/openai_passthrough/streaming.py` | Create | SSE passthrough + usage tee | +| `app/api/openai_passthrough/router.py` | Create | FastAPI routes | +| `app/db/dynamodb.py` | Modify | Extend `UsageTracker.record_usage` with `api_surface` and `reasoning_tokens` | +| `app/main.py` | Modify | Conditionally mount the new router | +| `env.example` | Modify | Document the new flag | +| `CLAUDE.md` | Modify | Add feature description | +| `docs/architecture/features.md` | Modify | Detailed feature doc | +| `tests/unit/test_openai_passthrough/test_usage_extractor.py` | Create | Unit tests for normalization | +| `tests/unit/test_openai_passthrough/test_auth.py` | Create | Unit tests for header acceptance | +| `tests/unit/test_openai_passthrough/test_model_mapping.py` | Create | Unit tests for mapping resolution | +| `tests/unit/test_openai_passthrough/__init__.py` | Create | Test package marker | +| `tests/integration/test_openai_passthrough/test_chat_completions.py` | Create | Chat completions integration | +| `tests/integration/test_openai_passthrough/test_responses.py` | Create | Responses API integration | +| `tests/integration/test_openai_passthrough/test_responses_crud.py` | Create | Responses CRUD passthrough | +| 
`tests/integration/test_openai_passthrough/test_models.py` | Create | /models endpoint | +| `tests/integration/test_openai_passthrough/conftest.py` | Create | Shared fixtures (FastAPI client, respx) | +| `tests/integration/test_openai_passthrough/__init__.py` | Create | Test package marker | + +**Tooling:** add `respx>=0.21.0` to `[project.optional-dependencies].dev` in `pyproject.toml`. + +--- + +## Task 1: Add feature flag and respx dependency + +**Files:** +- Modify: `app/core/config.py` +- Modify: `pyproject.toml` + +- [ ] **Step 1: Add the feature flag to settings** + +In `app/core/config.py`, find the existing OpenAI-Compat block (around line 379–406) and add a new field immediately after `openai_compat_thinking_medium_threshold`: + +```python + enable_openai_passthrough: bool = Field( + default=False, + alias="ENABLE_OPENAI_PASSTHROUGH", + description="Mount /openai/v1/* endpoints (Chat Completions + Responses passthrough to bedrock-mantle)" + ) +``` + +- [ ] **Step 2: Add respx to dev dependencies** + +In `pyproject.toml`, locate the `dev = [...]` list under `[project.optional-dependencies]` (around line 80–95) and add `"respx>=0.21.0",` after `"pytest-mock>=3.12.0",`. + +- [ ] **Step 3: Sync dependencies** + +Run: `unset VIRTUAL_ENV && uv sync --active --extra dev` +Expected: `respx` and its deps resolved and installed. 
+ +- [ ] **Step 4: Verify the setting loads** + +Run: +```bash +unset VIRTUAL_ENV && uv run --active python -c "from app.core.config import settings; print(settings.enable_openai_passthrough)" +``` +Expected output: `False` + +- [ ] **Step 5: Commit** + +```bash +git add app/core/config.py pyproject.toml uv.lock +git commit -m "feat(openai-passthrough): add ENABLE_OPENAI_PASSTHROUGH flag and respx dev dep" +``` + +--- + +## Task 2: Extend auth middleware to accept Authorization: Bearer + +**Files:** +- Modify: `app/middleware/auth.py:62-77` +- Test: `tests/unit/test_openai_passthrough/test_auth.py` + +- [ ] **Step 1: Create the test package structure** + +Run: +```bash +mkdir -p tests/unit/test_openai_passthrough +touch tests/unit/test_openai_passthrough/__init__.py +``` + +- [ ] **Step 2: Write the failing test** + +Create `tests/unit/test_openai_passthrough/test_auth.py`: + +```python +"""Tests for the auth middleware's Authorization: Bearer support.""" +from unittest.mock import MagicMock, patch + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from app.middleware.auth import AuthMiddleware + + +@pytest.fixture +def make_app(): + """Build a minimal FastAPI app wired to AuthMiddleware with a mocked validator.""" + def _factory(api_key_info): + app = FastAPI() + + ddb_client = MagicMock() + manager = MagicMock() + manager.validate_api_key.return_value = api_key_info + + with patch("app.middleware.auth.APIKeyManager", return_value=manager): + app.add_middleware(AuthMiddleware, dynamodb_client=ddb_client) + + @app.get("/test") + async def test_endpoint(request): + from fastapi import Request + r: Request = request # type: ignore[assignment] + info = r.state.api_key_info + return {"user_id": info["user_id"]} + + return app, manager + return _factory + + +def test_authorization_bearer_resolves_when_xapikey_missing(make_app, monkeypatch): + """Authorization: Bearer should authenticate when x-api-key is absent.""" + 
monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + + app, manager = make_app({"user_id": "u1", "api_key": "sk-abc"}) + client = TestClient(app) + + r = client.get("/test", headers={"Authorization": "Bearer sk-abc"}) + + assert r.status_code == 200 + assert r.json() == {"user_id": "u1"} + manager.validate_api_key.assert_called_once_with("sk-abc") + + +def test_xapikey_takes_precedence_when_both_present(make_app, monkeypatch): + """If both headers are present, x-api-key wins.""" + monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + + app, manager = make_app({"user_id": "u1", "api_key": "sk-from-xapikey"}) + client = TestClient(app) + + client.get( + "/test", + headers={"x-api-key": "sk-from-xapikey", "Authorization": "Bearer sk-from-bearer"}, + ) + + manager.validate_api_key.assert_called_once_with("sk-from-xapikey") + + +def test_missing_both_headers_returns_401(make_app, monkeypatch): + monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + + app, _ = make_app(None) + client = TestClient(app) + + r = client.get("/test") + assert r.status_code == 401 + + +def test_authorization_non_bearer_is_ignored(make_app, monkeypatch): + """Authorization: Basic ... 
(or anything not 'Bearer ') should not be treated as an API key.""" + monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + + app, _ = make_app(None) + client = TestClient(app) + + r = client.get("/test", headers={"Authorization": "Basic dXNlcjpwYXNz"}) + assert r.status_code == 401 +``` + +- [ ] **Step 3: Run the tests to verify they fail** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_auth.py -v --no-cov` +Expected: `test_authorization_bearer_resolves_when_xapikey_missing` FAILS — it returns 401 because the middleware doesn't yet read Authorization. The other three tests exercise the existing x-api-key / missing-key behavior and should already pass. + +- [ ] **Step 4: Modify the middleware to read Authorization: Bearer** + +In `app/middleware/auth.py`, replace lines 62–77 (the API key extraction + missing-key 401 block) with: + +```python + # Extract API key from header (x-api-key first, fall back to Authorization: Bearer) + api_key = request.headers.get(settings.api_key_header) + if not api_key: + authz = request.headers.get("Authorization") or request.headers.get("authorization") + if authz and authz.startswith("Bearer "): + api_key = authz[len("Bearer "):].strip() + + if not api_key: + print(f"[AUTH] Missing API key for {request.url.path}") + from fastapi.responses import JSONResponse + return JSONResponse( + status_code=status.HTTP_401_UNAUTHORIZED, + content={ + "type": "error", + "error": { + "type": "authentication_error", + "message": f"Missing API key in {settings.api_key_header} or Authorization: Bearer header", + }, + }, + ) +``` + +- [ ] **Step 5: Run the tests to verify they pass** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_auth.py -v --no-cov` +Expected: All four tests PASS. 
+ +- [ ] **Step 6: Run the full unit test suite to ensure no regression** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit -q --no-cov` +Expected: All previously-passing tests still pass. + +- [ ] **Step 7: Commit** + +```bash +git add app/middleware/auth.py tests/unit/test_openai_passthrough/test_auth.py tests/unit/test_openai_passthrough/__init__.py +git commit -m "feat(auth): accept Authorization: Bearer alongside x-api-key" +``` + +--- + +## Task 3: Add usage normalization function + +**Files:** +- Create: `app/api/openai_passthrough/__init__.py` +- Create: `app/api/openai_passthrough/usage_extractor.py` +- Test: `tests/unit/test_openai_passthrough/test_usage_extractor.py` + +- [ ] **Step 1: Create the package directories** + +Run: +```bash +mkdir -p app/api/openai_passthrough +touch app/api/openai_passthrough/__init__.py +``` + +- [ ] **Step 2: Write the failing tests** + +Create `tests/unit/test_openai_passthrough/test_usage_extractor.py`: + +```python +"""Tests for normalize_usage and try_extract_usage_from_sse.""" +import json + +from app.api.openai_passthrough.usage_extractor import ( + normalize_usage, + try_extract_usage_from_sse, +) + + +def test_normalize_chat_completions_basic(): + raw = {"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150} + result = normalize_usage(raw, "chat_completions") + assert result == { + "input_tokens": 100, + "output_tokens": 50, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + "reasoning_tokens": 0, + } + + +def test_normalize_chat_completions_with_cache_and_reasoning(): + raw = { + "prompt_tokens": 100, + "completion_tokens": 50, + "prompt_tokens_details": {"cached_tokens": 30}, + "completion_tokens_details": {"reasoning_tokens": 20}, + } + result = normalize_usage(raw, "chat_completions") + # cache hits subtracted from input + assert result["input_tokens"] == 70 + assert result["output_tokens"] == 50 + assert result["cache_read_input_tokens"] == 30 + assert 
result["reasoning_tokens"] == 20 + + +def test_normalize_responses_basic(): + raw = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} + result = normalize_usage(raw, "responses") + assert result["input_tokens"] == 100 + assert result["output_tokens"] == 50 + assert result["cache_read_input_tokens"] == 0 + assert result["reasoning_tokens"] == 0 + + +def test_normalize_responses_with_cache_and_reasoning(): + raw = { + "input_tokens": 100, + "output_tokens": 50, + "input_tokens_details": {"cached_tokens": 25}, + "output_tokens_details": {"reasoning_tokens": 15}, + } + result = normalize_usage(raw, "responses") + assert result["input_tokens"] == 75 + assert result["output_tokens"] == 50 + assert result["cache_read_input_tokens"] == 25 + assert result["reasoning_tokens"] == 15 + + +def test_normalize_handles_missing_fields(): + """Empty/None usage should normalize to all-zeros, not crash.""" + result = normalize_usage({}, "chat_completions") + assert result == { + "input_tokens": 0, "output_tokens": 0, + "cache_read_input_tokens": 0, "cache_creation_input_tokens": 0, + "reasoning_tokens": 0, + } + + +def test_extract_chat_completions_usage_from_sse_chunk(): + """Final chat-completions chunk with usage should be picked up.""" + line = "data: " + json.dumps({ + "id": "chatcmpl-1", "choices": [], + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, + }) + holder: dict = {} + try_extract_usage_from_sse(line, holder, "chat_completions") + assert holder == {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + + +def test_extract_responses_usage_from_response_completed_event(): + line = "data: " + json.dumps({ + "type": "response.completed", + "response": { + "id": "resp-1", + "usage": {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28}, + }, + }) + holder: dict = {} + try_extract_usage_from_sse(line, holder, "responses") + assert holder == {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28} + + +def 
test_extract_ignores_non_data_lines(): + holder: dict = {} + try_extract_usage_from_sse("event: response.completed", holder, "responses") + try_extract_usage_from_sse("", holder, "responses") + try_extract_usage_from_sse(": keepalive", holder, "responses") + assert holder == {} + + +def test_extract_ignores_data_done(): + holder: dict = {} + try_extract_usage_from_sse("data: [DONE]", holder, "chat_completions") + assert holder == {} + + +def test_extract_ignores_chunks_without_usage(): + line = "data: " + json.dumps({"choices": [{"delta": {"content": "hi"}}]}) + holder: dict = {} + try_extract_usage_from_sse(line, holder, "chat_completions") + assert holder == {} + + +def test_extract_ignores_malformed_json(): + holder: dict = {} + try_extract_usage_from_sse("data: not-json", holder, "chat_completions") + assert holder == {} +``` + +- [ ] **Step 3: Run the tests to verify they fail** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_usage_extractor.py -v --no-cov` +Expected: ImportError — module doesn't exist yet. + +- [ ] **Step 4: Implement usage_extractor** + +Create `app/api/openai_passthrough/usage_extractor.py`: + +```python +"""Usage extraction and normalization for OpenAI-format responses. + +normalize_usage() converts an OpenAI Chat Completions or Responses API usage +dict into the Anthropic-shaped dict that UsageTracker.record_usage expects, +plus a separate reasoning_tokens field. + +try_extract_usage_from_sse() peeks at SSE lines during streaming and stashes +the usage dict (raw OpenAI shape) each time it encounters one (last wins). The caller +later passes that dict through normalize_usage(). +""" +from __future__ import annotations + +import json +from typing import Any, Dict + + +def normalize_usage(raw: Dict[str, Any], api_surface: str) -> Dict[str, int]: + """Normalize OpenAI-shaped usage into Anthropic-shaped fields. 
+ + api_surface: "chat_completions" or "responses" + """ + if api_surface == "chat_completions": + in_tok = int(raw.get("prompt_tokens", 0) or 0) + out_tok = int(raw.get("completion_tokens", 0) or 0) + cached = int((raw.get("prompt_tokens_details") or {}).get("cached_tokens", 0) or 0) + reasoning = int( + (raw.get("completion_tokens_details") or {}).get("reasoning_tokens", 0) or 0 + ) + else: # responses + in_tok = int(raw.get("input_tokens", 0) or 0) + out_tok = int(raw.get("output_tokens", 0) or 0) + cached = int((raw.get("input_tokens_details") or {}).get("cached_tokens", 0) or 0) + reasoning = int( + (raw.get("output_tokens_details") or {}).get("reasoning_tokens", 0) or 0 + ) + + # Cache-read tokens are billed separately, so subtract them from input_tokens + # to mirror how the Anthropic flow accounts for cache hits. + return { + "input_tokens": max(in_tok - cached, 0), + "output_tokens": out_tok, + "cache_read_input_tokens": cached, + "cache_creation_input_tokens": 0, # Not exposed by OpenAI-format APIs + "reasoning_tokens": reasoning, + } + + +def try_extract_usage_from_sse( + raw_line: str, holder: Dict[str, Any], api_surface: str +) -> None: + """Inspect an SSE line and, if it carries usage info, store it in holder. + + Mutates `holder` in place. Idempotent: subsequent calls overwrite, so the + last-seen usage event wins (which is what we want — both APIs put usage + on the terminal event). + """ + line = raw_line.strip() + if not line.startswith("data:"): + return + + payload = line[len("data:"):].strip() + if not payload or payload == "[DONE]": + return + + try: + obj = json.loads(payload) + except (ValueError, TypeError): + return + + if api_surface == "chat_completions": + usage = obj.get("usage") + if isinstance(usage, dict): + holder.clear() + holder.update(usage) + else: # responses + # Usage lives on the `response.completed` event under + # event.response.usage. Other events occasionally carry partial usage + # too, but only the `response.completed` payload is read here. 
+ if obj.get("type") == "response.completed": + response_obj = obj.get("response") or {} + usage = response_obj.get("usage") + if isinstance(usage, dict): + holder.clear() + holder.update(usage) +``` + +- [ ] **Step 5: Run the tests to verify they pass** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_usage_extractor.py -v --no-cov` +Expected: All 11 tests PASS. + +- [ ] **Step 6: Commit** + +```bash +git add app/api/openai_passthrough/__init__.py app/api/openai_passthrough/usage_extractor.py tests/unit/test_openai_passthrough/test_usage_extractor.py +git commit -m "feat(openai-passthrough): add usage normalization and SSE extraction helpers" +``` + +--- + +## Task 4: Add model mapping resolver helper + +**Files:** +- Create: `app/api/openai_passthrough/model_mapping.py` +- Test: `tests/unit/test_openai_passthrough/test_model_mapping.py` + +- [ ] **Step 1: Write the failing test** + +Create `tests/unit/test_openai_passthrough/test_model_mapping.py`: + +```python +"""Tests for resolve_model_id.""" +from unittest.mock import MagicMock + +from app.api.openai_passthrough.model_mapping import resolve_model_id + + +def test_returns_mapped_id_when_mapping_exists(): + manager = MagicMock() + manager.get_mapping.return_value = "openai.gpt-oss-120b" + + out = resolve_model_id("gpt-4", manager) + assert out == "openai.gpt-oss-120b" + manager.get_mapping.assert_called_once_with("gpt-4") + + +def test_passes_through_when_no_mapping_exists(): + manager = MagicMock() + manager.get_mapping.return_value = None + + out = resolve_model_id("openai.gpt-oss-120b", manager) + assert out == "openai.gpt-oss-120b" + + +def test_passes_through_empty_string(): + manager = MagicMock() + manager.get_mapping.return_value = None + + assert resolve_model_id("", manager) == "" + + +def test_handles_lookup_exception_by_passing_through(): + """If DDB lookup raises, fall back to the original ID rather than crashing the request.""" + manager = MagicMock() + 
manager.get_mapping.side_effect = RuntimeError("ddb down") + + out = resolve_model_id("gpt-4", manager) + assert out == "gpt-4" +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_model_mapping.py -v --no-cov` +Expected: ImportError. + +- [ ] **Step 3: Implement resolve_model_id** + +Create `app/api/openai_passthrough/model_mapping.py`: + +```python +"""Model ID resolution for the OpenAI passthrough endpoints. + +Looks up the client-supplied model in the existing model_mapping table; if a +mapping exists, substitute it. Otherwise, pass through unchanged so callers +can use Bedrock-native IDs (e.g. ``openai.gpt-oss-120b``) directly without +needing to register them. +""" +from __future__ import annotations + +import logging + +logger = logging.getLogger(__name__) + + +def resolve_model_id(model: str, model_mapping_manager) -> str: + """Resolve a client-supplied model ID via the mapping table, with fallback. + + Args: + model: The ``model`` field from the client request. + model_mapping_manager: An app.db.dynamodb.ModelMappingManager instance. + + Returns: + The resolved Bedrock model ID, or the original string if no mapping + exists or the lookup fails. + """ + if not model: + return model + try: + mapped = model_mapping_manager.get_mapping(model) + except Exception as exc: + logger.warning("[OPENAI-PASSTHROUGH] model mapping lookup failed for %r: %s", model, exc) + return model + return mapped or model +``` + +- [ ] **Step 4: Run the tests to verify they pass** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_model_mapping.py -v --no-cov` +Expected: All four tests PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add app/api/openai_passthrough/model_mapping.py tests/unit/test_openai_passthrough/test_model_mapping.py +git commit -m "feat(openai-passthrough): add model mapping resolver with passthrough fallback" +``` + +--- + +## Task 5: Extend UsageTracker.record_usage with api_surface and reasoning_tokens + +**Files:** +- Modify: `app/db/dynamodb.py:908-970` +- Test: `tests/unit/test_openai_passthrough/test_usage_tracker_extended.py` + +- [ ] **Step 1: Write the failing test** + +Create `tests/unit/test_openai_passthrough/test_usage_tracker_extended.py`: + +```python +"""Tests for the api_surface and reasoning_tokens additions to UsageTracker.""" +from unittest.mock import MagicMock + +from app.db.dynamodb import UsageTracker + + +def _make_tracker(): + ddb_client = MagicMock() + ddb_client.usage_table_name = "anthropic-proxy-usage" + tracker = UsageTracker(ddb_client) + tracker.table = MagicMock() + return tracker + + +def test_record_usage_writes_api_surface_when_provided(): + tracker = _make_tracker() + tracker.record_usage( + api_key="sk-x", + request_id="req-1", + model="openai.gpt-oss-120b", + input_tokens=100, + output_tokens=50, + api_surface="chat_completions", + ) + item = tracker.table.put_item.call_args.kwargs["Item"] + assert item["api_surface"] == "chat_completions" + + +def test_record_usage_writes_reasoning_tokens_when_provided(): + tracker = _make_tracker() + tracker.record_usage( + api_key="sk-x", request_id="req-1", model="m", + input_tokens=10, output_tokens=5, reasoning_tokens=3, + ) + item = tracker.table.put_item.call_args.kwargs["Item"] + assert item["reasoning_tokens"] == 3 + + +def test_record_usage_omits_new_fields_when_default(): + tracker = _make_tracker() + tracker.record_usage( + api_key="sk-x", request_id="req-1", model="m", + input_tokens=10, output_tokens=5, + ) + item = tracker.table.put_item.call_args.kwargs["Item"] + # Sparse: not written when caller didn't specify them + assert "api_surface" 
not in item + assert "reasoning_tokens" not in item +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_usage_tracker_extended.py -v --no-cov` +Expected: TypeError — record_usage doesn't accept the new kwargs. + +- [ ] **Step 3: Modify record_usage** + +In `app/db/dynamodb.py`, find `UsageTracker.record_usage` (line 908). Add two new optional parameters to its signature, after `cache_ttl`: + +```python + cache_ttl: Optional[str] = None, + api_surface: Optional[str] = None, + reasoning_tokens: int = 0, + ): +``` + +Update the docstring's Args block to document them: + +``` + api_surface: Source endpoint family ("messages", "chat_completions", or "responses") + reasoning_tokens: Reasoning tokens (already counted in output_tokens; stored separately for visibility) +``` + +Then, after the existing `if cache_ttl:` block (around line 962–963), add: + +```python + if api_surface: + item["api_surface"] = api_surface + if reasoning_tokens: + item["reasoning_tokens"] = reasoning_tokens +``` + +- [ ] **Step 4: Run the tests to verify they pass** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_usage_tracker_extended.py -v --no-cov` +Expected: All three tests PASS. + +- [ ] **Step 5: Run the full unit suite to check nothing regressed** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit -q --no-cov` +Expected: All previously-passing tests still pass. + +- [ ] **Step 6: Commit** + +```bash +git add app/db/dynamodb.py tests/unit/test_openai_passthrough/test_usage_tracker_extended.py +git commit -m "feat(usage): record api_surface and reasoning_tokens on usage rows" +``` + +--- + +## Task 6: Implement httpx client singleton + +**Files:** +- Create: `app/api/openai_passthrough/client.py` + +This is a small helper without business logic, so we test it indirectly through the integration tests (Tasks 8–11). 
No standalone unit tests. + +- [ ] **Step 1: Write the client module** + +Create `app/api/openai_passthrough/client.py`: + +```python +"""Async httpx client to bedrock-mantle, lazily constructed and reused. + +Headers are NOT set on the client itself; they're added per-request in the +router so we can include the proxy's Bedrock API key in Authorization. +""" +from __future__ import annotations + +import httpx + +from app.core.config import settings + +_client: httpx.AsyncClient | None = None + + +def get_client() -> httpx.AsyncClient: + global _client + if _client is None: + _client = httpx.AsyncClient( + base_url=settings.openai_base_url, + timeout=httpx.Timeout(settings.bedrock_timeout, connect=10.0), + limits=httpx.Limits(max_connections=200, max_keepalive_connections=50), + ) + return _client + + +def reset_client_for_testing() -> None: + """Reset the singleton — only call this from test fixtures.""" + global _client + if _client is not None: + # AsyncClient.aclose() is async; tests will close the loop after, so we + # null it here and let the GC clean up the underlying transport. + _client = None + + +def upstream_headers(extra: dict[str, str] | None = None) -> dict[str, str]: + """Build the Authorization + standard headers for an upstream call.""" + headers = { + "Authorization": f"Bearer {settings.openai_api_key}", + "Content-Type": "application/json", + "User-Agent": "bedrock-api-proxy/openai-passthrough", + } + if extra: + headers.update(extra) + return headers +``` + +- [ ] **Step 2: Smoke-test the import** + +Run: +```bash +unset VIRTUAL_ENV && uv run --active python -c "from app.api.openai_passthrough.client import get_client, upstream_headers; print(upstream_headers())" +``` +Expected: prints a dict with `Authorization: Bearer ` (the configured key, possibly empty), `Content-Type`, and `User-Agent`. 
+ +- [ ] **Step 3: Commit** + +```bash +git add app/api/openai_passthrough/client.py +git commit -m "feat(openai-passthrough): add httpx singleton client and header helper" +``` + +--- + +## Task 7: Implement streaming passthrough helper + +**Files:** +- Create: `app/api/openai_passthrough/streaming.py` + +Tested indirectly through integration tests in Task 9. + +- [ ] **Step 1: Write the streaming module** + +Create `app/api/openai_passthrough/streaming.py`: + +```python +"""SSE passthrough with usage tee. + +The async generator yields raw response bytes line-by-line so the FastAPI +StreamingResponse forwards them unchanged. After upstream stream ends, it +calls the supplied on_complete callback with the captured usage dict so the +caller can record usage to DynamoDB. +""" +from __future__ import annotations + +import logging +from typing import Any, AsyncIterator, Awaitable, Callable, Dict + +from app.api.openai_passthrough.client import get_client, upstream_headers +from app.api.openai_passthrough.usage_extractor import try_extract_usage_from_sse + +logger = logging.getLogger(__name__) + + +async def stream_passthrough( + method: str, + path: str, + body: Dict[str, Any] | None, + api_surface: str, + on_complete: Callable[[Dict[str, Any]], Awaitable[None] | None], + extra_headers: Dict[str, str] | None = None, +) -> AsyncIterator[bytes]: + """Stream upstream response bytes line-by-line; capture usage; trigger callback.""" + usage: Dict[str, Any] = {} + + client = get_client() + headers = upstream_headers(extra_headers) + + try: + async with client.stream(method, path, json=body, headers=headers) as resp: + async for raw_line in resp.aiter_lines(): + # Upstream gives us SSE lines without trailing newlines; restore the + # framing byte so the SSE body is well-formed for the downstream client. 
+ yield (raw_line + "\n").encode("utf-8") + try_extract_usage_from_sse(raw_line, usage, api_surface) + except Exception as exc: + logger.error("[OPENAI-PASSTHROUGH] upstream stream error: %s", exc) + # Re-raise so FastAPI can return a 500; downstream client sees the stream end. + raise + + if usage: + result = on_complete(usage) + # Support both sync and async callbacks + if hasattr(result, "__await__"): + await result # type: ignore[func-returns-value] +``` + +- [ ] **Step 2: Smoke-test the import** + +Run: +```bash +unset VIRTUAL_ENV && uv run --active python -c "from app.api.openai_passthrough.streaming import stream_passthrough; print('ok')" +``` +Expected: prints `ok`. + +- [ ] **Step 3: Commit** + +```bash +git add app/api/openai_passthrough/streaming.py +git commit -m "feat(openai-passthrough): add SSE passthrough generator with usage tee" +``` + +--- + +## Task 8: Implement router skeleton + chat/completions (non-streaming) + +**Files:** +- Create: `app/api/openai_passthrough/router.py` +- Modify: `app/main.py:298-314` +- Create: `tests/integration/test_openai_passthrough/__init__.py` +- Create: `tests/integration/test_openai_passthrough/conftest.py` +- Test: `tests/integration/test_openai_passthrough/test_chat_completions.py` + +- [ ] **Step 1: Create the integration test scaffolding** + +Run: +```bash +mkdir -p tests/integration/test_openai_passthrough +touch tests/integration/test_openai_passthrough/__init__.py +``` + +Create `tests/integration/test_openai_passthrough/conftest.py`: + +```python +"""Shared fixtures for openai-passthrough integration tests.""" +from unittest.mock import MagicMock, patch + +import pytest +import respx +from fastapi.testclient import TestClient + + +@pytest.fixture +def mock_settings(monkeypatch): + """Set the env so the passthrough router mounts and points at a fake mantle.""" + monkeypatch.setattr("app.core.config.settings.enable_openai_passthrough", True) + monkeypatch.setattr("app.core.config.settings.openai_api_key", 
"bedrock-key-test") + monkeypatch.setattr("app.core.config.settings.openai_base_url", "https://mantle.test/v1") + monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + + +@pytest.fixture +def mock_api_key_manager(): + """Patch APIKeyManager so any non-empty key validates as user 'u1'.""" + manager = MagicMock() + manager.validate_api_key.return_value = { + "api_key": "sk-test", "user_id": "u1", "is_master": False, + "rate_limit": None, "cache_ttl": None, + } + with patch("app.middleware.auth.APIKeyManager", return_value=manager): + yield manager + + +@pytest.fixture +def mock_model_mapping_manager(): + """Patch ModelMappingManager to return None (no mapping) by default.""" + manager = MagicMock() + manager.get_mapping.return_value = None + with patch("app.db.dynamodb.ModelMappingManager", return_value=manager): + yield manager + + +@pytest.fixture +def mock_usage_tracker(): + tracker = MagicMock() + with patch("app.db.dynamodb.UsageTracker", return_value=tracker): + yield tracker + + +@pytest.fixture +def respx_mock(): + """respx mock router for httpx calls.""" + with respx.mock(base_url="https://mantle.test/v1", assert_all_called=False) as router: + yield router + + +@pytest.fixture +def client(mock_settings, mock_api_key_manager, mock_model_mapping_manager, mock_usage_tracker): + """FastAPI TestClient with all mocks wired in. + + Imports inside the fixture so module-level settings reads happen after + monkeypatching. 
+ """ + # Reset httpx singleton so it picks up the patched base URL + from app.api.openai_passthrough.client import reset_client_for_testing + reset_client_for_testing() + + from app.main import app + return TestClient(app) +``` + +- [ ] **Step 2: Write the failing test** + +Create `tests/integration/test_openai_passthrough/test_chat_completions.py`: + +```python +"""Integration tests for POST /openai/v1/chat/completions.""" +import json + +import respx +import httpx + + +def test_non_streaming_chat_completions_forwards_and_logs_usage( + client, respx_mock, mock_usage_tracker, mock_model_mapping_manager +): + upstream_resp = { + "id": "chatcmpl-1", + "object": "chat.completion", + "model": "openai.gpt-oss-120b", + "choices": [{"index": 0, "message": {"role": "assistant", "content": "hi"}, "finish_reason": "stop"}], + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, + } + route = respx_mock.post("/chat/completions").mock( + return_value=httpx.Response(200, json=upstream_resp) + ) + + r = client.post( + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={ + "model": "openai.gpt-oss-120b", + "messages": [{"role": "user", "content": "hi"}], + }, + ) + + assert r.status_code == 200 + assert r.json() == upstream_resp + assert route.called + # Upstream got proxy's Bedrock API key, not the client's proxy key + sent = route.calls[0].request + assert sent.headers["authorization"] == "Bearer bedrock-key-test" + sent_body = json.loads(sent.content) + assert sent_body["model"] == "openai.gpt-oss-120b" + # Usage was recorded + assert mock_usage_tracker.record_usage.called + kwargs = mock_usage_tracker.record_usage.call_args.kwargs + assert kwargs["input_tokens"] == 10 + assert kwargs["output_tokens"] == 5 + assert kwargs["api_surface"] == "chat_completions" + assert kwargs["model"] == "openai.gpt-oss-120b" + + +def test_model_mapping_is_applied( + client, respx_mock, mock_model_mapping_manager +): + 
mock_model_mapping_manager.get_mapping.return_value = "openai.gpt-oss-120b" + route = respx_mock.post("/chat/completions").mock( + return_value=httpx.Response(200, json={ + "id": "x", "choices": [], "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2} + }) + ) + + client.post( + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "gpt-4", "messages": [{"role": "user", "content": "hi"}]}, + ) + + sent = json.loads(route.calls[0].request.content) + assert sent["model"] == "openai.gpt-oss-120b" + + +def test_upstream_4xx_returned_verbatim(client, respx_mock, mock_usage_tracker): + err_body = {"error": {"message": "model not found", "type": "invalid_request_error"}} + respx_mock.post("/chat/completions").mock( + return_value=httpx.Response(404, json=err_body) + ) + + r = client.post( + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "no-such-model", "messages": []}, + ) + assert r.status_code == 404 + assert r.json() == err_body + assert not mock_usage_tracker.record_usage.called # Don't log usage on errors + + +def test_missing_auth_returns_401(client): + r = client.post( + "/openai/v1/chat/completions", + json={"model": "x", "messages": []}, + ) + assert r.status_code == 401 +``` + +- [ ] **Step 3: Run the test to verify it fails** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_chat_completions.py -v --no-cov` +Expected: tests fail — endpoint doesn't exist yet (404 from FastAPI). + +- [ ] **Step 4: Implement the router** + +Create `app/api/openai_passthrough/router.py`: + +```python +"""FastAPI routes for the OpenAI passthrough endpoints. + +Mounted at /openai/v1 only when settings.enable_openai_passthrough is True. 
+""" +from __future__ import annotations + +import logging +from typing import Any, Dict +from uuid import uuid4 + +from fastapi import APIRouter, Depends, Request, Response +from fastapi.responses import JSONResponse, StreamingResponse + +from app.api.openai_passthrough.client import get_client, upstream_headers +from app.api.openai_passthrough.model_mapping import resolve_model_id +from app.api.openai_passthrough.streaming import stream_passthrough +from app.api.openai_passthrough.usage_extractor import normalize_usage +from app.db.dynamodb import DynamoDBClient, ModelMappingManager, UsageTracker +from app.middleware.auth import get_api_key_info + +logger = logging.getLogger(__name__) +router = APIRouter() + +_ddb: DynamoDBClient | None = None +_mapping: ModelMappingManager | None = None +_usage: UsageTracker | None = None + + +def _managers(): + """Lazily build DDB managers — keeps import-time side effects out of tests.""" + global _ddb, _mapping, _usage + if _ddb is None: + _ddb = DynamoDBClient() + _mapping = ModelMappingManager(_ddb) + _usage = UsageTracker(_ddb) + return _mapping, _usage + + +def _record_usage(api_key_info: Dict[str, Any], raw_usage: Dict[str, Any], model: str, api_surface: str) -> None: + _, usage = _managers() + norm = normalize_usage(raw_usage, api_surface) + try: + usage.record_usage( + api_key=api_key_info.get("api_key", ""), + request_id=str(uuid4()), + model=model, + input_tokens=norm["input_tokens"], + output_tokens=norm["output_tokens"], + cached_tokens=norm["cache_read_input_tokens"], + cache_write_input_tokens=norm["cache_creation_input_tokens"], + api_surface=api_surface, + reasoning_tokens=norm["reasoning_tokens"], + ) + except Exception as exc: + logger.warning("[OPENAI-PASSTHROUGH] usage recording failed: %s", exc) + + +def _passthrough_extra_headers(request: Request) -> Dict[str, str]: + """Forward Bedrock-specific headers from the client to upstream (e.g. 
guardrails).""" + extra: Dict[str, str] = {} + for name, value in request.headers.items(): + if name.lower().startswith("x-amzn-bedrock-"): + extra[name] = value + return extra + + +@router.post("/chat/completions") +async def chat_completions( + request: Request, + api_key_info: Dict[str, Any] = Depends(get_api_key_info), +): + body = await request.json() + mapping, _ = _managers() + body["model"] = resolve_model_id(body.get("model", ""), mapping) + extra = _passthrough_extra_headers(request) + + if body.get("stream"): + async def on_complete(usage: Dict[str, Any]) -> None: + _record_usage(api_key_info, usage, body["model"], "chat_completions") + return StreamingResponse( + stream_passthrough( + "POST", "/chat/completions", body, "chat_completions", on_complete, extra + ), + media_type="text/event-stream", + ) + + resp = await get_client().post( + "/chat/completions", json=body, headers=upstream_headers(extra) + ) + if resp.status_code >= 400: + return JSONResponse(_safe_json(resp), status_code=resp.status_code) + + data = resp.json() + if isinstance(data, dict) and isinstance(data.get("usage"), dict): + _record_usage(api_key_info, data["usage"], body["model"], "chat_completions") + return JSONResponse(data, status_code=resp.status_code) + + +def _safe_json(resp) -> Dict[str, Any]: + try: + return resp.json() + except ValueError: + return {"error": {"message": resp.text, "type": "upstream_error"}} +``` + +- [ ] **Step 5: Wire up `__init__.py`** + +In `app/api/openai_passthrough/__init__.py`, replace the empty file with: + +```python +"""OpenAI Passthrough — accepts OpenAI Chat Completions and Responses API +calls from clients and forwards them to AWS bedrock-mantle. 
+""" +from app.api.openai_passthrough.router import router + +__all__ = ["router"] +``` + +- [ ] **Step 6: Mount the router in main.py** + +In `app/main.py`, after the existing `app.include_router(models.router, ...)` block (around line 314), add: + +```python +if settings.enable_openai_passthrough: + from app.api.openai_passthrough import router as openai_passthrough_router + app.include_router( + openai_passthrough_router, + prefix="/openai/v1", + tags=["OpenAI Passthrough"], + ) +``` + +- [ ] **Step 7: Run the integration tests** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_chat_completions.py -v --no-cov` +Expected: All four tests PASS. + +- [ ] **Step 8: Run the full unit suite to check no regression** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit -q --no-cov` +Expected: All previously-passing tests still pass. + +- [ ] **Step 9: Commit** + +```bash +git add app/api/openai_passthrough/router.py app/api/openai_passthrough/__init__.py app/main.py tests/integration/test_openai_passthrough/__init__.py tests/integration/test_openai_passthrough/conftest.py tests/integration/test_openai_passthrough/test_chat_completions.py +git commit -m "feat(openai-passthrough): non-streaming /chat/completions endpoint" +``` + +--- + +## Task 9: Add streaming support to chat/completions + +**Files:** +- Test: `tests/integration/test_openai_passthrough/test_chat_completions.py` (extend) + +The router already routes `body["stream"] = True` requests through `stream_passthrough`; this task validates the path end-to-end and adds the missing-`include_usage` case. 
+ +- [ ] **Step 1: Append failing tests** + +Append the following to `tests/integration/test_openai_passthrough/test_chat_completions.py`: + +```python +def test_streaming_chat_completions_forwards_sse_and_records_usage( + client, respx_mock, mock_usage_tracker +): + """Stream three SSE chunks; the second-to-last carries usage.""" + sse_lines = [ + 'data: {"id":"x","choices":[{"index":0,"delta":{"role":"assistant"}}]}', + 'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}', + 'data: {"id":"x","choices":[],"usage":{"prompt_tokens":7,"completion_tokens":2,"total_tokens":9}}', + 'data: [DONE]', + ] + body = "\n".join(sse_lines).encode() + respx_mock.post("/chat/completions").mock( + return_value=httpx.Response( + 200, headers={"content-type": "text/event-stream"}, content=body + ) + ) + + with client.stream( + "POST", + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={ + "model": "openai.gpt-oss-120b", + "messages": [{"role": "user", "content": "hi"}], + "stream": True, + "stream_options": {"include_usage": True}, + }, + ) as r: + assert r.status_code == 200 + out = b"".join(r.iter_bytes()) + + # All four lines forwarded + assert b'"delta":{"role":"assistant"}' in out + assert b'[DONE]' in out + # Usage recorded from the chunk that had it + assert mock_usage_tracker.record_usage.called + kw = mock_usage_tracker.record_usage.call_args.kwargs + assert kw["input_tokens"] == 7 + assert kw["output_tokens"] == 2 + assert kw["api_surface"] == "chat_completions" + + +def test_streaming_chat_completions_without_include_usage_does_not_log( + client, respx_mock, mock_usage_tracker +): + """If client doesn't request usage, no usage chunk arrives → no usage logged.""" + sse_lines = [ + 'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}', + 'data: [DONE]', + ] + body = "\n".join(sse_lines).encode() + respx_mock.post("/chat/completions").mock( + return_value=httpx.Response( + 200, headers={"content-type": 
"text/event-stream"}, content=body + ) + ) + + with client.stream( + "POST", "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "m", "messages": [], "stream": True}, + ) as r: + list(r.iter_bytes()) # drain + + assert not mock_usage_tracker.record_usage.called +``` + +- [ ] **Step 2: Run the tests** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_chat_completions.py -v --no-cov` +Expected: All six tests PASS (including the two new streaming tests). + +- [ ] **Step 3: Commit** + +```bash +git add tests/integration/test_openai_passthrough/test_chat_completions.py +git commit -m "test(openai-passthrough): streaming /chat/completions integration tests" +``` + +--- + +## Task 10: Add Responses API POST endpoint (streaming + non-streaming) + +**Files:** +- Modify: `app/api/openai_passthrough/router.py` +- Test: `tests/integration/test_openai_passthrough/test_responses.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/integration/test_openai_passthrough/test_responses.py`: + +```python +"""Integration tests for POST /openai/v1/responses (streaming + non-streaming).""" +import json + +import httpx + + +def test_non_streaming_responses_forwards_and_logs_usage( + client, respx_mock, mock_usage_tracker +): + upstream = { + "id": "resp-1", + "object": "response", + "model": "openai.gpt-oss-120b", + "output": [{"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "hi"}]}], + "usage": {"input_tokens": 11, "output_tokens": 4, "total_tokens": 15}, + } + route = respx_mock.post("/responses").mock(return_value=httpx.Response(200, json=upstream)) + + r = client.post( + "/openai/v1/responses", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "openai.gpt-oss-120b", "input": [{"role": "user", "content": "hi"}]}, + ) + + assert r.status_code == 200 + assert r.json() == upstream + assert route.called + kw = 
mock_usage_tracker.record_usage.call_args.kwargs + assert kw["input_tokens"] == 11 + assert kw["output_tokens"] == 4 + assert kw["api_surface"] == "responses" + + +def test_streaming_responses_records_usage_from_response_completed( + client, respx_mock, mock_usage_tracker +): + sse_lines = [ + 'event: response.created', + 'data: {"type":"response.created","response":{"id":"r-1"}}', + 'event: response.output_text.delta', + 'data: {"type":"response.output_text.delta","delta":"hi"}', + 'event: response.completed', + 'data: ' + json.dumps({ + "type": "response.completed", + "response": {"id": "r-1", "usage": {"input_tokens": 12, "output_tokens": 3, "total_tokens": 15}}, + }), + ] + body = "\n".join(sse_lines).encode() + respx_mock.post("/responses").mock( + return_value=httpx.Response(200, headers={"content-type": "text/event-stream"}, content=body) + ) + + with client.stream( + "POST", "/openai/v1/responses", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "openai.gpt-oss-120b", "input": [{"role": "user", "content": "hi"}], "stream": True}, + ) as r: + out = b"".join(r.iter_bytes()) + + assert b"response.completed" in out + assert b"hi" in out + kw = mock_usage_tracker.record_usage.call_args.kwargs + assert kw["input_tokens"] == 12 + assert kw["output_tokens"] == 3 + assert kw["api_surface"] == "responses" + + +def test_responses_upstream_error_returned_verbatim(client, respx_mock, mock_usage_tracker): + respx_mock.post("/responses").mock( + return_value=httpx.Response(400, json={"error": {"message": "bad input", "type": "invalid_request_error"}}) + ) + r = client.post( + "/openai/v1/responses", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "m", "input": []}, + ) + assert r.status_code == 400 + assert r.json()["error"]["message"] == "bad input" + assert not mock_usage_tracker.record_usage.called +``` + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `unset VIRTUAL_ENV && uv run --active pytest 
tests/integration/test_openai_passthrough/test_responses.py -v --no-cov` +Expected: 404s — endpoint doesn't exist yet. + +- [ ] **Step 3: Add the responses endpoint to the router** + +In `app/api/openai_passthrough/router.py`, immediately after the `chat_completions` function, add: + +```python +@router.post("/responses") +async def responses_create( + request: Request, + api_key_info: Dict[str, Any] = Depends(get_api_key_info), +): + body = await request.json() + mapping, _ = _managers() + body["model"] = resolve_model_id(body.get("model", ""), mapping) + extra = _passthrough_extra_headers(request) + + if body.get("stream"): + async def on_complete(usage: Dict[str, Any]) -> None: + _record_usage(api_key_info, usage, body["model"], "responses") + return StreamingResponse( + stream_passthrough("POST", "/responses", body, "responses", on_complete, extra), + media_type="text/event-stream", + ) + + resp = await get_client().post( + "/responses", json=body, headers=upstream_headers(extra) + ) + if resp.status_code >= 400: + return JSONResponse(_safe_json(resp), status_code=resp.status_code) + + data = resp.json() + if isinstance(data, dict) and isinstance(data.get("usage"), dict): + _record_usage(api_key_info, data["usage"], body["model"], "responses") + return JSONResponse(data, status_code=resp.status_code) +``` + +- [ ] **Step 4: Run the tests to verify they pass** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_responses.py -v --no-cov` +Expected: All three tests PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add app/api/openai_passthrough/router.py tests/integration/test_openai_passthrough/test_responses.py +git commit -m "feat(openai-passthrough): /responses endpoint (POST, streaming + non-streaming)" +``` + +--- + +## Task 11: Add Responses CRUD passthrough (GET, DELETE, cancel, input_items) + +**Files:** +- Modify: `app/api/openai_passthrough/router.py` +- Test: `tests/integration/test_openai_passthrough/test_responses_crud.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/integration/test_openai_passthrough/test_responses_crud.py`: + +```python +"""Integration tests for the Responses CRUD endpoints — pure passthrough.""" +import httpx + + +def test_get_response_forwards_and_returns_body(client, respx_mock, mock_usage_tracker): + body = {"id": "r-1", "model": "x", "status": "completed"} + respx_mock.get("/responses/r-1").mock(return_value=httpx.Response(200, json=body)) + + r = client.get("/openai/v1/responses/r-1", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 200 + assert r.json() == body + # No usage logged for retrieval + assert not mock_usage_tracker.record_usage.called + + +def test_delete_response_forwards(client, respx_mock): + respx_mock.delete("/responses/r-1").mock( + return_value=httpx.Response(200, json={"id": "r-1", "deleted": True}) + ) + r = client.delete("/openai/v1/responses/r-1", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 200 + assert r.json() == {"id": "r-1", "deleted": True} + + +def test_cancel_response_forwards(client, respx_mock): + respx_mock.post("/responses/r-1/cancel").mock( + return_value=httpx.Response(200, json={"id": "r-1", "status": "cancelled"}) + ) + r = client.post("/openai/v1/responses/r-1/cancel", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 200 + assert r.json()["status"] == "cancelled" + + +def test_list_input_items_forwards(client, respx_mock): + body = {"data": [{"id": "msg-1", "role": 
"user"}], "object": "list"} + respx_mock.get("/responses/r-1/input_items").mock(return_value=httpx.Response(200, json=body)) + r = client.get( + "/openai/v1/responses/r-1/input_items", + headers={"Authorization": "Bearer sk-test"}, + ) + assert r.status_code == 200 + assert r.json() == body + + +def test_get_response_404_returned_verbatim(client, respx_mock): + respx_mock.get("/responses/missing").mock( + return_value=httpx.Response(404, json={"error": {"message": "not found"}}) + ) + r = client.get("/openai/v1/responses/missing", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 404 + assert r.json()["error"]["message"] == "not found" +``` + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_responses_crud.py -v --no-cov` +Expected: 404s — endpoints don't exist yet. + +- [ ] **Step 3: Add the CRUD endpoints** + +In `app/api/openai_passthrough/router.py`, add immediately after `responses_create`: + +```python +async def _passthrough_request(request: Request, path: str) -> Response: + """Forward request to upstream and mirror the upstream response.""" + extra = _passthrough_extra_headers(request) + body = None + if request.method in ("POST", "PUT", "PATCH"): + try: + body = await request.json() + except Exception: + body = None + resp = await get_client().request( + request.method, path, json=body, headers=upstream_headers(extra) + ) + return Response( + content=resp.content, + status_code=resp.status_code, + media_type=resp.headers.get("content-type"), + ) + + +@router.api_route("/responses/{response_id}", methods=["GET", "DELETE"]) +async def responses_get_or_delete( + response_id: str, + request: Request, + _: Dict[str, Any] = Depends(get_api_key_info), +): + return await _passthrough_request(request, f"/responses/{response_id}") + + +@router.post("/responses/{response_id}/cancel") +async def responses_cancel( + response_id: str, + request: 
Request, + _: Dict[str, Any] = Depends(get_api_key_info), +): + return await _passthrough_request(request, f"/responses/{response_id}/cancel") + + +@router.get("/responses/{response_id}/input_items") +async def responses_input_items( + response_id: str, + request: Request, + _: Dict[str, Any] = Depends(get_api_key_info), +): + return await _passthrough_request(request, f"/responses/{response_id}/input_items") +``` + +- [ ] **Step 4: Run the tests to verify they pass** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_responses_crud.py -v --no-cov` +Expected: All five tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add app/api/openai_passthrough/router.py tests/integration/test_openai_passthrough/test_responses_crud.py +git commit -m "feat(openai-passthrough): /responses CRUD passthrough (GET, DELETE, cancel, input_items)" +``` + +--- + +## Task 12: Add /models passthrough endpoint + +**Files:** +- Modify: `app/api/openai_passthrough/router.py` +- Test: `tests/integration/test_openai_passthrough/test_models.py` + +- [ ] **Step 1: Write the failing test** + +Create `tests/integration/test_openai_passthrough/test_models.py`: + +```python +"""Integration test for GET /openai/v1/models — pure passthrough.""" +import httpx + + +def test_list_models_forwards(client, respx_mock): + upstream = { + "object": "list", + "data": [ + {"id": "openai.gpt-oss-120b", "object": "model"}, + {"id": "us.anthropic.claude-sonnet-4-6", "object": "model"}, + ], + } + respx_mock.get("/models").mock(return_value=httpx.Response(200, json=upstream)) + + r = client.get("/openai/v1/models", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 200 + assert r.json() == upstream +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_models.py -v --no-cov` +Expected: 404 — endpoint doesn't exist. 
+ +- [ ] **Step 3: Add the endpoint** + +In `app/api/openai_passthrough/router.py`, add at the end: + +```python +@router.get("/models") +async def list_models( + request: Request, + _: Dict[str, Any] = Depends(get_api_key_info), +): + return await _passthrough_request(request, "/models") +``` + +- [ ] **Step 4: Run the test to verify it passes** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_models.py -v --no-cov` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add app/api/openai_passthrough/router.py tests/integration/test_openai_passthrough/test_models.py +git commit -m "feat(openai-passthrough): /models endpoint passthrough" +``` + +--- + +## Task 13: Add Bedrock guardrail header passthrough + +The router's `_passthrough_extra_headers` already forwards `X-Amzn-Bedrock-*` headers. This task adds an explicit test so the behavior is locked in. + +**Files:** +- Test: `tests/integration/test_openai_passthrough/test_chat_completions.py` (extend) + +- [ ] **Step 1: Append the test** + +Append to `tests/integration/test_openai_passthrough/test_chat_completions.py`: + +```python +def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock): + """X-Amzn-Bedrock-* headers from the client should reach the upstream call.""" + route = respx_mock.post("/chat/completions").mock( + return_value=httpx.Response(200, json={ + "id": "x", "choices": [], + "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2}, + }) + ) + client.post( + "/openai/v1/chat/completions", + headers={ + "Authorization": "Bearer sk-test", + "X-Amzn-Bedrock-GuardrailIdentifier": "GR12345", + "X-Amzn-Bedrock-GuardrailVersion": "DRAFT", + }, + json={"model": "m", "messages": [{"role": "user", "content": "hi"}]}, + ) + sent = route.calls[0].request + assert sent.headers["x-amzn-bedrock-guardrailidentifier"] == "GR12345" + assert sent.headers["x-amzn-bedrock-guardrailversion"] == "DRAFT" +``` + +- [ ] **Step 2: Run the 
test** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_chat_completions.py::test_bedrock_guardrail_headers_are_forwarded -v --no-cov` +Expected: PASS (the router already forwards these). + +- [ ] **Step 3: Commit** + +```bash +git add tests/integration/test_openai_passthrough/test_chat_completions.py +git commit -m "test(openai-passthrough): pin guardrail header forwarding behavior" +``` + +--- + +## Task 14: Final integration verification + full test suite + +**Files:** none + +- [ ] **Step 1: Run the full openai_passthrough unit and integration suites** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough tests/integration/test_openai_passthrough -v --no-cov` +Expected: All tests PASS (~30 tests). + +- [ ] **Step 2: Run the entire unit test suite** + +Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit -q --no-cov` +Expected: All previously-passing tests still pass. + +- [ ] **Step 3: Lint check** + +Run: `unset VIRTUAL_ENV && uv run --active ruff check app/api/openai_passthrough app/middleware/auth.py app/db/dynamodb.py` +Expected: No errors. Fix any issues with `ruff check --fix`. + +- [ ] **Step 4: Type check** + +Run: `unset VIRTUAL_ENV && uv run --active mypy app/api/openai_passthrough 2>&1 | tail -20` +Expected: No new errors introduced. Pre-existing project-wide errors are fine — focus only on the new module. + +- [ ] **Step 5: If lint/type fixes were needed, commit** + +```bash +git add app/api/openai_passthrough +git commit -m "chore(openai-passthrough): lint and type cleanup" +``` + +(Skip this step if Steps 3 and 4 were already clean.) 
+ +--- + +## Task 15: Documentation updates + +**Files:** +- Modify: `env.example` +- Modify: `CLAUDE.md` +- Modify: `docs/architecture/features.md` + +- [ ] **Step 1: Update env.example** + +Find the existing `ENABLE_OPENAI_COMPAT` block in `env.example` and add a new entry below it: + +``` +# OpenAI Passthrough — mount /openai/v1/* endpoints accepting native OpenAI +# Chat Completions and Responses API requests, forwarded to bedrock-mantle. +# Independent of ENABLE_OPENAI_COMPAT (the two flags can be enabled together). +# Reuses OPENAI_API_KEY and OPENAI_BASE_URL. +ENABLE_OPENAI_PASSTHROUGH=False +``` + +- [ ] **Step 2: Update CLAUDE.md** + +In `CLAUDE.md`, find the "Features" section (around line 95–110, after "OpenAI-Compatible API"). Add a new bullet: + +``` +- **OpenAI Passthrough**: New `/openai/v1/*` endpoints accept OpenAI-native Chat Completions and Responses API requests and forward them to bedrock-mantle. Distinct from `ENABLE_OPENAI_COMPAT` (which routes Anthropic-format requests on `/v1/messages`). Reuses proxy API key auth, rate limits, budgets, and usage tracking. Controlled by `ENABLE_OPENAI_PASSTHROUGH`. +``` + +In the "Dual API Mode" section, add a third bullet: + +``` +- **OpenAI Passthrough** (any model bedrock-mantle accepts, optional): When `ENABLE_OPENAI_PASSTHROUGH=True`, mounts `/openai/v1/{chat/completions,responses,responses/{id},models}` for clients using OpenAI-format directly. +``` + +- [ ] **Step 3: Add detailed feature doc** + +Append to `docs/architecture/features.md`: + +```markdown +## OpenAI Passthrough + +Adds new `/openai/v1/*` endpoints that accept OpenAI-native API formats and forward them to `bedrock-mantle`. Distinct from `ENABLE_OPENAI_COMPAT` (which converts Anthropic-format requests on `/v1/messages` into OpenAI calls). + +### When to use it + +- You have client code using the OpenAI Python/JS SDK and want to point it at Bedrock without rewriting. 
+- You want stateful conversation chaining via the Responses API (`previous_response_id`, `store=true`). +- You want the proxy's API key auth, rate limits, budgets, and usage analytics for OpenAI-format traffic too. + +### Configuration + +```bash +ENABLE_OPENAI_PASSTHROUGH=True +OPENAI_API_KEY= +OPENAI_BASE_URL=https://bedrock-mantle.us-east-1.api.aws/v1 +``` + +### Endpoints + +| Method | Path | Purpose | +|---|---|---| +| POST | `/openai/v1/chat/completions` | Chat Completions (streaming + non-streaming) | +| POST | `/openai/v1/responses` | Responses API (streaming + non-streaming) | +| GET | `/openai/v1/responses/{id}` | Retrieve stored response | +| DELETE | `/openai/v1/responses/{id}` | Delete stored response | +| GET | `/openai/v1/responses/{id}/input_items` | List input items | +| POST | `/openai/v1/responses/{id}/cancel` | Cancel background response | +| GET | `/openai/v1/models` | List available models | + +### OpenAI SDK example + +```python +from openai import OpenAI + +client = OpenAI( + api_key="", + base_url="https://your-proxy.example.com/openai/v1", +) +resp = client.chat.completions.create( + model="openai.gpt-oss-120b", + messages=[{"role": "user", "content": "Hello!"}], +) +``` + +### Auth + +Either `Authorization: Bearer <proxy-api-key>` (OpenAI SDK default) or `x-api-key: <proxy-api-key>` works. The proxy uses its configured `OPENAI_API_KEY` (Bedrock API key) for the upstream call. + +### Model mapping + +The existing `anthropic-proxy-model-mapping` table is consulted. If a mapping exists, the client-supplied `model` is replaced before forwarding. If no mapping exists, the model ID is passed through unchanged — so Bedrock-native IDs like `openai.gpt-oss-120b` work without registration. + +### Usage tracking + +Usage is normalized into the existing `anthropic-proxy-usage` schema. 
Two new sparse columns are written: + +- `api_surface` ∈ `{"messages", "chat_completions", "responses"}` +- `reasoning_tokens` (already counted in `output_tokens`; stored separately for visibility) + +For streaming Chat Completions, clients must set `stream_options: {"include_usage": true}` for usage to be captured. Without it, no usage chunk arrives and no usage record is written for the request. The Responses API always emits `response.completed` with usage. + +### Guardrails + +`X-Amzn-Bedrock-*` headers from the client (e.g. `X-Amzn-Bedrock-GuardrailIdentifier`) are forwarded to bedrock-mantle. +``` + +- [ ] **Step 4: Commit** + +```bash +git add env.example CLAUDE.md docs/architecture/features.md +git commit -m "docs(openai-passthrough): document new feature in env.example, CLAUDE.md, and features.md" +``` + +--- + +## Task 16: Final verification + +- [ ] **Step 1: Sanity import the app with the flag enabled** + +Run: +```bash +unset VIRTUAL_ENV && ENABLE_OPENAI_PASSTHROUGH=True uv run --active python -c " +from app.main import app +paths = sorted({r.path for r in app.routes}) +expected = [ + '/openai/v1/chat/completions', + '/openai/v1/models', + '/openai/v1/responses', + '/openai/v1/responses/{response_id}', + '/openai/v1/responses/{response_id}/cancel', + '/openai/v1/responses/{response_id}/input_items', +] +for p in expected: + assert p in paths, f'missing {p}; got {paths}' +print('all routes registered') +" +``` +Expected output: `all routes registered` + +- [ ] **Step 2: Sanity import with the flag disabled** + +Run: +```bash +unset VIRTUAL_ENV && ENABLE_OPENAI_PASSTHROUGH=False uv run --active python -c " +from app.main import app +paths = {r.path for r in app.routes} +assert not any(p.startswith('/openai/v1') for p in paths), f'unexpected: {[p for p in paths if p.startswith(\"/openai/v1\")]}' +print('flag-off cleanly excludes routes') +" +``` +Expected output: `flag-off cleanly excludes routes` + +- [ ] **Step 3: Final full test suite** + +Run: `unset VIRTUAL_ENV && uv run --active pytest 
tests/unit tests/integration/test_openai_passthrough -q --no-cov` +Expected: All tests PASS, no failures or errors. + +- [ ] **Step 4: Show final git log to confirm commit shape** + +Run: `git log --oneline main..HEAD` +Expected: ~13 commits with `feat(...)`, `test(...)`, `docs(...)`, and possibly `chore(...)` prefixes. + +--- + +## Self-Review Notes + +Items I verified before finalizing: + +- **Spec coverage:** All 8 implementation steps from the design doc are covered (config flag → auth → client → non-streaming chat → streaming chat → responses POST → responses CRUD → docs). Plus tasks for usage extension, model mapping, /models, guardrails, and final verification. +- **Type/name consistency:** `normalize_usage`, `try_extract_usage_from_sse`, `resolve_model_id`, `stream_passthrough`, `upstream_headers`, `_passthrough_extra_headers`, `_passthrough_request`, `_record_usage` — all introduced once and referenced consistently. +- **No placeholders:** Every code step has full code, every test has assertions, every command has expected output. +- **TDD throughout:** Each task that introduces logic starts with a failing test. +- **Frequent commits:** 13–14 separate commits, one per task, with conventional-commit prefixes matching the project's existing style. +- **Open items from design doc:** + - OTEL tracing — explicitly deferred (not in any task). + - Admin portal `api_surface` filter — explicitly deferred (not in any task). + - Guardrails passthrough — included in Task 13 (test pinning the behavior already implemented in Task 8's `_passthrough_extra_headers`). From 14a0b8fdb1a60deea71e7f9144b18a3ba03dce81 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 05:07:59 +0000 Subject: [PATCH 02/22] docs(design): add OpenAI passthrough endpoints design doc Mounts /openai/v1/* (chat/completions, responses + CRUD, models) as raw httpx passthrough to bedrock-mantle. Reuses proxy API key auth, rate limits, budgets, and usage tracking. 
Independent of ENABLE_OPENAI_COMPAT. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-25-openai-passthrough-design.md | 354 ++++++++++++++++++ 1 file changed, 354 insertions(+) create mode 100644 docs/plans/2026-05-25-openai-passthrough-design.md diff --git a/docs/plans/2026-05-25-openai-passthrough-design.md b/docs/plans/2026-05-25-openai-passthrough-design.md new file mode 100644 index 0000000..5050c9e --- /dev/null +++ b/docs/plans/2026-05-25-openai-passthrough-design.md @@ -0,0 +1,354 @@ +# OpenAI Passthrough — Design Document + +**Status:** Approved (design) +**Author:** River Xie +**Date:** 2026-05-25 + +## Summary + +Add new client-facing endpoints that accept OpenAI-native API formats (Chat Completions and Responses) and forward them to AWS Bedrock's `bedrock-mantle` endpoint. Existing Anthropic-format endpoints (`/v1/messages`) are untouched. + +This is **distinct from** the existing `ENABLE_OPENAI_COMPAT` feature, which converts Anthropic-format requests on `/v1/messages` into OpenAI calls. The new feature exposes OpenAI-format directly so OpenAI SDK clients can hit the proxy without translation. + +## Motivation + +- OpenAI SDK users want to access non-Claude Bedrock models (gpt-oss-120b, etc.) through their existing OpenAI SDK code with minimal changes. +- The Responses API offers stateful conversation chaining (`previous_response_id`, `store=true`) that has no Anthropic equivalent and is awkward to expose through `/v1/messages`. +- Centralizing all model traffic through one proxy gives unified API key auth, budget tracking, rate limits, usage analytics, and pricing — regardless of wire format. + +## Non-Goals + +- Cross-format translation: OpenAI-in → Anthropic-out is not a goal. Both directions are OpenAI-format end-to-end on these new endpoints. +- OpenAI features that bedrock-mantle doesn't support (e.g. assistants API). +- OTEL tracing for the new endpoints (deferred to v2). 
+ +## Design Decisions + +The following were resolved during brainstorming: + +| # | Decision | Choice | +|---|---|---| +| 1 | Integration depth | **Full integration** — same proxy API key, budget, rate limit, usage tracking | +| 2 | Model scope | **Allow any model bedrock-mantle accepts** (no Claude-block) | +| 3 | Responses API surface | **Full CRUD** (POST + GET + DELETE + cancel + list_input_items) | +| 4 | URL routing | **`/openai/v1/...` prefix** (matches AWS bedrock-runtime convention) | +| 5 | Request handling | **Raw httpx passthrough** (no Pydantic schemas for OpenAI types) | +| 6 | Model ID mapping | **Apply mapping if exists, else passthrough** | +| 7 | Usage tracking | **Normalize into existing Anthropic-shaped schema** + new `api_surface` and `reasoning_tokens` columns | + +## High-Level Architecture + +### Module Layout + +``` +app/api/openai_passthrough/ +├── __init__.py # exposes APIRouter +├── router.py # FastAPI routes (chat, responses, models, CRUD) +├── client.py # httpx async client to bedrock-mantle (singleton) +├── usage_extractor.py # parse usage from JSON body or final SSE event +└── streaming.py # SSE passthrough + usage extraction tee +``` + +### Mounting + +The router mounts at `/openai/v1` only when the feature flag is enabled, in `app/main.py`: + +```python +if settings.enable_openai_passthrough: + from app.api.openai_passthrough import router as openai_router + app.include_router(openai_router, prefix="/openai/v1", tags=["OpenAI Passthrough"]) +``` + +### Endpoints + +| Method | Path | Notes | +|---|---|---| +| POST | `/chat/completions` | Streaming + non-streaming | +| POST | `/responses` | Streaming + non-streaming + background | +| GET | `/responses/{response_id}` | Retrieve stored response | +| DELETE | `/responses/{response_id}` | Delete stored response | +| GET | `/responses/{response_id}/input_items` | List input items | +| POST | `/responses/{response_id}/cancel` | Cancel background response | +| GET | `/models` | List models 
from Mantle | + +### Request Flow (POST chat/completions or responses) + +1. `verify_api_key` middleware (extended to read `Authorization: Bearer` + existing `x-api-key`) +2. Rate limit check (existing token bucket per API key) +3. Budget check (existing) +4. Parse request body as `dict` (no Pydantic validation) +5. Apply model mapping if exists +6. Forward via httpx to `{OPENAI_BASE_URL}/{path}` with proxy's Bedrock API key in `Authorization` +7. Stream/return response +8. Extract usage → log to `anthropic-proxy-usage` + `anthropic-proxy-usage-stats` with `api_surface` column + +### Request Flow (CRUD on /responses/{id}) + +1. `verify_api_key` +2. Rate limit check (shared bucket with POST) +3. **Skip** budget check and usage logging (no tokens consumed) +4. Forward verbatim to Mantle +5. Return verbatim + +## Auth & Middleware Changes + +### Header Acceptance + +`app/middleware/auth.py::verify_api_key` is extended to accept either `x-api-key` or `Authorization: Bearer`: + +```python +async def verify_api_key( + x_api_key: Optional[str] = Header(None, alias="x-api-key"), + authorization: Optional[str] = Header(None, alias="Authorization"), +) -> ApiKeyInfo: + api_key = x_api_key + if not api_key and authorization and authorization.startswith("Bearer "): + api_key = authorization[7:].strip() + if not api_key: + raise HTTPException(401, "Missing API key (x-api-key or Authorization: Bearer)") + # ... existing lookup logic unchanged +``` + +This is **backwards compatible**. If both headers are present, `x-api-key` wins (deterministic). + +### Rate Limiting + +No change. The existing rate limiter is keyed by `api_key_id`; once auth resolves, all endpoints (Anthropic and OpenAI) share the same per-key bucket. A client cannot dodge limits by switching API surfaces. + +### Budget + +Same. The budget check is per-key and surface-agnostic. POST endpoints check + update; non-POST endpoints (GET/DELETE/list/cancel) skip both since they are free operations. 
+ +### Bedrock-mantle Auth (Proxy → AWS) + +The proxy uses `OPENAI_API_KEY` (the Bedrock API key, already configured for the existing `ENABLE_OPENAI_COMPAT` feature) as `Authorization: Bearer` to bedrock-mantle: + +```python +headers = { + "Authorization": f"Bearer {settings.openai_api_key}", + "Content-Type": "application/json", +} +``` + +### Error Contract + +Mantle errors (4xx/5xx) are returned to the client **as-is** — same status code, same JSON body. No wrapping or rewriting. This preserves OpenAI-SDK error semantics so `OpenAIError` subclasses raise correctly client-side. + +The only proxy-injected errors are: +- `401` — bad proxy API key +- `429` — proxy rate limit +- `402` — budget exceeded + +### Proxy-Injected Headers (upstream) + +- `User-Agent: bedrock-api-proxy/<version>` +- `X-Proxy-Request-ID: <request-id>` for log correlation + +Both are zero-cost, useful for debugging, and ignored by Mantle. + +## Passthrough Client & Streaming + +### httpx Client (Singleton) + +```python +# app/api/openai_passthrough/client.py +import httpx +from app.core.config import settings + +_client: httpx.AsyncClient | None = None + +def get_client() -> httpx.AsyncClient: + global _client + if _client is None: + _client = httpx.AsyncClient( + base_url=settings.openai_base_url, + timeout=httpx.Timeout(settings.bedrock_timeout, connect=10.0), + limits=httpx.Limits(max_connections=200, max_keepalive_connections=50), + ) + return _client +``` + +### Non-Streaming POST + +```python +async def chat_completions(request: Request, api_key_info: ApiKeyInfo = Depends(verify_api_key)): + body = await request.json() + body["model"] = resolve_model_id(body.get("model", "")) + + if body.get("stream"): + return StreamingResponse( + stream_passthrough("/chat/completions", body, api_key_info, api_surface="chat_completions"), + media_type="text/event-stream", + ) + + resp = await get_client().post( + "/chat/completions", json=body, + headers={"Authorization": f"Bearer {settings.openai_api_key}"}, + ) + if
resp.status_code >= 400: + return JSONResponse(resp.json(), status_code=resp.status_code) + + data = resp.json() + log_usage_async(api_key_info, data.get("usage", {}), body["model"], "chat_completions") + return JSONResponse(data) +``` + +### Streaming Passthrough + +For SSE, we forward bytes line-by-line and *tee* to extract the final `usage` chunk. + +- **Chat Completions stream**: requires `stream_options: {"include_usage": true}` from the client. If sent, the second-to-last chunk has `usage`. If not, no usage extracted (proxy logs zero — documented behavior). +- **Responses API stream**: usage is on the `response.completed` event. Always present. + +```python +async def stream_passthrough(path, body, api_key_info, api_surface): + usage_holder: dict = {} + async with get_client().stream( + "POST", path, json=body, + headers={"Authorization": f"Bearer {settings.openai_api_key}"}, + ) as resp: + async for raw_line in resp.aiter_lines(): + yield (raw_line + "\n").encode() + try_extract_usage(raw_line, usage_holder, api_surface) + if usage_holder: + log_usage_async(api_key_info, usage_holder, body["model"], api_surface) +``` + +`try_extract_usage` is small (~30 LOC) — pattern matches `data: {...}` lines, JSON-parses, looks for `usage` field on completion events. + +### CRUD Endpoints + +Pure passthrough — forward method, path, body, query params; return status + body unchanged. 
~20 LOC for all four combined: + +```python +@router.api_route("/responses/{response_id}", methods=["GET", "DELETE"]) +async def responses_crud(response_id: str, request: Request, _=Depends(verify_api_key)): + resp = await get_client().request( + request.method, f"/responses/{response_id}", + headers={"Authorization": f"Bearer {settings.openai_api_key}"}, + ) + return Response(content=resp.content, status_code=resp.status_code, + media_type=resp.headers.get("content-type")) +``` + +### Edge Case — `store=true` and Pricing + +Mantle stores conversations for 30 days for free per the docs (no separate storage cost). We do not bill for it. If AWS adds a storage charge later, a feature flag can force `store=false`. + +## Usage Tracking + +### Normalization + +```python +# app/api/openai_passthrough/usage_extractor.py +def normalize_usage(raw: dict, api_surface: str) -> dict: + """Return Anthropic-shaped usage dict + reasoning_tokens.""" + if api_surface == "chat_completions": + in_tok = raw.get("prompt_tokens", 0) + out_tok = raw.get("completion_tokens", 0) + cached = raw.get("prompt_tokens_details", {}).get("cached_tokens", 0) + reasoning = raw.get("completion_tokens_details", {}).get("reasoning_tokens", 0) + else: # responses + in_tok = raw.get("input_tokens", 0) + out_tok = raw.get("output_tokens", 0) + cached = raw.get("input_tokens_details", {}).get("cached_tokens", 0) + reasoning = raw.get("output_tokens_details", {}).get("reasoning_tokens", 0) + return { + "input_tokens": in_tok - cached, # subtract: cache hits billed separately + "output_tokens": out_tok, # reasoning already included per spec + "cache_read_input_tokens": cached, + "cache_creation_input_tokens": 0, # OpenAI APIs don't expose this + "reasoning_tokens": reasoning, # new optional column + } +``` + +### DDB Schema Additions + +Added to `anthropic-proxy-usage`: + +| Field | Type | Default | Notes | +|---|---|---|---| +| `api_surface` | string | `"messages"` | One of `messages`, `chat_completions`, 
`responses` | +| `reasoning_tokens` | integer | `0` | Optional, sparse | + +Both are sparse attributes — DynamoDB will not reject existing rows. **No migration required**; old rows simply will not have these fields when read. + +### Pricing Lookup + +Existing `anthropic-proxy-model-pricing` is keyed by Bedrock model ID. After model mapping, we have the Bedrock ID, so pricing works unchanged. Models missing from the pricing table log usage with `cost=0` and emit a warning (existing behavior). + +## Configuration + +### New Env Var + +Added to `app/core/config.py`: + +```python +enable_openai_passthrough: bool = Field( + default=False, alias="ENABLE_OPENAI_PASSTHROUGH", + description="Mount /openai/v1/* endpoints (Chat Completions + Responses passthrough to bedrock-mantle)" +) +``` + +### Reused Vars + +- `OPENAI_API_KEY` — Bedrock API key for bedrock-mantle (already exists) +- `OPENAI_BASE_URL` — Mantle endpoint URL (already exists) + +### Flag Interaction + +`ENABLE_OPENAI_COMPAT` (existing) and `ENABLE_OPENAI_PASSTHROUGH` (new) are **independent** and can be enabled together. They affect different endpoints: + +- `ENABLE_OPENAI_COMPAT=True`: routes non-Claude traffic on `/v1/messages` through bedrock-mantle (Anthropic↔OpenAI conversion) +- `ENABLE_OPENAI_PASSTHROUGH=True`: mounts `/openai/v1/*` endpoints (no conversion, raw forward) + +## Testing Strategy + +### Unit Tests (`tests/unit/test_openai_passthrough/`) + +- `test_usage_extractor.py` — normalize chat_completions and responses usage shapes (incl. missing/zero fields, cached tokens, reasoning tokens) +- `test_model_mapping.py` — passthrough when no mapping exists, substitution when it does +- `test_auth.py` — `Authorization: Bearer` resolves to API key correctly; both-headers precedence + +### Integration Tests (`tests/integration/test_openai_passthrough/`) + +`respx` mocks bedrock-mantle. 
Tests cover: + +- POST chat/completions non-streaming → usage logged correctly +- POST chat/completions streaming with `include_usage=true` → usage logged from second-to-last chunk +- POST chat/completions streaming **without** `include_usage` → request succeeds, usage logged as zero +- POST responses streaming → usage logged from `response.completed` event +- POST responses non-streaming → usage logged from response body +- GET /responses/{id} forwards correctly +- DELETE /responses/{id} forwards correctly +- POST /responses/{id}/cancel forwards correctly +- GET /responses/{id}/input_items forwards correctly +- 4xx from Mantle returned verbatim (status + body) +- Rate limit on shared bucket triggers across both surfaces (mix `/v1/messages` and `/openai/v1/chat/completions` traffic) +- Budget exhaustion blocks POST endpoints but not CRUD endpoints + +## Documentation Updates + +- `CLAUDE.md` — new "Features" entry: "OpenAI Passthrough" +- `docs/architecture/features.md` — detailed feature doc with examples +- `env.example` — new flag with comment +- `README.md` / `README_ZH.md` — usage example with OpenAI SDK pointing at `/openai/v1` + +## Open Items (Deferred) + +1. **OTEL tracing** — additive, deferred to v2. +2. **Admin portal `api_surface` filter** — existing dashboards aggregate fine; add filter when needed. +3. **Guardrails passthrough** — Mantle Chat Completions supports guardrails via `X-Amzn-Bedrock-GuardrailIdentifier` headers. Recommend whitelisting `X-Amzn-Bedrock-*` headers in the passthrough on initial implementation. Trivial addition (~5 LOC), high value for guardrail-using customers. **Confirm before implementation.** + +## Implementation Sequence + +Once approved: + +1. Schema/config skeleton: feature flag, DDB column additions to usage manager, normalization function with unit tests +2. Auth middleware extension (`Authorization: Bearer` support) with unit tests +3. 
httpx client singleton + non-streaming chat/completions endpoint + integration test +4. Streaming chat/completions + usage tee + integration test +5. Responses API POST (streaming + non-streaming) + integration tests +6. Responses CRUD endpoints (GET, DELETE, cancel, list_input_items) + integration tests +7. `/models` passthrough endpoint +8. Documentation updates (CLAUDE.md, features.md, env.example, READMEs) From b2cfff2551c23e8776675047b1304ca79a5b9afb Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 06:15:21 +0000 Subject: [PATCH 03/22] feat(openai-passthrough): add ENABLE_OPENAI_PASSTHROUGH flag and respx dev dep --- app/core/config.py | 5 +++++ pyproject.toml | 1 + uv.lock | 15 +++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/app/core/config.py b/app/core/config.py index 9527b2a..bb66386 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -404,6 +404,11 @@ class Settings(BaseSettings): alias="OPENAI_COMPAT_THINKING_MEDIUM_THRESHOLD", description="budget_tokens >= this → reasoning effort 'medium', below → 'low'" ) + enable_openai_passthrough: bool = Field( + default=False, + alias="ENABLE_OPENAI_PASSTHROUGH", + description="Mount /openai/v1/* endpoints (Chat Completions + Responses passthrough to bedrock-mantle)" + ) # === Multi-Provider Gateway Feature Flags === multi_provider_enabled: bool = Field( diff --git a/pyproject.toml b/pyproject.toml index f04858f..800294e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dev = [ "pytest-asyncio>=0.23.0,<2.0.0", "pytest-cov>=4.1.0", "pytest-mock>=3.12.0", + "respx>=0.21.0", "httpx>=0.27.0", # Property-Based Testing "hypothesis>=6.100.0", diff --git a/uv.lock b/uv.lock index 9419635..6bf23ec 100644 --- a/uv.lock +++ b/uv.lock @@ -210,6 +210,7 @@ dev = [ { name = "pytest-asyncio" }, { name = "pytest-cov" }, { name = "pytest-mock" }, + { name = "respx" }, { name = "ruff" }, { name = "types-boto3" }, ] @@ -251,6 +252,7 @@ requires-dist = [ { name = "python-dotenv", 
specifier = ">=1.0.0" }, { name = "python-jose", extras = ["cryptography"], specifier = ">=3.3.0" }, { name = "python-multipart", specifier = ">=0.0.22" }, + { name = "respx", marker = "extra == 'dev'", specifier = ">=0.21.0" }, { name = "routellm", marker = "extra == 'smart-routing'", specifier = ">=0.1.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.3.0" }, { name = "sentry-sdk", extras = ["fastapi"], marker = "extra == 'monitoring'", specifier = ">=1.40.0" }, @@ -360,6 +362,7 @@ dependencies = [ { name = "jmespath" }, { name = "s3transfer" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/f1/13/33c8b8704d677fcaf5555ba8c6cc39468fc7b9a0c6b6c496e008cd5557fc/boto3-1.42.76.tar.gz", hash = "sha256:aa2b1973eee8973a9475d24bb579b1dee7176595338d4e4f7880b5c6189b8814", size = 112789, upload-time = "2026-03-25T19:33:25.985Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/f0/dc/21b3dfb135125eb7e3a46b9aab0aede847726f239fc8f39474742a87ebb0/boto3-1.42.76-py3-none-any.whl", hash = "sha256:63c6779c814847016b89ae1b72ed968f8a63d80e589ba337511aa6fc1b59585e", size = 140557, upload-time = "2026-03-25T19:33:23.289Z" }, ] @@ -2972,6 +2975,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl", hash = "sha256:03ec4409088cd5c66b71ecbbbd27fe2c58ddfad801c66203457b3e6a04868c37", size = 35099, upload-time = "2026-02-19T14:38:03.847Z" }, ] +[[package]] +name = "respx" +version = "0.23.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/98/4e55c9c486404ec12373708d015ebce157966965a5ebe7f28ff2c784d41b/respx-0.23.1.tar.gz", hash = "sha256:242dcc6ce6b5b9bf621f5870c82a63997e8e82bc7c947f9ffe272b8f3dd5a780", size = 29243, upload-time = "2026-04-08T14:37:16.008Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/1d/4a/221da6ca167db45693d8d26c7dc79ccfc978a440251bf6721c9aaf251ac0/respx-0.23.1-py2.py3-none-any.whl", hash = "sha256:b18004b029935384bccfa6d7d9d74b4ec9af73a081cc28600fffc0447f4b8c1a", size = 25557, upload-time = "2026-04-08T14:37:14.613Z" }, +] + [[package]] name = "rfc3339-validator" version = "0.1.4" From e9f43fd2a1b6d9d9df4fd33cdfd6464f425e228f Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 06:20:27 +0000 Subject: [PATCH 04/22] feat(auth): accept Authorization: Bearer alongside x-api-key --- app/middleware/auth.py | 8 +- .../unit/test_openai_passthrough/__init__.py | 0 .../unit/test_openai_passthrough/test_auth.py | 138 ++++++++++++++++++ 3 files changed, 144 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_openai_passthrough/__init__.py create mode 100644 tests/unit/test_openai_passthrough/test_auth.py diff --git a/app/middleware/auth.py b/app/middleware/auth.py index 71abb85..a758732 100644 --- a/app/middleware/auth.py +++ b/app/middleware/auth.py @@ -59,8 +59,12 @@ async def dispatch(self, request: Request, call_next: Callable): request.state.api_key_info = None return await call_next(request) - # Extract API key from header + # Extract API key from header (x-api-key first, fall back to Authorization: Bearer) api_key = request.headers.get(settings.api_key_header) + if not api_key: + authz = request.headers.get("Authorization") + if authz and authz.startswith("Bearer "): + api_key = authz[len("Bearer "):].strip() if not api_key: print(f"[AUTH] Missing API key for {request.url.path}") @@ -71,7 +75,7 @@ async def dispatch(self, request: Request, call_next: Callable): "type": "error", "error": { "type": "authentication_error", - "message": f"Missing API key in {settings.api_key_header} header", + "message": f"Missing API key in {settings.api_key_header} or Authorization: Bearer header", }, }, ) diff --git a/tests/unit/test_openai_passthrough/__init__.py 
b/tests/unit/test_openai_passthrough/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_openai_passthrough/test_auth.py b/tests/unit/test_openai_passthrough/test_auth.py new file mode 100644 index 0000000..df03fdd --- /dev/null +++ b/tests/unit/test_openai_passthrough/test_auth.py @@ -0,0 +1,138 @@ +"""Tests for the auth middleware's Authorization: Bearer support.""" +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from starlette.datastructures import Headers +from starlette.requests import Request + +from app.middleware.auth import AuthMiddleware + + +@pytest.mark.asyncio +async def test_authorization_bearer_resolves_when_xapikey_missing(monkeypatch): + """Authorization: Bearer should authenticate when x-api-key is absent.""" + # Patch settings + monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + monkeypatch.setattr("app.core.config.settings.api_key_header", "x-api-key") + + # Create mock request with Authorization: Bearer header + request = MagicMock(spec=Request) + request.url.path = "/test" + request.headers = Headers({"Authorization": "Bearer sk-abc"}) + request.state = MagicMock() + + # Mock the API key manager + mock_manager = MagicMock() + mock_manager.validate_api_key.return_value = {"user_id": "u1", "api_key": "sk-abc"} + + # Mock the call_next + mock_call_next = AsyncMock() + mock_call_next.return_value = MagicMock(status_code=200) + + # Create middleware with mocked APIKeyManager + ddb_client = MagicMock() + with patch("app.middleware.auth.APIKeyManager", return_value=mock_manager): + middleware = AuthMiddleware(MagicMock(), dynamodb_client=ddb_client) + + # Call dispatch + await middleware.dispatch(request, mock_call_next) + + # Verify the API key was extracted and validated + mock_manager.validate_api_key.assert_called_once_with("sk-abc") + assert request.state.api_key_info == {"user_id": "u1", "api_key": 
"sk-abc"} + + +@pytest.mark.asyncio +async def test_xapikey_takes_precedence_when_both_present(monkeypatch): + """If both headers are present, x-api-key wins.""" + # Patch settings + monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + monkeypatch.setattr("app.core.config.settings.api_key_header", "x-api-key") + + # Create mock request with both headers + request = MagicMock(spec=Request) + request.url.path = "/test" + request.headers = Headers({ + "x-api-key": "sk-from-xapikey", + "Authorization": "Bearer sk-from-bearer" + }) + request.state = MagicMock() + + # Mock the API key manager + mock_manager = MagicMock() + mock_manager.validate_api_key.return_value = {"user_id": "u1", "api_key": "sk-from-xapikey"} + + # Mock the call_next + mock_call_next = AsyncMock() + mock_call_next.return_value = MagicMock(status_code=200) + + # Create middleware with mocked APIKeyManager + ddb_client = MagicMock() + with patch("app.middleware.auth.APIKeyManager", return_value=mock_manager): + middleware = AuthMiddleware(MagicMock(), dynamodb_client=ddb_client) + + # Call dispatch + await middleware.dispatch(request, mock_call_next) + + # Verify x-api-key took precedence + mock_manager.validate_api_key.assert_called_once_with("sk-from-xapikey") + + +@pytest.mark.asyncio +async def test_missing_both_headers_returns_401(monkeypatch): + """Missing both headers should return 401.""" + # Patch settings + monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + monkeypatch.setattr("app.core.config.settings.api_key_header", "x-api-key") + + # Create mock request with no auth headers + request = MagicMock(spec=Request) + request.url.path = "/test" + request.headers = Headers({}) + request.state = MagicMock() + + # Mock the call_next + mock_call_next = AsyncMock() + + # Create middleware with mocked APIKeyManager + 
ddb_client = MagicMock() + with patch("app.middleware.auth.APIKeyManager", return_value=MagicMock()): + middleware = AuthMiddleware(MagicMock(), dynamodb_client=ddb_client) + + # Call dispatch + response = await middleware.dispatch(request, mock_call_next) + + # Verify 401 response + assert response.status_code == 401 + + +@pytest.mark.asyncio +async def test_authorization_non_bearer_is_ignored(monkeypatch): + """Authorization: Basic ... should not be treated as an API key.""" + # Patch settings + monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + monkeypatch.setattr("app.core.config.settings.api_key_header", "x-api-key") + + # Create mock request with Basic auth + request = MagicMock(spec=Request) + request.url.path = "/test" + request.headers = Headers({"Authorization": "Basic dXNlcjpwYXNz"}) + request.state = MagicMock() + + # Mock the call_next + mock_call_next = AsyncMock() + + # Create middleware with mocked APIKeyManager + ddb_client = MagicMock() + with patch("app.middleware.auth.APIKeyManager", return_value=MagicMock()): + middleware = AuthMiddleware(MagicMock(), dynamodb_client=ddb_client) + + # Call dispatch + response = await middleware.dispatch(request, mock_call_next) + + # Verify 401 response + assert response.status_code == 401 From 24d91aee80a002d5e44f1fc0eed89997dcbcf454 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 06:28:21 +0000 Subject: [PATCH 05/22] feat(openai-passthrough): add usage normalization and SSE extraction helpers --- app/api/openai_passthrough/__init__.py | 0 app/api/openai_passthrough/usage_extractor.py | 84 +++++++++++++ .../test_usage_extractor.py | 118 ++++++++++++++++++ 3 files changed, 202 insertions(+) create mode 100644 app/api/openai_passthrough/__init__.py create mode 100644 app/api/openai_passthrough/usage_extractor.py create mode 100644 tests/unit/test_openai_passthrough/test_usage_extractor.py diff --git 
a/app/api/openai_passthrough/__init__.py b/app/api/openai_passthrough/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/openai_passthrough/usage_extractor.py b/app/api/openai_passthrough/usage_extractor.py new file mode 100644 index 0000000..96a5f1e --- /dev/null +++ b/app/api/openai_passthrough/usage_extractor.py @@ -0,0 +1,84 @@ +"""Usage extraction and normalization for OpenAI-format responses. + +normalize_usage() converts an OpenAI Chat Completions or Responses API usage +dict into the Anthropic-shaped dict that UsageTracker.record_usage expects, +plus a separate reasoning_tokens field. + +try_extract_usage_from_sse() peeks at SSE lines during streaming and stashes +the most recent usage dict (raw OpenAI shape) it encounters. The caller +later passes that dict through normalize_usage(). +""" +from __future__ import annotations + +import json +from typing import Any, Dict + + +def normalize_usage(raw: Dict[str, Any], api_surface: str) -> Dict[str, int]: + """Normalize OpenAI-shaped usage into Anthropic-shaped fields. + + api_surface: "chat_completions" or "responses" + """ + if api_surface == "chat_completions": + in_tok = int(raw.get("prompt_tokens", 0) or 0) + out_tok = int(raw.get("completion_tokens", 0) or 0) + cached = int((raw.get("prompt_tokens_details") or {}).get("cached_tokens", 0) or 0) + reasoning = int( + (raw.get("completion_tokens_details") or {}).get("reasoning_tokens", 0) or 0 + ) + else: # responses + in_tok = int(raw.get("input_tokens", 0) or 0) + out_tok = int(raw.get("output_tokens", 0) or 0) + cached = int((raw.get("input_tokens_details") or {}).get("cached_tokens", 0) or 0) + reasoning = int( + (raw.get("output_tokens_details") or {}).get("reasoning_tokens", 0) or 0 + ) + + # Cache-read tokens are billed separately, so subtract them from input_tokens + # to mirror how the Anthropic flow accounts for cache hits.
+ return { + "input_tokens": max(in_tok - cached, 0), + "output_tokens": out_tok, + "cache_read_input_tokens": cached, + "cache_creation_input_tokens": 0, # Not exposed by OpenAI-format APIs + "reasoning_tokens": reasoning, + } + + +def try_extract_usage_from_sse( + raw_line: str, holder: Dict[str, Any], api_surface: str +) -> None: + """Inspect an SSE line and, if it carries usage info, store it in holder. + + Mutates `holder` in place. Each usage-bearing line replaces the previous + contents, so the last-seen usage event wins (which is what we want — both + APIs put usage on the terminal event). + """ + line = raw_line.strip() + if not line.startswith("data:"): + return + + payload = line[len("data:"):].strip() + if not payload or payload == "[DONE]": + return + + try: + obj = json.loads(payload) + except (ValueError, TypeError): + return + + if api_surface == "chat_completions": + usage = obj.get("usage") + if isinstance(usage, dict): + holder.clear() + holder.update(usage) + else: # responses + # Usage lives on the `response.completed` event under + # event.response.usage. Events of any other type are skipped by the + # type check below, so only that event's usage is stored.
+ if obj.get("type") == "response.completed": + response_obj = obj.get("response") or {} + usage = response_obj.get("usage") + if isinstance(usage, dict): + holder.clear() + holder.update(usage) diff --git a/tests/unit/test_openai_passthrough/test_usage_extractor.py b/tests/unit/test_openai_passthrough/test_usage_extractor.py new file mode 100644 index 0000000..99230a9 --- /dev/null +++ b/tests/unit/test_openai_passthrough/test_usage_extractor.py @@ -0,0 +1,118 @@ +"""Tests for normalize_usage and try_extract_usage_from_sse.""" +import json + +from app.api.openai_passthrough.usage_extractor import ( + normalize_usage, + try_extract_usage_from_sse, +) + + +def test_normalize_chat_completions_basic(): + raw = {"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150} + result = normalize_usage(raw, "chat_completions") + assert result == { + "input_tokens": 100, + "output_tokens": 50, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + "reasoning_tokens": 0, + } + + +def test_normalize_chat_completions_with_cache_and_reasoning(): + raw = { + "prompt_tokens": 100, + "completion_tokens": 50, + "prompt_tokens_details": {"cached_tokens": 30}, + "completion_tokens_details": {"reasoning_tokens": 20}, + } + result = normalize_usage(raw, "chat_completions") + # cache hits subtracted from input + assert result["input_tokens"] == 70 + assert result["output_tokens"] == 50 + assert result["cache_read_input_tokens"] == 30 + assert result["reasoning_tokens"] == 20 + + +def test_normalize_responses_basic(): + raw = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} + result = normalize_usage(raw, "responses") + assert result["input_tokens"] == 100 + assert result["output_tokens"] == 50 + assert result["cache_read_input_tokens"] == 0 + assert result["reasoning_tokens"] == 0 + + +def test_normalize_responses_with_cache_and_reasoning(): + raw = { + "input_tokens": 100, + "output_tokens": 50, + "input_tokens_details": {"cached_tokens": 25}, + 
"output_tokens_details": {"reasoning_tokens": 15}, + } + result = normalize_usage(raw, "responses") + assert result["input_tokens"] == 75 + assert result["output_tokens"] == 50 + assert result["cache_read_input_tokens"] == 25 + assert result["reasoning_tokens"] == 15 + + +def test_normalize_handles_missing_fields(): + """Empty/None usage should normalize to all-zeros, not crash.""" + result = normalize_usage({}, "chat_completions") + assert result == { + "input_tokens": 0, "output_tokens": 0, + "cache_read_input_tokens": 0, "cache_creation_input_tokens": 0, + "reasoning_tokens": 0, + } + + +def test_extract_chat_completions_usage_from_sse_chunk(): + """Final chat-completions chunk with usage should be picked up.""" + line = "data: " + json.dumps({ + "id": "chatcmpl-1", "choices": [], + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, + }) + holder: dict = {} + try_extract_usage_from_sse(line, holder, "chat_completions") + assert holder == {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + + +def test_extract_responses_usage_from_response_completed_event(): + line = "data: " + json.dumps({ + "type": "response.completed", + "response": { + "id": "resp-1", + "usage": {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28}, + }, + }) + holder: dict = {} + try_extract_usage_from_sse(line, holder, "responses") + assert holder == {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28} + + +def test_extract_ignores_non_data_lines(): + holder: dict = {} + try_extract_usage_from_sse("event: response.completed", holder, "responses") + try_extract_usage_from_sse("", holder, "responses") + try_extract_usage_from_sse(": keepalive", holder, "responses") + assert holder == {} + + +def test_extract_ignores_data_done(): + holder: dict = {} + try_extract_usage_from_sse("data: [DONE]", holder, "chat_completions") + assert holder == {} + + +def test_extract_ignores_chunks_without_usage(): + line = "data: " + json.dumps({"choices": 
def resolve_model_id(model: str, model_mapping_manager) -> str:
    """Translate a client-supplied model ID through the mapping table.

    Args:
        model: The ``model`` field from the client request.
        model_mapping_manager: Object exposing ``get_mapping(model)``.

    Returns:
        The mapped ID when the lookup yields a truthy value; otherwise the
        original ``model`` (also when ``model`` is empty or the lookup raises,
        in which case a warning is logged).
    """
    if model:
        try:
            replacement = model_mapping_manager.get_mapping(model)
        except Exception as exc:
            # Best-effort lookup: a failing mapping store must not fail the request.
            logger.warning(
                "[OPENAI-PASSTHROUGH] model mapping lookup failed for %r: %s", model, exc
            )
        else:
            if replacement:
                return replacement
    return model
tests/unit/test_openai_passthrough/test_usage_tracker_extended.py diff --git a/app/db/dynamodb.py b/app/db/dynamodb.py index 8094b56..601a418 100644 --- a/app/db/dynamodb.py +++ b/app/db/dynamodb.py @@ -918,6 +918,8 @@ def record_usage( error_message: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, cache_ttl: Optional[str] = None, + api_surface: Optional[str] = None, + reasoning_tokens: int = 0, ): """ Record API usage. @@ -934,6 +936,8 @@ def record_usage( error_message: Error message if failed metadata: Optional metadata cache_ttl: Effective cache TTL used ("5m" or "1h"), for billing differentiation + api_surface: Source endpoint family ("messages", "chat_completions", or "responses") + reasoning_tokens: Reasoning tokens (already counted in output_tokens; stored separately for visibility) """ # Use string timestamp to match CDK table schema (STRING type) current_time = int(time.time()) @@ -962,6 +966,11 @@ def record_usage( if cache_ttl: item["cache_ttl"] = cache_ttl + if api_surface: + item["api_surface"] = api_surface + if reasoning_tokens: + item["reasoning_tokens"] = reasoning_tokens + # Add TTL if enabled (usage_ttl_days > 0) if settings.usage_ttl_days > 0: ttl_seconds = settings.usage_ttl_days * 24 * 60 * 60 # Convert days to seconds diff --git a/tests/unit/test_openai_passthrough/test_usage_tracker_extended.py b/tests/unit/test_openai_passthrough/test_usage_tracker_extended.py new file mode 100644 index 0000000..a3ac7d7 --- /dev/null +++ b/tests/unit/test_openai_passthrough/test_usage_tracker_extended.py @@ -0,0 +1,48 @@ +"""Tests for the api_surface and reasoning_tokens additions to UsageTracker.""" +from unittest.mock import MagicMock + +from app.db.dynamodb import UsageTracker + + +def _make_tracker(): + ddb_client = MagicMock() + ddb_client.usage_table_name = "anthropic-proxy-usage" + tracker = UsageTracker(ddb_client) + tracker.table = MagicMock() + return tracker + + +def test_record_usage_writes_api_surface_when_provided(): + 
tracker = _make_tracker() + tracker.record_usage( + api_key="sk-x", + request_id="req-1", + model="openai.gpt-oss-120b", + input_tokens=100, + output_tokens=50, + api_surface="chat_completions", + ) + item = tracker.table.put_item.call_args.kwargs["Item"] + assert item["api_surface"] == "chat_completions" + + +def test_record_usage_writes_reasoning_tokens_when_provided(): + tracker = _make_tracker() + tracker.record_usage( + api_key="sk-x", request_id="req-1", model="m", + input_tokens=10, output_tokens=5, reasoning_tokens=3, + ) + item = tracker.table.put_item.call_args.kwargs["Item"] + assert item["reasoning_tokens"] == 3 + + +def test_record_usage_omits_new_fields_when_default(): + tracker = _make_tracker() + tracker.record_usage( + api_key="sk-x", request_id="req-1", model="m", + input_tokens=10, output_tokens=5, + ) + item = tracker.table.put_item.call_args.kwargs["Item"] + # Sparse: not written when caller didn't specify them + assert "api_surface" not in item + assert "reasoning_tokens" not in item From c3cf90d23cc572d7b8b816cd72d0ac564e2d2652 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 06:37:54 +0000 Subject: [PATCH 08/22] feat(openai-passthrough): add httpx singleton client and header helper --- app/api/openai_passthrough/client.py | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 app/api/openai_passthrough/client.py diff --git a/app/api/openai_passthrough/client.py b/app/api/openai_passthrough/client.py new file mode 100644 index 0000000..93fcb80 --- /dev/null +++ b/app/api/openai_passthrough/client.py @@ -0,0 +1,44 @@ +"""Async httpx client to bedrock-mantle, lazily constructed and reused. + +Headers are NOT set on the client itself; they're added per-request in the +router so we can include the proxy's Bedrock API key in Authorization. 
def upstream_headers(extra: dict[str, str] | None = None) -> dict[str, str]:
    """Return the headers sent with every upstream call.

    The base set carries the proxy's own bearer credential
    (``settings.openai_api_key``), a JSON content type and a fixed user agent.
    Entries in ``extra`` are merged on top and win on key collision.
    """
    base = {
        "Authorization": f"Bearer {settings.openai_api_key}",
        "Content-Type": "application/json",
        "User-Agent": "bedrock-api-proxy/openai-passthrough",
    }
    return {**base, **extra} if extra else base
async def stream_passthrough(
    method: str,
    path: str,
    body: Dict[str, Any] | None,
    api_surface: str,
    on_complete: Callable[[Dict[str, Any]], Awaitable[None] | None],
    extra_headers: Dict[str, str] | None = None,
) -> AsyncIterator[bytes]:
    """Relay an upstream SSE response line by line while capturing usage.

    Args:
        method: HTTP method for the upstream request.
        path: Upstream path, relative to the shared client's base URL.
        body: JSON request body, or ``None``.
        api_surface: Passed to ``try_extract_usage_from_sse`` to select where
            usage is read from.
        on_complete: Sync or async callback invoked once, after the upstream
            stream ends, with the last usage dict seen. It is not called when
            no usage was captured.
        extra_headers: Additional headers merged into the upstream headers.

    Yields:
        Each upstream line re-terminated with ``"\\n"`` and UTF-8 encoded.

    Raises:
        Exception: Any error raised while talking to upstream is logged and
            re-raised. Errors raised by ``on_complete`` are logged and
            suppressed, because the full body has already been delivered.
    """
    import inspect  # function-scope import keeps this block self-contained

    usage: Dict[str, Any] = {}

    client = get_client()
    headers = upstream_headers(extra_headers)

    try:
        async with client.stream(method, path, json=body, headers=headers) as resp:
            if resp.status_code >= 400:
                # The body is relayed as-is either way; log it so upstream
                # failures on the streaming path are visible to operators.
                logger.warning(
                    "[OPENAI-PASSTHROUGH] upstream returned HTTP %s for streaming %s %s",
                    resp.status_code, method, path,
                )
            async for raw_line in resp.aiter_lines():
                # Upstream gives us SSE lines without trailing newlines; restore the
                # framing byte so the SSE body is well-formed for the downstream client.
                yield (raw_line + "\n").encode("utf-8")
                try_extract_usage_from_sse(raw_line, usage, api_surface)
    except Exception as exc:
        logger.error("[OPENAI-PASSTHROUGH] upstream stream error: %s", exc, exc_info=True)
        # Re-raise so the failure is not silently swallowed; the downstream
        # client sees the stream end early.
        raise

    if not usage:
        return

    try:
        result = on_complete(usage)
        # Support both sync and async callbacks.
        if inspect.isawaitable(result):
            await result
    except Exception:
        # Usage accounting is best-effort: the client already has the whole
        # response, so a callback failure must not break the connection.
        logger.exception("[OPENAI-PASSTHROUGH] usage callback failed after stream end")
def _record_usage(api_key_info: Dict[str, Any], raw_usage: Dict[str, Any], model: str, api_surface: str) -> None:
    """Normalize an upstream usage dict and write it via the usage tracker.

    Best-effort: every failure (manager construction, normalization, or the
    write itself) is logged as a warning and suppressed, so usage accounting
    can never fail a request whose upstream call already succeeded.

    Args:
        api_key_info: Authenticated key info; its ``api_key`` entry (default
            ``""``) identifies the caller on the usage row.
        raw_usage: Usage object exactly as returned by upstream.
        model: Resolved model ID the request was sent with.
        api_surface: ``"chat_completions"`` or ``"responses"``; selects the
            normalization rules and is stored on the row.
    """
    try:
        _, usage = _managers()
        norm = normalize_usage(raw_usage, api_surface)
        usage.record_usage(
            api_key=api_key_info.get("api_key", ""),
            # Upstream IDs are not threaded through; each row gets a fresh UUID.
            request_id=str(uuid4()),
            model=model,
            input_tokens=norm["input_tokens"],
            output_tokens=norm["output_tokens"],
            cached_tokens=norm["cache_read_input_tokens"],
            cache_write_input_tokens=norm["cache_creation_input_tokens"],
            api_surface=api_surface,
            reasoning_tokens=norm["reasoning_tokens"],
        )
    except Exception as exc:
        logger.warning("[OPENAI-PASSTHROUGH] usage recording failed: %s", exc)
@router.post("/chat/completions")
async def chat_completions(
    request: Request,
    api_key_info: Dict[str, Any] = Depends(get_api_key_info),
):
    """Forward a Chat Completions request upstream and record token usage.

    The ``model`` field is rewritten through ``resolve_model_id`` and any
    ``x-amzn-bedrock-*`` request headers are forwarded. When ``stream`` is
    truthy the upstream SSE body is relayed via ``stream_passthrough`` and
    usage is recorded from its completion callback. Otherwise the upstream
    JSON is returned with the upstream status code, and usage is recorded only
    for non-error responses that carry a ``usage`` object.

    Returns a 400 with an OpenAI-style error object when the request body is
    not a JSON object.
    """
    try:
        body = await request.json()
    except ValueError:
        # Malformed JSON or undecodable bytes (both are ValueError subclasses).
        body = None
    if not isinstance(body, dict):
        # Everything below indexes into ``body``; reject anything else up
        # front instead of failing with an unhandled 500.
        return JSONResponse(
            {
                "error": {
                    "message": "Request body must be a JSON object",
                    "type": "invalid_request_error",
                }
            },
            status_code=400,
        )

    mapping, _ = _managers()
    body["model"] = resolve_model_id(body.get("model", ""), mapping)
    extra = _passthrough_extra_headers(request)

    if body.get("stream"):
        async def on_complete(usage: Dict[str, Any]) -> None:
            _record_usage(api_key_info, usage, body["model"], "chat_completions")
        return StreamingResponse(
            stream_passthrough(
                "POST", "/chat/completions", body, "chat_completions", on_complete, extra
            ),
            media_type="text/event-stream",
        )

    resp = await get_client().post(
        "/chat/completions", json=body, headers=upstream_headers(extra)
    )
    if resp.status_code >= 400:
        # Mirror upstream errors; no usage is recorded for failed calls.
        return JSONResponse(_safe_json(resp), status_code=resp.status_code)

    data = resp.json()
    if isinstance(data, dict) and isinstance(data.get("usage"), dict):
        _record_usage(api_key_info, data["usage"], body["model"], "chat_completions")
    return JSONResponse(data, status_code=resp.status_code)
HTTPException diff --git a/tests/integration/test_openai_passthrough/__init__.py b/tests/integration/test_openai_passthrough/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/test_openai_passthrough/conftest.py b/tests/integration/test_openai_passthrough/conftest.py new file mode 100644 index 0000000..0a484c8 --- /dev/null +++ b/tests/integration/test_openai_passthrough/conftest.py @@ -0,0 +1,93 @@ +"""Shared fixtures for openai-passthrough integration tests.""" +from unittest.mock import MagicMock, patch + +import pytest +import respx +from fastapi.testclient import TestClient + + +@pytest.fixture +def mock_settings(monkeypatch): + """Set the env so the passthrough router mounts and points at a fake mantle.""" + monkeypatch.setattr("app.core.config.settings.enable_openai_passthrough", True) + monkeypatch.setattr("app.core.config.settings.openai_api_key", "bedrock-key-test") + monkeypatch.setattr("app.core.config.settings.openai_base_url", "https://mantle.test/v1") + monkeypatch.setattr("app.core.config.settings.require_api_key", True) + monkeypatch.setattr("app.core.config.settings.master_api_key", "") + + +@pytest.fixture +def mock_api_key_manager(): + """Patch APIKeyManager so any non-empty key validates as user 'u1'.""" + manager = MagicMock() + manager.validate_api_key.return_value = { + "api_key": "sk-test", "user_id": "u1", "is_master": False, + "rate_limit": None, "cache_ttl": None, + } + with patch("app.middleware.auth.APIKeyManager", return_value=manager): + yield manager + + +@pytest.fixture +def mock_model_mapping_manager(): + """Patch ModelMappingManager to return None (no mapping) by default.""" + manager = MagicMock() + manager.get_mapping.return_value = None + with patch("app.api.openai_passthrough.router.ModelMappingManager", return_value=manager): + yield manager + + +@pytest.fixture +def mock_usage_tracker(): + tracker = MagicMock() + with patch("app.api.openai_passthrough.router.UsageTracker", 
return_value=tracker): + yield tracker + + +@pytest.fixture +def respx_mock(): + """respx mock router for httpx calls.""" + with respx.mock(base_url="https://mantle.test/v1", assert_all_called=False) as router: + yield router + + +@pytest.fixture +def client(mock_settings, mock_api_key_manager, mock_model_mapping_manager, mock_usage_tracker): + """FastAPI TestClient with all mocks wired in. + + Imports inside the fixture so module-level settings reads happen after + monkeypatching. + """ + import importlib + + # Reset httpx singleton so it picks up the patched base URL + from app.api.openai_passthrough.client import reset_client_for_testing + reset_client_for_testing() + + # Access the actual router MODULE (not the APIRouter instance) via sys.modules. + # We must do this because app/api/openai_passthrough/__init__.py shadows the + # submodule name with `from .router import router`, so + # `import app.api.openai_passthrough.router` returns the APIRouter instance. + import sys as _sys + + # Ensure the router module is loaded + import app.api.openai_passthrough.router # noqa: F401 (triggers module load) + _router_module = _sys.modules["app.api.openai_passthrough.router"] + + # Reset DDB manager cache so each test gets fresh mock instances + _router_module._ddb = None + _router_module._mapping = None + _router_module._usage = None + + with patch("app.api.openai_passthrough.router.DynamoDBClient", return_value=MagicMock()): + # Reload app.main so the conditional router mount re-evaluates with + # settings.enable_openai_passthrough=True (set by mock_settings above). 
+ import app.main as _main_mod + importlib.reload(_main_mod) + + # Reset again after reload (reload may reinitialise globals) + _router_module._ddb = None + _router_module._mapping = None + _router_module._usage = None + + yield TestClient(_main_mod.app) diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py new file mode 100644 index 0000000..f3d1482 --- /dev/null +++ b/tests/integration/test_openai_passthrough/test_chat_completions.py @@ -0,0 +1,88 @@ +"""Integration tests for POST /openai/v1/chat/completions.""" +import json + +import httpx + + +def test_non_streaming_chat_completions_forwards_and_logs_usage( + client, respx_mock, mock_usage_tracker, mock_model_mapping_manager +): + upstream_resp = { + "id": "chatcmpl-1", + "object": "chat.completion", + "model": "openai.gpt-oss-120b", + "choices": [{"index": 0, "message": {"role": "assistant", "content": "hi"}, "finish_reason": "stop"}], + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, + } + route = respx_mock.post("/chat/completions").mock( + return_value=httpx.Response(200, json=upstream_resp) + ) + + r = client.post( + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={ + "model": "openai.gpt-oss-120b", + "messages": [{"role": "user", "content": "hi"}], + }, + ) + + assert r.status_code == 200 + assert r.json() == upstream_resp + assert route.called + # Upstream got proxy's Bedrock API key, not the client's proxy key + sent = route.calls[0].request + assert sent.headers["authorization"] == "Bearer bedrock-key-test" + sent_body = json.loads(sent.content) + assert sent_body["model"] == "openai.gpt-oss-120b" + # Usage was recorded + assert mock_usage_tracker.record_usage.called + kwargs = mock_usage_tracker.record_usage.call_args.kwargs + assert kwargs["input_tokens"] == 10 + assert kwargs["output_tokens"] == 5 + assert kwargs["api_surface"] == 
"chat_completions" + assert kwargs["model"] == "openai.gpt-oss-120b" + + +def test_model_mapping_is_applied( + client, respx_mock, mock_model_mapping_manager +): + mock_model_mapping_manager.get_mapping.return_value = "openai.gpt-oss-120b" + route = respx_mock.post("/chat/completions").mock( + return_value=httpx.Response(200, json={ + "id": "x", "choices": [], "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2} + }) + ) + + client.post( + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "gpt-4", "messages": [{"role": "user", "content": "hi"}]}, + ) + + sent = json.loads(route.calls[0].request.content) + assert sent["model"] == "openai.gpt-oss-120b" + + +def test_upstream_4xx_returned_verbatim(client, respx_mock, mock_usage_tracker): + err_body = {"error": {"message": "model not found", "type": "invalid_request_error"}} + respx_mock.post("/chat/completions").mock( + return_value=httpx.Response(404, json=err_body) + ) + + r = client.post( + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "no-such-model", "messages": []}, + ) + assert r.status_code == 404 + assert r.json() == err_body + assert not mock_usage_tracker.record_usage.called # Don't log usage on errors + + +def test_missing_auth_returns_401(client): + r = client.post( + "/openai/v1/chat/completions", + json={"model": "x", "messages": []}, + ) + assert r.status_code == 401 From 888eabdc827a13bca25fff6bf81b966226c1d7d6 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 07:02:58 +0000 Subject: [PATCH 11/22] test(openai-passthrough): streaming /chat/completions integration tests --- .../test_chat_completions.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py index f3d1482..4298436 100644 --- 
a/tests/integration/test_openai_passthrough/test_chat_completions.py +++ b/tests/integration/test_openai_passthrough/test_chat_completions.py @@ -86,3 +86,70 @@ def test_missing_auth_returns_401(client): json={"model": "x", "messages": []}, ) assert r.status_code == 401 + + +def test_streaming_chat_completions_forwards_sse_and_records_usage( + client, respx_mock, mock_usage_tracker +): + """Stream three SSE chunks; the second-to-last carries usage.""" + sse_lines = [ + 'data: {"id":"x","choices":[{"index":0,"delta":{"role":"assistant"}}]}', + 'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}', + 'data: {"id":"x","choices":[],"usage":{"prompt_tokens":7,"completion_tokens":2,"total_tokens":9}}', + 'data: [DONE]', + ] + body = "\n".join(sse_lines).encode() + respx_mock.post("/chat/completions").mock( + return_value=httpx.Response( + 200, headers={"content-type": "text/event-stream"}, content=body + ) + ) + + with client.stream( + "POST", + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={ + "model": "openai.gpt-oss-120b", + "messages": [{"role": "user", "content": "hi"}], + "stream": True, + "stream_options": {"include_usage": True}, + }, + ) as r: + assert r.status_code == 200 + out = b"".join(r.iter_bytes()) + + # All four lines forwarded + assert b'"delta":{"role":"assistant"}' in out + assert b'[DONE]' in out + # Usage recorded from the chunk that had it + assert mock_usage_tracker.record_usage.called + kw = mock_usage_tracker.record_usage.call_args.kwargs + assert kw["input_tokens"] == 7 + assert kw["output_tokens"] == 2 + assert kw["api_surface"] == "chat_completions" + + +def test_streaming_chat_completions_without_include_usage_does_not_log( + client, respx_mock, mock_usage_tracker +): + """If client doesn't request usage, no usage chunk arrives → no usage logged.""" + sse_lines = [ + 'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}', + 'data: [DONE]', + ] + body = 
"\n".join(sse_lines).encode() + respx_mock.post("/chat/completions").mock( + return_value=httpx.Response( + 200, headers={"content-type": "text/event-stream"}, content=body + ) + ) + + with client.stream( + "POST", "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "m", "messages": [], "stream": True}, + ) as r: + list(r.iter_bytes()) # drain + + assert not mock_usage_tracker.record_usage.called From e091bd7ec82461611c24e825907801ae6e86647e Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 07:07:09 +0000 Subject: [PATCH 12/22] feat(openai-passthrough): /responses endpoint (POST, streaming + non-streaming) --- app/api/openai_passthrough/router.py | 30 +++++++ .../test_openai_passthrough/test_responses.py | 79 +++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 tests/integration/test_openai_passthrough/test_responses.py diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py index 1ee9b58..36ab285 100644 --- a/app/api/openai_passthrough/router.py +++ b/app/api/openai_passthrough/router.py @@ -96,6 +96,36 @@ async def on_complete(usage: Dict[str, Any]) -> None: return JSONResponse(data, status_code=resp.status_code) +@router.post("/responses") +async def responses_create( + request: Request, + api_key_info: Dict[str, Any] = Depends(get_api_key_info), +): + body = await request.json() + mapping, _ = _managers() + body["model"] = resolve_model_id(body.get("model", ""), mapping) + extra = _passthrough_extra_headers(request) + + if body.get("stream"): + async def on_complete(usage: Dict[str, Any]) -> None: + _record_usage(api_key_info, usage, body["model"], "responses") + return StreamingResponse( + stream_passthrough("POST", "/responses", body, "responses", on_complete, extra), + media_type="text/event-stream", + ) + + resp = await get_client().post( + "/responses", json=body, headers=upstream_headers(extra) + ) + if resp.status_code >= 400: + return 
JSONResponse(_safe_json(resp), status_code=resp.status_code) + + data = resp.json() + if isinstance(data, dict) and isinstance(data.get("usage"), dict): + _record_usage(api_key_info, data["usage"], body["model"], "responses") + return JSONResponse(data, status_code=resp.status_code) + + def _safe_json(resp) -> Dict[str, Any]: try: return resp.json() diff --git a/tests/integration/test_openai_passthrough/test_responses.py b/tests/integration/test_openai_passthrough/test_responses.py new file mode 100644 index 0000000..a596fed --- /dev/null +++ b/tests/integration/test_openai_passthrough/test_responses.py @@ -0,0 +1,79 @@ +"""Integration tests for POST /openai/v1/responses (streaming + non-streaming).""" +import json + +import httpx + + +def test_non_streaming_responses_forwards_and_logs_usage( + client, respx_mock, mock_usage_tracker +): + upstream = { + "id": "resp-1", + "object": "response", + "model": "openai.gpt-oss-120b", + "output": [{"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "hi"}]}], + "usage": {"input_tokens": 11, "output_tokens": 4, "total_tokens": 15}, + } + route = respx_mock.post("/responses").mock(return_value=httpx.Response(200, json=upstream)) + + r = client.post( + "/openai/v1/responses", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "openai.gpt-oss-120b", "input": [{"role": "user", "content": "hi"}]}, + ) + + assert r.status_code == 200 + assert r.json() == upstream + assert route.called + kw = mock_usage_tracker.record_usage.call_args.kwargs + assert kw["input_tokens"] == 11 + assert kw["output_tokens"] == 4 + assert kw["api_surface"] == "responses" + + +def test_streaming_responses_records_usage_from_response_completed( + client, respx_mock, mock_usage_tracker +): + sse_lines = [ + 'event: response.created', + 'data: {"type":"response.created","response":{"id":"r-1"}}', + 'event: response.output_text.delta', + 'data: {"type":"response.output_text.delta","delta":"hi"}', + 'event: 
response.completed', + 'data: ' + json.dumps({ + "type": "response.completed", + "response": {"id": "r-1", "usage": {"input_tokens": 12, "output_tokens": 3, "total_tokens": 15}}, + }), + ] + body = "\n".join(sse_lines).encode() + respx_mock.post("/responses").mock( + return_value=httpx.Response(200, headers={"content-type": "text/event-stream"}, content=body) + ) + + with client.stream( + "POST", "/openai/v1/responses", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "openai.gpt-oss-120b", "input": [{"role": "user", "content": "hi"}], "stream": True}, + ) as r: + out = b"".join(r.iter_bytes()) + + assert b"response.completed" in out + assert b"hi" in out + kw = mock_usage_tracker.record_usage.call_args.kwargs + assert kw["input_tokens"] == 12 + assert kw["output_tokens"] == 3 + assert kw["api_surface"] == "responses" + + +def test_responses_upstream_error_returned_verbatim(client, respx_mock, mock_usage_tracker): + respx_mock.post("/responses").mock( + return_value=httpx.Response(400, json={"error": {"message": "bad input", "type": "invalid_request_error"}}) + ) + r = client.post( + "/openai/v1/responses", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "m", "input": []}, + ) + assert r.status_code == 400 + assert r.json()["error"]["message"] == "bad input" + assert not mock_usage_tracker.record_usage.called From 8abab658fc5ffa26c674575c60ea259cbe048506 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 07:13:50 +0000 Subject: [PATCH 13/22] feat(openai-passthrough): /responses CRUD passthrough (GET, DELETE, cancel, input_items) --- app/api/openai_passthrough/router.py | 48 ++++++++++++++++- .../test_responses_crud.py | 51 +++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 tests/integration/test_openai_passthrough/test_responses_crud.py diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py index 36ab285..d0135c5 100644 --- 
a/app/api/openai_passthrough/router.py +++ b/app/api/openai_passthrough/router.py @@ -8,7 +8,7 @@ from typing import Any, Dict from uuid import uuid4 -from fastapi import APIRouter, Depends, Request +from fastapi import APIRouter, Depends, Request, Response from fastapi.responses import JSONResponse, StreamingResponse from app.api.openai_passthrough.client import get_client, upstream_headers @@ -126,6 +126,52 @@ async def on_complete(usage: Dict[str, Any]) -> None: return JSONResponse(data, status_code=resp.status_code) +async def _passthrough_request(request: Request, path: str) -> Response: + """Forward request to upstream and mirror the upstream response.""" + extra = _passthrough_extra_headers(request) + body = None + if request.method in ("POST", "PUT", "PATCH"): + try: + body = await request.json() + except Exception: + body = None + resp = await get_client().request( + request.method, path, json=body, headers=upstream_headers(extra) + ) + return Response( + content=resp.content, + status_code=resp.status_code, + media_type=resp.headers.get("content-type"), + ) + + +@router.api_route("/responses/{response_id}", methods=["GET", "DELETE"]) +async def responses_get_or_delete( + response_id: str, + request: Request, + _: Dict[str, Any] = Depends(get_api_key_info), +): + return await _passthrough_request(request, f"/responses/{response_id}") + + +@router.post("/responses/{response_id}/cancel") +async def responses_cancel( + response_id: str, + request: Request, + _: Dict[str, Any] = Depends(get_api_key_info), +): + return await _passthrough_request(request, f"/responses/{response_id}/cancel") + + +@router.get("/responses/{response_id}/input_items") +async def responses_input_items( + response_id: str, + request: Request, + _: Dict[str, Any] = Depends(get_api_key_info), +): + return await _passthrough_request(request, f"/responses/{response_id}/input_items") + + def _safe_json(resp) -> Dict[str, Any]: try: return resp.json() diff --git 
a/tests/integration/test_openai_passthrough/test_responses_crud.py b/tests/integration/test_openai_passthrough/test_responses_crud.py new file mode 100644 index 0000000..3692c3b --- /dev/null +++ b/tests/integration/test_openai_passthrough/test_responses_crud.py @@ -0,0 +1,51 @@ +"""Integration tests for the Responses CRUD endpoints — pure passthrough.""" +import httpx + + +def test_get_response_forwards_and_returns_body(client, respx_mock, mock_usage_tracker): + body = {"id": "r-1", "model": "x", "status": "completed"} + respx_mock.get("/responses/r-1").mock(return_value=httpx.Response(200, json=body)) + + r = client.get("/openai/v1/responses/r-1", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 200 + assert r.json() == body + # No usage logged for retrieval + assert not mock_usage_tracker.record_usage.called + + +def test_delete_response_forwards(client, respx_mock): + respx_mock.delete("/responses/r-1").mock( + return_value=httpx.Response(200, json={"id": "r-1", "deleted": True}) + ) + r = client.delete("/openai/v1/responses/r-1", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 200 + assert r.json() == {"id": "r-1", "deleted": True} + + +def test_cancel_response_forwards(client, respx_mock): + respx_mock.post("/responses/r-1/cancel").mock( + return_value=httpx.Response(200, json={"id": "r-1", "status": "cancelled"}) + ) + r = client.post("/openai/v1/responses/r-1/cancel", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 200 + assert r.json()["status"] == "cancelled" + + +def test_list_input_items_forwards(client, respx_mock): + body = {"data": [{"id": "msg-1", "role": "user"}], "object": "list"} + respx_mock.get("/responses/r-1/input_items").mock(return_value=httpx.Response(200, json=body)) + r = client.get( + "/openai/v1/responses/r-1/input_items", + headers={"Authorization": "Bearer sk-test"}, + ) + assert r.status_code == 200 + assert r.json() == body + + +def 
test_get_response_404_returned_verbatim(client, respx_mock): + respx_mock.get("/responses/missing").mock( + return_value=httpx.Response(404, json={"error": {"message": "not found"}}) + ) + r = client.get("/openai/v1/responses/missing", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 404 + assert r.json()["error"]["message"] == "not found" From bb2b1b33243089dd219873b2a7ca638f8c7983d2 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 07:22:46 +0000 Subject: [PATCH 14/22] feat(openai-passthrough): /models endpoint passthrough --- app/api/openai_passthrough/router.py | 8 ++++++++ .../test_openai_passthrough/test_models.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 tests/integration/test_openai_passthrough/test_models.py diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py index d0135c5..1adb3ae 100644 --- a/app/api/openai_passthrough/router.py +++ b/app/api/openai_passthrough/router.py @@ -172,6 +172,14 @@ async def responses_input_items( return await _passthrough_request(request, f"/responses/{response_id}/input_items") +@router.get("/models") +async def list_models( + request: Request, + _: Dict[str, Any] = Depends(get_api_key_info), +): + return await _passthrough_request(request, "/models") + + def _safe_json(resp) -> Dict[str, Any]: try: return resp.json() diff --git a/tests/integration/test_openai_passthrough/test_models.py b/tests/integration/test_openai_passthrough/test_models.py new file mode 100644 index 0000000..0a6d8cd --- /dev/null +++ b/tests/integration/test_openai_passthrough/test_models.py @@ -0,0 +1,17 @@ +"""Integration test for GET /openai/v1/models — pure passthrough.""" +import httpx + + +def test_list_models_forwards(client, respx_mock): + upstream = { + "object": "list", + "data": [ + {"id": "openai.gpt-oss-120b", "object": "model"}, + {"id": "us.anthropic.claude-sonnet-4-6", "object": "model"}, + ], + } + 
respx_mock.get("/models").mock(return_value=httpx.Response(200, json=upstream)) + + r = client.get("/openai/v1/models", headers={"Authorization": "Bearer sk-test"}) + assert r.status_code == 200 + assert r.json() == upstream From 8048cc3ddbe7549ff51fee5f95e771a84164e29d Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 08:15:02 +0000 Subject: [PATCH 15/22] test(openai-passthrough): pin guardrail header forwarding behavior --- .../test_chat_completions.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py index 4298436..a2e9bdb 100644 --- a/tests/integration/test_openai_passthrough/test_chat_completions.py +++ b/tests/integration/test_openai_passthrough/test_chat_completions.py @@ -153,3 +153,25 @@ def test_streaming_chat_completions_without_include_usage_does_not_log( list(r.iter_bytes()) # drain assert not mock_usage_tracker.record_usage.called + + +def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock): + """X-Amzn-Bedrock-* headers from the client should reach the upstream call.""" + route = respx_mock.post("/chat/completions").mock( + return_value=httpx.Response(200, json={ + "id": "x", "choices": [], + "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2}, + }) + ) + client.post( + "/openai/v1/chat/completions", + headers={ + "Authorization": "Bearer sk-test", + "X-Amzn-Bedrock-GuardrailIdentifier": "GR12345", + "X-Amzn-Bedrock-GuardrailVersion": "DRAFT", + }, + json={"model": "m", "messages": [{"role": "user", "content": "hi"}]}, + ) + sent = route.calls[0].request + assert sent.headers["x-amzn-bedrock-guardrailidentifier"] == "GR12345" + assert sent.headers["x-amzn-bedrock-guardrailversion"] == "DRAFT" From 6fb932fbe99a026ff3ae9d2a7d49e09d687cca9a Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 08:17:13 +0000 Subject: [PATCH 16/22] 
chore(openai-passthrough): lint and type cleanup --- app/api/openai_passthrough/router.py | 28 +++++++++---------- app/api/openai_passthrough/streaming.py | 13 +++++---- app/api/openai_passthrough/usage_extractor.py | 6 ++-- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py index 1adb3ae..159d28e 100644 --- a/app/api/openai_passthrough/router.py +++ b/app/api/openai_passthrough/router.py @@ -5,7 +5,7 @@ from __future__ import annotations import logging -from typing import Any, Dict +from typing import Any, cast from uuid import uuid4 from fastapi import APIRouter, Depends, Request, Response @@ -36,7 +36,7 @@ def _managers() -> tuple[ModelMappingManager, UsageTracker]: return _mapping, _usage -def _record_usage(api_key_info: Dict[str, Any], raw_usage: Dict[str, Any], model: str, api_surface: str) -> None: +def _record_usage(api_key_info: dict[str, Any], raw_usage: dict[str, Any], model: str, api_surface: str) -> None: _, usage = _managers() norm = normalize_usage(raw_usage, api_surface) try: @@ -55,9 +55,9 @@ def _record_usage(api_key_info: Dict[str, Any], raw_usage: Dict[str, Any], model logger.warning("[OPENAI-PASSTHROUGH] usage recording failed: %s", exc) -def _passthrough_extra_headers(request: Request) -> Dict[str, str]: +def _passthrough_extra_headers(request: Request) -> dict[str, str]: """Forward Bedrock-specific headers from the client to upstream (e.g. 
guardrails).""" - extra: Dict[str, str] = {} + extra: dict[str, str] = {} for name, value in request.headers.items(): if name.lower().startswith("x-amzn-bedrock-"): extra[name] = value @@ -67,7 +67,7 @@ def _passthrough_extra_headers(request: Request) -> Dict[str, str]: @router.post("/chat/completions") async def chat_completions( request: Request, - api_key_info: Dict[str, Any] = Depends(get_api_key_info), + api_key_info: dict[str, Any] = Depends(get_api_key_info), ): body = await request.json() mapping, _ = _managers() @@ -75,7 +75,7 @@ async def chat_completions( extra = _passthrough_extra_headers(request) if body.get("stream"): - async def on_complete(usage: Dict[str, Any]) -> None: + async def on_complete(usage: dict[str, Any]) -> None: _record_usage(api_key_info, usage, body["model"], "chat_completions") return StreamingResponse( stream_passthrough( @@ -99,7 +99,7 @@ async def on_complete(usage: Dict[str, Any]) -> None: @router.post("/responses") async def responses_create( request: Request, - api_key_info: Dict[str, Any] = Depends(get_api_key_info), + api_key_info: dict[str, Any] = Depends(get_api_key_info), ): body = await request.json() mapping, _ = _managers() @@ -107,7 +107,7 @@ async def responses_create( extra = _passthrough_extra_headers(request) if body.get("stream"): - async def on_complete(usage: Dict[str, Any]) -> None: + async def on_complete(usage: dict[str, Any]) -> None: _record_usage(api_key_info, usage, body["model"], "responses") return StreamingResponse( stream_passthrough("POST", "/responses", body, "responses", on_complete, extra), @@ -149,7 +149,7 @@ async def _passthrough_request(request: Request, path: str) -> Response: async def responses_get_or_delete( response_id: str, request: Request, - _: Dict[str, Any] = Depends(get_api_key_info), + _: dict[str, Any] = Depends(get_api_key_info), ): return await _passthrough_request(request, f"/responses/{response_id}") @@ -158,7 +158,7 @@ async def responses_get_or_delete( async def 
responses_cancel( response_id: str, request: Request, - _: Dict[str, Any] = Depends(get_api_key_info), + _: dict[str, Any] = Depends(get_api_key_info), ): return await _passthrough_request(request, f"/responses/{response_id}/cancel") @@ -167,7 +167,7 @@ async def responses_cancel( async def responses_input_items( response_id: str, request: Request, - _: Dict[str, Any] = Depends(get_api_key_info), + _: dict[str, Any] = Depends(get_api_key_info), ): return await _passthrough_request(request, f"/responses/{response_id}/input_items") @@ -175,13 +175,13 @@ async def responses_input_items( @router.get("/models") async def list_models( request: Request, - _: Dict[str, Any] = Depends(get_api_key_info), + _: dict[str, Any] = Depends(get_api_key_info), ): return await _passthrough_request(request, "/models") -def _safe_json(resp) -> Dict[str, Any]: +def _safe_json(resp) -> dict[str, Any]: try: - return resp.json() + return cast(dict[str, Any], resp.json()) except ValueError: return {"error": {"message": resp.text, "type": "upstream_error"}} diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py index 584c441..8030f00 100644 --- a/app/api/openai_passthrough/streaming.py +++ b/app/api/openai_passthrough/streaming.py @@ -8,7 +8,8 @@ from __future__ import annotations import logging -from typing import Any, AsyncIterator, Awaitable, Callable, Dict +from collections.abc import AsyncIterator, Awaitable, Callable +from typing import Any from app.api.openai_passthrough.client import get_client, upstream_headers from app.api.openai_passthrough.usage_extractor import try_extract_usage_from_sse @@ -19,13 +20,13 @@ async def stream_passthrough( method: str, path: str, - body: Dict[str, Any] | None, + body: dict[str, Any] | None, api_surface: str, - on_complete: Callable[[Dict[str, Any]], Awaitable[None] | None], - extra_headers: Dict[str, str] | None = None, + on_complete: Callable[[dict[str, Any]], Awaitable[None] | None], + extra_headers: 
dict[str, str] | None = None, ) -> AsyncIterator[bytes]: """Stream upstream response bytes line-by-line; capture usage; trigger callback.""" - usage: Dict[str, Any] = {} + usage: dict[str, Any] = {} client = get_client() headers = upstream_headers(extra_headers) @@ -46,4 +47,4 @@ async def stream_passthrough( result = on_complete(usage) # Support both sync and async callbacks if hasattr(result, "__await__"): - await result # type: ignore[func-returns-value] + await result # type: ignore[misc] diff --git a/app/api/openai_passthrough/usage_extractor.py b/app/api/openai_passthrough/usage_extractor.py index 96a5f1e..0eb8a60 100644 --- a/app/api/openai_passthrough/usage_extractor.py +++ b/app/api/openai_passthrough/usage_extractor.py @@ -11,10 +11,10 @@ from __future__ import annotations import json -from typing import Any, Dict +from typing import Any -def normalize_usage(raw: Dict[str, Any], api_surface: str) -> Dict[str, int]: +def normalize_usage(raw: dict[str, Any], api_surface: str) -> dict[str, int]: """Normalize OpenAI-shaped usage into Anthropic-shaped fields. api_surface: "chat_completions" or "responses" @@ -46,7 +46,7 @@ def normalize_usage(raw: Dict[str, Any], api_surface: str) -> Dict[str, int]: def try_extract_usage_from_sse( - raw_line: str, holder: Dict[str, Any], api_surface: str + raw_line: str, holder: dict[str, Any], api_surface: str ) -> None: """Inspect an SSE line and, if it carries usage info, store it in holder. 
From d7c7d37c5fd3765df95b26579e66e9f0f8e7f0ab Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 08:23:37 +0000 Subject: [PATCH 17/22] docs(openai-passthrough): document new feature in env.example, CLAUDE.md, and features.md --- CLAUDE.md | 4 ++- docs/architecture/features.md | 68 +++++++++++++++++++++++++++++++++++ env.example | 8 +++++ 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index c7d00e7..461e8ea 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -40,8 +40,9 @@ black app tests && ruff check app tests && mypy app - **InvokeModel API** (Claude models): Native Anthropic format, minimal conversion, full beta feature support - **Converse API** (non-Claude models): Requires format conversion, unified API for all Bedrock models - **OpenAI Chat Completions API** (non-Claude models, optional): When `ENABLE_OPENAI_COMPAT=True`, non-Claude models use Bedrock's OpenAI-compatible endpoint via bedrock-mantle instead of Converse API +- **OpenAI Passthrough** (any model bedrock-mantle accepts, optional): When `ENABLE_OPENAI_PASSTHROUGH=True`, mounts `/openai/v1/{chat/completions,responses,responses/{id},models}` for clients using OpenAI-format directly. -**Routing**: If model ID contains "anthropic" or "claude" → InvokeModel; else if `ENABLE_OPENAI_COMPAT` → OpenAI Chat Completions; else → Converse. +**Routing**: If model ID contains "anthropic" or "claude" → InvokeModel; else if `ENABLE_OPENAI_COMPAT` → OpenAI Chat Completions; else → Converse. OpenAI Passthrough routes are independent and mount at `/openai/v1/*`. > **Detailed conversion flows, content block mapping, and streaming implementation**: see [docs/architecture/detailed-flows.md](docs/architecture/detailed-flows.md) @@ -104,6 +105,7 @@ Each feature has detailed docs in [docs/architecture/features.md](docs/architect - **OpenTelemetry Tracing**: OTEL GenAI semantic conventions, session-based trace grouping. Zero overhead when disabled. 
- **Admin Portal**: Separate FastAPI app for API key/usage/pricing management with Cognito auth. - **OpenAI-Compatible API**: Non-Claude models can optionally use Bedrock's OpenAI Chat Completions API via bedrock-mantle endpoint instead of Converse API. Controlled by `ENABLE_OPENAI_COMPAT` flag. Maps `thinking` to OpenAI `reasoning` with configurable effort thresholds. +- **OpenAI Passthrough**: New `/openai/v1/*` endpoints accept OpenAI-native Chat Completions and Responses API requests and forward them to bedrock-mantle. Distinct from `ENABLE_OPENAI_COMPAT` (which routes Anthropic-format requests on `/v1/messages`). Reuses proxy API key auth, rate limits, budgets, and usage tracking. Controlled by `ENABLE_OPENAI_PASSTHROUGH`. ## Common Development Tasks diff --git a/docs/architecture/features.md b/docs/architecture/features.md index af6d16b..e9f2b5d 100644 --- a/docs/architecture/features.md +++ b/docs/architecture/features.md @@ -419,3 +419,71 @@ The admin portal is a separate FastAPI application for managing API keys, usage, ### Production Deployment In production (ECS), the admin portal frontend is served as static files from the main proxy at `/admin/`, with API calls proxied to the backend. + +--- + +## OpenAI Passthrough + +Adds new `/openai/v1/*` endpoints that accept OpenAI-native API formats and forward them to `bedrock-mantle`. Distinct from `ENABLE_OPENAI_COMPAT` (which converts Anthropic-format requests on `/v1/messages` into OpenAI calls). + +### When to use it + +- You have client code using the OpenAI Python/JS SDK and want to point it at Bedrock without rewriting. +- You want stateful conversation chaining via the Responses API (`previous_response_id`, `store=true`). +- You want the proxy's API key auth, rate limits, budgets, and usage analytics for OpenAI-format traffic too. 
+ +### Configuration + +```bash +ENABLE_OPENAI_PASSTHROUGH=True +OPENAI_API_KEY=YOUR_BEDROCK_API_KEY +OPENAI_BASE_URL=https://bedrock-mantle.us-east-1.api.aws/v1 +``` + +### Endpoints + +| Method | Path | Purpose | +|---|---|---| +| POST | `/openai/v1/chat/completions` | Chat Completions (streaming + non-streaming) | +| POST | `/openai/v1/responses` | Responses API (streaming + non-streaming) | +| GET | `/openai/v1/responses/{id}` | Retrieve stored response | +| DELETE | `/openai/v1/responses/{id}` | Delete stored response | +| GET | `/openai/v1/responses/{id}/input_items` | List input items | +| POST | `/openai/v1/responses/{id}/cancel` | Cancel background response | +| GET | `/openai/v1/models` | List available models | + +### OpenAI SDK example + +```python +from openai import OpenAI + +client = OpenAI( + api_key="YOUR_PROXY_API_KEY", + base_url="https://your-proxy.example.com/openai/v1", +) +resp = client.chat.completions.create( + model="openai.gpt-oss-120b", + messages=[{"role": "user", "content": "Hello!"}], +) +``` + +### Auth + +Either `Authorization: Bearer YOUR_PROXY_API_KEY` (OpenAI SDK default) or `x-api-key: YOUR_PROXY_API_KEY` works. The proxy uses its configured `OPENAI_API_KEY` (Bedrock API key) for the upstream call. + +### Model mapping + +The existing `anthropic-proxy-model-mapping` table is consulted. If a mapping exists, the client-supplied `model` is replaced before forwarding. If no mapping exists, the model ID is passed through unchanged — so Bedrock-native IDs like `openai.gpt-oss-120b` work without registration. + +### Usage tracking + +Usage is normalized into the existing `anthropic-proxy-usage` schema. Two new sparse columns are written: + +- `api_surface` ∈ `{"messages", "chat_completions", "responses"}` +- `reasoning_tokens` (already counted in `output_tokens`; stored separately for visibility) + +For streaming Chat Completions, clients must set `stream_options: {"include_usage": true}` for usage to be captured. Without it, the stream carries no usage event and no usage record is written for that request.
The Responses API always emits `response.completed` with usage. + +### Guardrails + +`X-Amzn-Bedrock-*` headers from the client (e.g. `X-Amzn-Bedrock-GuardrailIdentifier`) are forwarded to bedrock-mantle. diff --git a/env.example b/env.example index 29029de..93733ea 100644 --- a/env.example +++ b/env.example @@ -104,6 +104,14 @@ DEFAULT_SERVICE_TIER=default # OPENAI_COMPAT_THINKING_HIGH_THRESHOLD=10000 # OPENAI_COMPAT_THINKING_MEDIUM_THRESHOLD=4000 +# =========================================== +# OpenAI Passthrough — mount /openai/v1/* endpoints accepting native OpenAI +# Chat Completions and Responses API requests, forwarded to bedrock-mantle. +# Independent of ENABLE_OPENAI_COMPAT (the two flags can be enabled together). +# Reuses OPENAI_API_KEY and OPENAI_BASE_URL. +# =========================================== +# ENABLE_OPENAI_PASSTHROUGH=False + # =========================================== # OpenTelemetry Tracing # =========================================== From fc827c76cba807b612142fb6acafec7c27da7f78 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 08:44:25 +0000 Subject: [PATCH 18/22] fix(openai-passthrough): yield structured SSE error on upstream timeout; add flag-off and timeout tests Co-Authored-By: Claude Opus 4.7 (1M context) --- app/api/openai_passthrough/streaming.py | 18 ++++++++- .../test_chat_completions.py | 22 +++++++++++ .../test_openai_passthrough/test_flag_off.py | 37 +++++++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 tests/integration/test_openai_passthrough/test_flag_off.py diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py index 8030f00..220903b 100644 --- a/app/api/openai_passthrough/streaming.py +++ b/app/api/openai_passthrough/streaming.py @@ -7,10 +7,13 @@ """ from __future__ import annotations +import json import logging from collections.abc import AsyncIterator, Awaitable, Callable from typing import Any +import httpx + from 
app.api.openai_passthrough.client import get_client, upstream_headers from app.api.openai_passthrough.usage_extractor import try_extract_usage_from_sse @@ -38,9 +41,22 @@ async def stream_passthrough( # framing byte so the SSE body is well-formed for the downstream client. yield (raw_line + "\n").encode("utf-8") try_extract_usage_from_sse(raw_line, usage, api_surface) + except (httpx.RequestError, httpx.TimeoutException) as exc: + # Upstream connection/timeout failure during streaming. OpenAI SDK clients + # expect a clean SSE termination, not an abruptly closed stream. + logger.error("[OPENAI-PASSTHROUGH] upstream stream connection error: %s", exc) + err = { + "error": { + "message": f"upstream connection failed: {type(exc).__name__}", + "type": "upstream_error", + } + } + yield ("data: " + json.dumps(err) + "\n\n").encode("utf-8") + yield b"data: [DONE]\n\n" + return except Exception as exc: + # Unexpected error — re-raise so FastAPI can convert to 500. logger.error("[OPENAI-PASSTHROUGH] upstream stream error: %s", exc) - # Re-raise so FastAPI can return a 500; downstream client sees the stream end. 
raise if usage: diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py index a2e9bdb..b26965c 100644 --- a/tests/integration/test_openai_passthrough/test_chat_completions.py +++ b/tests/integration/test_openai_passthrough/test_chat_completions.py @@ -175,3 +175,25 @@ def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock): sent = route.calls[0].request assert sent.headers["x-amzn-bedrock-guardrailidentifier"] == "GR12345" assert sent.headers["x-amzn-bedrock-guardrailversion"] == "DRAFT" + + +def test_streaming_upstream_timeout_yields_clean_sse_error( + client, respx_mock, mock_usage_tracker +): + """Upstream timeout during streaming should produce a structured SSE error event, not crash the stream.""" + respx_mock.post("/chat/completions").mock( + side_effect=httpx.ReadTimeout("upstream took too long") + ) + + with client.stream( + "POST", + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "m", "messages": [], "stream": True}, + ) as r: + out = b"".join(r.iter_bytes()) + + assert b'"upstream_error"' in out, f"expected structured error, got: {out}" + assert b"[DONE]" in out + # No usage logged when the stream errored before any usage event arrived + assert not mock_usage_tracker.record_usage.called diff --git a/tests/integration/test_openai_passthrough/test_flag_off.py b/tests/integration/test_openai_passthrough/test_flag_off.py new file mode 100644 index 0000000..da8d816 --- /dev/null +++ b/tests/integration/test_openai_passthrough/test_flag_off.py @@ -0,0 +1,37 @@ +"""Verify that /openai/v1/* paths return 404 when ENABLE_OPENAI_PASSTHROUGH is off.""" +import importlib + +from fastapi.testclient import TestClient + + +def test_flag_off_returns_404(monkeypatch): + """With ENABLE_OPENAI_PASSTHROUGH=False, /openai/v1/* paths must not exist.""" + monkeypatch.setattr("app.core.config.settings.enable_openai_passthrough", 
False) + monkeypatch.setattr("app.core.config.settings.require_api_key", False) + + # Reload main so the conditional mount re-evaluates with the flag off. + import app.main as _main + importlib.reload(_main) + + client = TestClient(_main.app) + r = client.post( + "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "x", "messages": []}, + ) + assert r.status_code == 404, f"expected 404 with flag off, got {r.status_code}" + + r = client.get("/openai/v1/models") + assert r.status_code == 404 + + +def test_flag_off_does_not_register_routes(monkeypatch): + """Programmatic verification: no route paths under /openai/v1 when flag is off.""" + monkeypatch.setattr("app.core.config.settings.enable_openai_passthrough", False) + + import app.main as _main + importlib.reload(_main) + + extra = [getattr(r, "path", "") for r in _main.app.routes + if getattr(r, "path", "").startswith("/openai/v1")] + assert not extra, f"unexpected routes registered: {extra}" From 20b58971faf88335c002d29328a73ebf08983b88 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 12:30:19 +0000 Subject: [PATCH 19/22] fix(openai-passthrough): build absolute upstream URLs to preserve /v1 base path httpx follows RFC 3986 path-merging on AsyncClient.base_url: a request path starting with `/` REPLACES the base_url's path entirely. With OPENAI_BASE_URL=https://bedrock-mantle.us-west-2.api.aws/v1, calls like `client.post("/chat/completions")` were being sent to `bedrock-mantle.us-west-2.api.aws/chat/completions` (no `/v1`), causing 404s in production. 
Fix: - Drop base_url from the AsyncClient - Add upstream_url(path) that explicitly joins OPENAI_BASE_URL + path - Use upstream_url() everywhere we previously passed bare paths - Add unit tests covering leading-slash, trailing-slash, and ID-in-path cases that would have caught this Integration tests passed previously because respx joins base_url + path intuitively; only real httpx exhibits the RFC 3986 replacement behaviour. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/api/openai_passthrough/client.py | 27 +++++++- app/api/openai_passthrough/router.py | 8 +-- app/api/openai_passthrough/streaming.py | 4 +- .../test_client_url.py | 63 +++++++++++++++++++ 4 files changed, 95 insertions(+), 7 deletions(-) create mode 100644 tests/unit/test_openai_passthrough/test_client_url.py diff --git a/app/api/openai_passthrough/client.py b/app/api/openai_passthrough/client.py index 93fcb80..04a9479 100644 --- a/app/api/openai_passthrough/client.py +++ b/app/api/openai_passthrough/client.py @@ -2,6 +2,15 @@ Headers are NOT set on the client itself; they're added per-request in the router so we can include the proxy's Bedrock API key in Authorization. + +URL building note: we deliberately do NOT set ``base_url`` on the AsyncClient. +httpx follows RFC 3986 path-merging, which means a request path starting with +``/`` REPLACES the path component of the base_url. With +``OPENAI_BASE_URL=https://bedrock-mantle.us-west-2.api.aws/v1``, calling +``client.post("/chat/completions")`` would produce +``https://bedrock-mantle.us-west-2.api.aws/chat/completions`` (the ``/v1`` is +dropped). To avoid this footgun we build full URLs explicitly via +``upstream_url(path)``. 
""" from __future__ import annotations @@ -16,7 +25,6 @@ def get_client() -> httpx.AsyncClient: global _client if _client is None: _client = httpx.AsyncClient( - base_url=settings.openai_base_url, timeout=httpx.Timeout(settings.bedrock_timeout, connect=10.0), limits=httpx.Limits(max_connections=200, max_keepalive_connections=50), ) @@ -32,6 +40,23 @@ def reset_client_for_testing() -> None: _client = None +def upstream_url(path: str) -> str: + """Build a full upstream URL by appending ``path`` to ``OPENAI_BASE_URL``. + + Avoids httpx's RFC 3986 path-replacement behaviour by always producing a + fully-qualified URL. + + Examples: + OPENAI_BASE_URL=https://bedrock-mantle.us-west-2.api.aws/v1 + upstream_url("/chat/completions") -> https://bedrock-mantle.us-west-2.api.aws/v1/chat/completions + upstream_url("models") -> https://bedrock-mantle.us-west-2.api.aws/v1/models + """ + base = settings.openai_base_url.rstrip("/") + if not path.startswith("/"): + path = "/" + path + return base + path + + def upstream_headers(extra: dict[str, str] | None = None) -> dict[str, str]: """Build the Authorization + standard headers for an upstream call.""" headers = { diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py index 159d28e..a505f49 100644 --- a/app/api/openai_passthrough/router.py +++ b/app/api/openai_passthrough/router.py @@ -11,7 +11,7 @@ from fastapi import APIRouter, Depends, Request, Response from fastapi.responses import JSONResponse, StreamingResponse -from app.api.openai_passthrough.client import get_client, upstream_headers +from app.api.openai_passthrough.client import get_client, upstream_headers, upstream_url from app.api.openai_passthrough.model_mapping import resolve_model_id from app.api.openai_passthrough.streaming import stream_passthrough from app.api.openai_passthrough.usage_extractor import normalize_usage @@ -85,7 +85,7 @@ async def on_complete(usage: dict[str, Any]) -> None: ) resp = await get_client().post( - 
"/chat/completions", json=body, headers=upstream_headers(extra) + upstream_url("/chat/completions"), json=body, headers=upstream_headers(extra) ) if resp.status_code >= 400: return JSONResponse(_safe_json(resp), status_code=resp.status_code) @@ -115,7 +115,7 @@ async def on_complete(usage: dict[str, Any]) -> None: ) resp = await get_client().post( - "/responses", json=body, headers=upstream_headers(extra) + upstream_url("/responses"), json=body, headers=upstream_headers(extra) ) if resp.status_code >= 400: return JSONResponse(_safe_json(resp), status_code=resp.status_code) @@ -136,7 +136,7 @@ async def _passthrough_request(request: Request, path: str) -> Response: except Exception: body = None resp = await get_client().request( - request.method, path, json=body, headers=upstream_headers(extra) + request.method, upstream_url(path), json=body, headers=upstream_headers(extra) ) return Response( content=resp.content, diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py index 220903b..883cb49 100644 --- a/app/api/openai_passthrough/streaming.py +++ b/app/api/openai_passthrough/streaming.py @@ -14,7 +14,7 @@ import httpx -from app.api.openai_passthrough.client import get_client, upstream_headers +from app.api.openai_passthrough.client import get_client, upstream_headers, upstream_url from app.api.openai_passthrough.usage_extractor import try_extract_usage_from_sse logger = logging.getLogger(__name__) @@ -35,7 +35,7 @@ async def stream_passthrough( headers = upstream_headers(extra_headers) try: - async with client.stream(method, path, json=body, headers=headers) as resp: + async with client.stream(method, upstream_url(path), json=body, headers=headers) as resp: async for raw_line in resp.aiter_lines(): # Upstream gives us SSE lines without trailing newlines; restore the # framing byte so the SSE body is well-formed for the downstream client. 
diff --git a/tests/unit/test_openai_passthrough/test_client_url.py b/tests/unit/test_openai_passthrough/test_client_url.py new file mode 100644 index 0000000..0d1f9b3 --- /dev/null +++ b/tests/unit/test_openai_passthrough/test_client_url.py @@ -0,0 +1,63 @@ +"""Tests for upstream_url — guards against the httpx RFC 3986 path-replacement footgun. + +If you ever set ``base_url`` on the AsyncClient and pass a leading-slash path, +httpx will silently drop the ``/v1`` from the base. This test family asserts +that ``upstream_url`` always produces a fully-qualified URL with both the +configured ``OPENAI_BASE_URL`` path AND the request path joined intact. +""" +from app.api.openai_passthrough.client import upstream_url + + +def test_includes_base_url_path_with_leading_slash(monkeypatch): + monkeypatch.setattr( + "app.core.config.settings.openai_base_url", + "https://bedrock-mantle.us-west-2.api.aws/v1", + ) + # The bug: with httpx base_url=".../v1", "/chat/completions" would drop "/v1". + # upstream_url must keep both segments. 
+ assert ( + upstream_url("/chat/completions") + == "https://bedrock-mantle.us-west-2.api.aws/v1/chat/completions" + ) + + +def test_includes_base_url_path_without_leading_slash(monkeypatch): + monkeypatch.setattr( + "app.core.config.settings.openai_base_url", + "https://bedrock-mantle.us-west-2.api.aws/v1", + ) + assert ( + upstream_url("models") + == "https://bedrock-mantle.us-west-2.api.aws/v1/models" + ) + + +def test_strips_trailing_slash_from_base(monkeypatch): + monkeypatch.setattr( + "app.core.config.settings.openai_base_url", + "https://bedrock-mantle.us-west-2.api.aws/v1/", + ) + assert ( + upstream_url("/responses") + == "https://bedrock-mantle.us-west-2.api.aws/v1/responses" + ) + + +def test_works_with_response_id_in_path(monkeypatch): + monkeypatch.setattr( + "app.core.config.settings.openai_base_url", + "https://bedrock-mantle.us-west-2.api.aws/v1", + ) + assert ( + upstream_url("/responses/resp-123/cancel") + == "https://bedrock-mantle.us-west-2.api.aws/v1/responses/resp-123/cancel" + ) + + +def test_works_with_base_url_no_path_segment(monkeypatch): + """Some clients might point at a domain root; still produce a sensible URL.""" + monkeypatch.setattr( + "app.core.config.settings.openai_base_url", + "https://example.com", + ) + assert upstream_url("/models") == "https://example.com/models" From c88eb51bd37ad602c703e0d0425d65ee536413b5 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 12:41:31 +0000 Subject: [PATCH 20/22] fix(cdk): propagate ENABLE_OPENAI_PASSTHROUGH env var to ECS task definition The previous deploy mounted the new /openai/v1/* code but the CDK never passed ENABLE_OPENAI_PASSTHROUGH through to the container, so the conditional router mount at app/main.py evaluated False and the routes weren't registered. 
Add support symmetrical to enableOpenaiCompat: - AppConfig: new enableOpenaiPassthrough field - prod default: true (ship the feature on by default) - dev default: false (avoid accidental routing changes in dev) - env-var override: ENABLE_OPENAI_PASSTHROUGH at deploy time - ECS task env: emit ENABLE_OPENAI_PASSTHROUGH=<true|false> Co-Authored-By: Claude Opus 4.7 (1M context) --- cdk/config/config.ts | 9 +++++++++ cdk/lib/ecs-stack.ts | 1 + 2 files changed, 10 insertions(+) diff --git a/cdk/config/config.ts b/cdk/config/config.ts index cc7b665..c064926 100644 --- a/cdk/config/config.ts +++ b/cdk/config/config.ts @@ -90,6 +90,7 @@ export interface EnvironmentConfig { // OpenAI-Compatible API (Bedrock Mantle) Configuration enableOpenaiCompat: boolean; + enableOpenaiPassthrough: boolean; // Mount /openai/v1/* passthrough endpoints openaiBaseUrl?: string; // e.g., https://bedrock-mantle.us-east-1.api.aws/v1 // Admin Portal Configuration @@ -196,6 +197,7 @@ export const environments: { [key: string]: EnvironmentConfigWithoutRuntime } = // OpenAI-Compatible API (Bedrock Mantle) enableOpenaiCompat: false, + enableOpenaiPassthrough: true, // openaiBaseUrl: 'https://bedrock-mantle.us-east-1.api.aws/v1', // Admin Portal @@ -300,6 +302,7 @@ export const environments: { [key: string]: EnvironmentConfigWithoutRuntime } = // OpenAI-Compatible API (Bedrock Mantle) enableOpenaiCompat: false, + enableOpenaiPassthrough: true, // openaiBaseUrl: 'https://bedrock-mantle.us-east-1.api.aws/v1', // Admin Portal @@ -410,6 +413,11 @@ export function getConfig(environmentName: string = 'dev'): EnvironmentConfig { ? process.env.ENABLE_OPENAI_COMPAT.toLowerCase() === 'true' : config.enableOpenaiCompat; + // Override OpenAI-passthrough settings from environment variables + const enableOpenaiPassthrough = process.env.ENABLE_OPENAI_PASSTHROUGH + ?
process.env.ENABLE_OPENAI_PASSTHROUGH.toLowerCase() === 'true' + : config.enableOpenaiPassthrough; + // Override CloudFront settings from environment variables const enableCloudFront = process.env.ENABLE_CLOUDFRONT ? process.env.ENABLE_CLOUDFRONT.toLowerCase() === 'true' @@ -425,6 +433,7 @@ export function getConfig(environmentName: string = 'dev'): EnvironmentConfig { enableWebSearch, enableWebFetch, enableOpenaiCompat, + enableOpenaiPassthrough, enableCloudFront, ...(process.env.OPENAI_BASE_URL && { openaiBaseUrl: process.env.OPENAI_BASE_URL }), ...(process.env.OTEL_EXPORTER_OTLP_ENDPOINT && { otelExporterEndpoint: process.env.OTEL_EXPORTER_OTLP_ENDPOINT }), diff --git a/cdk/lib/ecs-stack.ts b/cdk/lib/ecs-stack.ts index 744e31c..9da7901 100644 --- a/cdk/lib/ecs-stack.ts +++ b/cdk/lib/ecs-stack.ts @@ -283,6 +283,7 @@ export class ECSStack extends cdk.Stack { // OpenAI-Compatible API (Bedrock Mantle) ENABLE_OPENAI_COMPAT: config.enableOpenaiCompat.toString(), + ENABLE_OPENAI_PASSTHROUGH: config.enableOpenaiPassthrough.toString(), ...(config.openaiBaseUrl && { OPENAI_BASE_URL: config.openaiBaseUrl }), ...(process.env.OPENAI_API_KEY && { OPENAI_API_KEY: process.env.OPENAI_API_KEY }), From b926b421cbd774d0a47453ead2ff0eea48e00a02 Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 13:00:46 +0000 Subject: [PATCH 21/22] fix(openai-passthrough): synthesize event: lines for Responses API SSE Bedrock-mantle emits Responses API SSE as data-only frames (the event type is embedded as a JSON field but no `event: <type>` line is present). This matches the SSE spec but diverges from real OpenAI servers, which prepend each frame with `event: <type>`. Strict clients like OpenAI Codex CLI key off the `event:` field and report "stream closed before response.completed" when they don't see it. Synthesize `event: <type>` lines from each data frame's JSON `type` field when api_surface == "responses".
Chat Completions streams remain unchanged (real OpenAI doesn't use event: lines for that endpoint). Tests: - test_streaming_responses_synthesizes_event_lines_for_data_only_upstream asserts every data: frame is preceded by the matching event: line - test_streaming_chat_completions_does_not_inject_event_lines pins the no-injection contract for chat completions Co-Authored-By: Claude Opus 4.7 (1M context) --- app/api/openai_passthrough/streaming.py | 35 ++++++++++++++++++ .../test_chat_completions.py | 27 ++++++++++++++ .../test_openai_passthrough/test_responses.py | 36 +++++++++++++++++++ 3 files changed, 98 insertions(+) diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py index 883cb49..7864c5a 100644 --- a/app/api/openai_passthrough/streaming.py +++ b/app/api/openai_passthrough/streaming.py @@ -4,6 +4,14 @@ StreamingResponse forwards them unchanged. After upstream stream ends, it calls the supplied on_complete callback with the captured usage dict so the caller can record usage to DynamoDB. + +Responses API note: bedrock-mantle emits Responses SSE as data-only frames +(``data: {"type": "response.completed", ...}``) without the corresponding +``event: response.completed`` line that the real OpenAI Responses API +includes. Strict SSE clients (e.g. OpenAI Codex CLI) key off the ``event:`` +field and reject streams that lack it. For api_surface="responses" we +synthesize ``event: `` lines from the JSON ``type`` field on each frame +to maintain wire compatibility with the real OpenAI server. 
""" from __future__ import annotations @@ -20,6 +28,24 @@ logger = logging.getLogger(__name__) +def _extract_event_type(raw_line: str) -> str | None: + """Return the ``type`` field from a ``data:`` JSON frame, or None if not parseable.""" + line = raw_line.strip() + if not line.startswith("data:"): + return None + payload = line[len("data:"):].strip() + if not payload or payload == "[DONE]": + return None + try: + obj = json.loads(payload) + except (ValueError, TypeError): + return None + if not isinstance(obj, dict): + return None + event_type = obj.get("type") + return event_type if isinstance(event_type, str) else None + + async def stream_passthrough( method: str, path: str, @@ -33,10 +59,19 @@ async def stream_passthrough( client = get_client() headers = upstream_headers(extra_headers) + synthesize_event_lines = api_surface == "responses" try: async with client.stream(method, upstream_url(path), json=body, headers=headers) as resp: async for raw_line in resp.aiter_lines(): + # For the Responses API, prepend an ``event: `` line whenever + # we see a data frame whose JSON carries a ``type`` field. This + # restores the OpenAI-spec SSE format that strict clients expect. + if synthesize_event_lines: + event_type = _extract_event_type(raw_line) + if event_type is not None: + yield f"event: {event_type}\n".encode("utf-8") + # Upstream gives us SSE lines without trailing newlines; restore the # framing byte so the SSE body is well-formed for the downstream client. 
yield (raw_line + "\n").encode("utf-8") diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py index b26965c..d726fdd 100644 --- a/tests/integration/test_openai_passthrough/test_chat_completions.py +++ b/tests/integration/test_openai_passthrough/test_chat_completions.py @@ -155,6 +155,33 @@ def test_streaming_chat_completions_without_include_usage_does_not_log( assert not mock_usage_tracker.record_usage.called +def test_streaming_chat_completions_does_not_inject_event_lines( + client, respx_mock, +): + """Chat Completions SSE per OpenAI spec is data-only (no `event:` lines). + The proxy must NOT synthesize event: lines for this api_surface. + """ + sse_lines = [ + 'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}', + 'data: [DONE]', + ] + body = "\n".join(sse_lines).encode() + respx_mock.post("/chat/completions").mock( + return_value=httpx.Response( + 200, headers={"content-type": "text/event-stream"}, content=body + ) + ) + + with client.stream( + "POST", "/openai/v1/chat/completions", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "m", "messages": [], "stream": True}, + ) as r: + out = b"".join(r.iter_bytes()).decode() + + assert "event: " not in out, f"chat completions stream should not contain event: lines, got:\n{out}" + + def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock): """X-Amzn-Bedrock-* headers from the client should reach the upstream call.""" route = respx_mock.post("/chat/completions").mock( diff --git a/tests/integration/test_openai_passthrough/test_responses.py b/tests/integration/test_openai_passthrough/test_responses.py index a596fed..8190a98 100644 --- a/tests/integration/test_openai_passthrough/test_responses.py +++ b/tests/integration/test_openai_passthrough/test_responses.py @@ -65,6 +65,42 @@ def test_streaming_responses_records_usage_from_response_completed( assert kw["api_surface"] == "responses" 
+def test_streaming_responses_synthesizes_event_lines_for_data_only_upstream( + client, respx_mock, +): + """Bedrock-mantle's Responses API emits data-only SSE (no `event:` lines). + Strict clients (e.g. Codex CLI) require `event: ` per OpenAI spec, so + the proxy must synthesize them from each frame's JSON `type` field. + """ + sse_lines = [ + 'data: {"type":"response.created","response":{"id":"r-1"}}', + '', + 'data: {"type":"response.output_text.delta","delta":"hi"}', + '', + 'data: ' + json.dumps({ + "type": "response.completed", + "response": {"id": "r-1", "usage": {"input_tokens": 1, "output_tokens": 1, "total_tokens": 2}}, + }), + '', + ] + body = "\n".join(sse_lines).encode() + respx_mock.post("/responses").mock( + return_value=httpx.Response(200, headers={"content-type": "text/event-stream"}, content=body) + ) + + with client.stream( + "POST", "/openai/v1/responses", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "m", "input": [], "stream": True}, + ) as r: + out = b"".join(r.iter_bytes()).decode() + + # Each data: frame with a `type` field should be preceded by an event: line + assert "event: response.created\ndata: " in out + assert "event: response.output_text.delta\ndata: " in out + assert "event: response.completed\ndata: " in out + + def test_responses_upstream_error_returned_verbatim(client, respx_mock, mock_usage_tracker): respx_mock.post("/responses").mock( return_value=httpx.Response(400, json={"error": {"message": "bad input", "type": "invalid_request_error"}}) From e950e8a36b7f05a82a874c472bd83aaeab74d8ad Mon Sep 17 00:00:00 2001 From: River Xie Date: Mon, 25 May 2026 13:22:22 +0000 Subject: [PATCH 22/22] fix(openai-passthrough): surface upstream HTTP errors with real status codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, when client requested stream=true and upstream returned a non-2xx status (e.g. 
validation 400) or a connection error, the proxy would still return 200 text/event-stream and dump the JSON error body (or a synthetic SSE frame) into the stream. Strict SSE clients like OpenAI Codex CLI then hang waiting for response.completed and report "stream closed before response.completed" — masking the real error. Refactor: split open_upstream_stream() (peeks at status) from stream_passthrough_response() (streams an open 2xx body). The router now: - Returns the real upstream status as JSONResponse when the upstream responds with 4xx/5xx for a streaming request. - Returns 502/504 JSON when the upstream is unreachable (TimeoutException / RequestError) before any bytes flow. - Continues to emit an SSE error+[DONE] frame only for failures that occur AFTER the 2xx stream has begun (where we cannot retroactively change the HTTP status). Tests: - test_streaming_responses_upstream_4xx_returns_json_not_sse - test_streaming_upstream_timeout_returns_json_504 (replaces the prior test that asserted the buggy SSE-error behavior) Co-Authored-By: Claude Opus 4.7 (1M context) --- app/api/openai_passthrough/router.py | 62 +++++++- app/api/openai_passthrough/streaming.py | 143 +++++++++++++++--- .../test_chat_completions.py | 22 +-- .../test_openai_passthrough/test_responses.py | 23 +++ 4 files changed, 216 insertions(+), 34 deletions(-) diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py index a505f49..70c0406 100644 --- a/app/api/openai_passthrough/router.py +++ b/app/api/openai_passthrough/router.py @@ -13,7 +13,11 @@ from app.api.openai_passthrough.client import get_client, upstream_headers, upstream_url from app.api.openai_passthrough.model_mapping import resolve_model_id -from app.api.openai_passthrough.streaming import stream_passthrough +from app.api.openai_passthrough.streaming import ( + UpstreamConnectionError, + open_upstream_stream, + stream_passthrough_response, +) from app.api.openai_passthrough.usage_extractor import 
normalize_usage from app.db.dynamodb import DynamoDBClient, ModelMappingManager, UsageTracker from app.middleware.auth import get_api_key_info @@ -75,12 +79,26 @@ async def chat_completions( extra = _passthrough_extra_headers(request) if body.get("stream"): + try: + upstream_resp, error_body = await open_upstream_stream( + "POST", "/chat/completions", body, extra + ) + except UpstreamConnectionError as exc: + return JSONResponse( + {"error": {"message": exc.message, "type": "upstream_error"}}, + status_code=exc.status_code, + ) + if error_body is not None: + return JSONResponse( + _decode_error_body(error_body), + status_code=upstream_resp.status_code, + ) + async def on_complete(usage: dict[str, Any]) -> None: _record_usage(api_key_info, usage, body["model"], "chat_completions") + return StreamingResponse( - stream_passthrough( - "POST", "/chat/completions", body, "chat_completions", on_complete, extra - ), + stream_passthrough_response(upstream_resp, "chat_completions", on_complete), media_type="text/event-stream", ) @@ -107,10 +125,26 @@ async def responses_create( extra = _passthrough_extra_headers(request) if body.get("stream"): + try: + upstream_resp, error_body = await open_upstream_stream( + "POST", "/responses", body, extra + ) + except UpstreamConnectionError as exc: + return JSONResponse( + {"error": {"message": exc.message, "type": "upstream_error"}}, + status_code=exc.status_code, + ) + if error_body is not None: + return JSONResponse( + _decode_error_body(error_body), + status_code=upstream_resp.status_code, + ) + async def on_complete(usage: dict[str, Any]) -> None: _record_usage(api_key_info, usage, body["model"], "responses") + return StreamingResponse( - stream_passthrough("POST", "/responses", body, "responses", on_complete, extra), + stream_passthrough_response(upstream_resp, "responses", on_complete), media_type="text/event-stream", ) @@ -185,3 +219,21 @@ def _safe_json(resp) -> dict[str, Any]: return cast(dict[str, Any], resp.json()) except 
ValueError: return {"error": {"message": resp.text, "type": "upstream_error"}} + + +def _decode_error_body(body: bytes) -> dict[str, Any]: + """Parse a non-2xx upstream body as JSON, falling back to a wrapped string.""" + import json as _json + + try: + decoded = _json.loads(body) + except (ValueError, TypeError): + return { + "error": { + "message": body.decode("utf-8", "replace"), + "type": "upstream_error", + } + } + if isinstance(decoded, dict): + return cast(dict[str, Any], decoded) + return {"error": {"message": str(decoded), "type": "upstream_error"}} diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py index 7864c5a..8648f91 100644 --- a/app/api/openai_passthrough/streaming.py +++ b/app/api/openai_passthrough/streaming.py @@ -12,6 +12,11 @@ field and reject streams that lack it. For api_surface="responses" we synthesize ``event: `` lines from the JSON ``type`` field on each frame to maintain wire compatibility with the real OpenAI server. + +Upstream-error contract: ``open_upstream_stream`` returns the (resp, error_body) +tuple BEFORE FastAPI has committed any response headers. If the upstream +returns a non-2xx status, the caller can hand back a JSONResponse with the +real upstream status code instead of a fake 200 streaming response. """ from __future__ import annotations @@ -46,36 +51,94 @@ def _extract_event_type(raw_line: str) -> str | None: return event_type if isinstance(event_type, str) else None -async def stream_passthrough( +class UpstreamConnectionError(Exception): + """Raised by open_upstream_stream when the upstream is unreachable. + + Carries an HTTP status to return to the client (502 Bad Gateway by + default) and the underlying httpx exception for logging. 
+ """ + + def __init__(self, status_code: int, message: str, exc_type: str) -> None: + super().__init__(message) + self.status_code = status_code + self.message = message + self.exc_type = exc_type + + +async def open_upstream_stream( method: str, path: str, body: dict[str, Any] | None, - api_surface: str, - on_complete: Callable[[dict[str, Any]], Awaitable[None] | None], extra_headers: dict[str, str] | None = None, -) -> AsyncIterator[bytes]: - """Stream upstream response bytes line-by-line; capture usage; trigger callback.""" - usage: dict[str, Any] = {} +) -> tuple[httpx.Response, bytes | None]: + """Open an upstream streaming request and peek at the status code. + + Returns (resp, error_body): + - error_body is None if upstream returned 2xx — caller streams the body + and is responsible for closing the response. + - error_body is the full upstream body bytes if status >= 400 — caller + should return a JSONResponse with resp.status_code. The response is + already closed. + Raises UpstreamConnectionError if the upstream is unreachable + (timeout, DNS, TLS, connection reset). Caller should turn this into a + JSON 502/504 with the carried status code. 
+ """ client = get_client() headers = upstream_headers(extra_headers) + req = client.build_request(method, upstream_url(path), json=body, headers=headers) + try: + resp = await client.send(req, stream=True) + except httpx.TimeoutException as exc: + logger.error("[OPENAI-PASSTHROUGH] upstream timeout opening stream: %s", exc) + raise UpstreamConnectionError( + status_code=504, + message=f"upstream timeout: {exc}", + exc_type=type(exc).__name__, + ) from exc + except httpx.RequestError as exc: + logger.error("[OPENAI-PASSTHROUGH] upstream connection error opening stream: %s", exc) + raise UpstreamConnectionError( + status_code=502, + message=f"upstream connection failed: {exc}", + exc_type=type(exc).__name__, + ) from exc + + if resp.status_code >= 400: + try: + error_body = await resp.aread() + finally: + await resp.aclose() + return resp, error_body + return resp, None + + +async def stream_passthrough_response( + resp: httpx.Response, + api_surface: str, + on_complete: Callable[[dict[str, Any]], Awaitable[None] | None], +) -> AsyncIterator[bytes]: + """Stream the body of an already-opened 2xx upstream response. + + Closes the response when done. + """ + usage: dict[str, Any] = {} synthesize_event_lines = api_surface == "responses" try: - async with client.stream(method, upstream_url(path), json=body, headers=headers) as resp: - async for raw_line in resp.aiter_lines(): - # For the Responses API, prepend an ``event: `` line whenever - # we see a data frame whose JSON carries a ``type`` field. This - # restores the OpenAI-spec SSE format that strict clients expect. - if synthesize_event_lines: - event_type = _extract_event_type(raw_line) - if event_type is not None: - yield f"event: {event_type}\n".encode("utf-8") - - # Upstream gives us SSE lines without trailing newlines; restore the - # framing byte so the SSE body is well-formed for the downstream client. 
- yield (raw_line + "\n").encode("utf-8") - try_extract_usage_from_sse(raw_line, usage, api_surface) + async for raw_line in resp.aiter_lines(): + # For the Responses API, prepend an ``event: `` line whenever + # we see a data frame whose JSON carries a ``type`` field. This + # restores the OpenAI-spec SSE format that strict clients expect. + if synthesize_event_lines: + event_type = _extract_event_type(raw_line) + if event_type is not None: + yield f"event: {event_type}\n".encode("utf-8") + + # Upstream gives us SSE lines without trailing newlines; restore the + # framing byte so the SSE body is well-formed for the downstream client. + yield (raw_line + "\n").encode("utf-8") + try_extract_usage_from_sse(raw_line, usage, api_surface) except (httpx.RequestError, httpx.TimeoutException) as exc: # Upstream connection/timeout failure during streaming. OpenAI SDK clients # expect a clean SSE termination, not an abruptly closed stream. @@ -93,9 +156,49 @@ async def stream_passthrough( # Unexpected error — re-raise so FastAPI can convert to 500. logger.error("[OPENAI-PASSTHROUGH] upstream stream error: %s", exc) raise + finally: + await resp.aclose() if usage: result = on_complete(usage) # Support both sync and async callbacks if hasattr(result, "__await__"): await result # type: ignore[misc] + + +# --------------------------------------------------------------------------- +# Backwards-compat helper: streams in one call. Useful where the caller doesn't +# need to differentiate streaming-error vs streaming-success at the HTTP-status +# level (legacy code path; new code should use open_upstream_stream + +# stream_passthrough_response so non-2xx errors come back as a real JSONResponse). 
+# --------------------------------------------------------------------------- + +async def stream_passthrough( + method: str, + path: str, + body: dict[str, Any] | None, + api_surface: str, + on_complete: Callable[[dict[str, Any]], Awaitable[None] | None], + extra_headers: dict[str, str] | None = None, +) -> AsyncIterator[bytes]: + """Open + stream + close in one call. Status-checking variant lives in + open_upstream_stream/stream_passthrough_response.""" + resp, error_body = await open_upstream_stream(method, path, body, extra_headers) + if error_body is not None: + # Legacy callers can't surface a real error status here; emit an SSE + # error frame and terminate cleanly so the client doesn't hang. + try: + err_payload = json.loads(error_body) + except (ValueError, TypeError): + err_payload = { + "error": { + "message": error_body.decode("utf-8", "replace"), + "type": "upstream_error", + } + } + yield ("data: " + json.dumps(err_payload) + "\n\n").encode("utf-8") + yield b"data: [DONE]\n\n" + return + + async for chunk in stream_passthrough_response(resp, api_surface, on_complete): + yield chunk diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py index d726fdd..0fc1120 100644 --- a/tests/integration/test_openai_passthrough/test_chat_completions.py +++ b/tests/integration/test_openai_passthrough/test_chat_completions.py @@ -204,23 +204,27 @@ def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock): assert sent.headers["x-amzn-bedrock-guardrailversion"] == "DRAFT" -def test_streaming_upstream_timeout_yields_clean_sse_error( +def test_streaming_upstream_timeout_returns_json_504( client, respx_mock, mock_usage_tracker ): - """Upstream timeout during streaming should produce a structured SSE error event, not crash the stream.""" + """When upstream times out before the stream begins, the proxy must + surface a real HTTP 504 with a JSON error body (NOT a fake 200 
+ text/event-stream wrapping an SSE error frame). Strict clients can + then act on the status code instead of hanging on a malformed stream. + """ respx_mock.post("/chat/completions").mock( side_effect=httpx.ReadTimeout("upstream took too long") ) - with client.stream( - "POST", + r = client.post( "/openai/v1/chat/completions", headers={"Authorization": "Bearer sk-test"}, json={"model": "m", "messages": [], "stream": True}, - ) as r: - out = b"".join(r.iter_bytes()) + ) - assert b'"upstream_error"' in out, f"expected structured error, got: {out}" - assert b"[DONE]" in out - # No usage logged when the stream errored before any usage event arrived + assert r.status_code == 504 + assert r.headers["content-type"].startswith("application/json") + body = r.json() + assert body["error"]["type"] == "upstream_error" + assert "timeout" in body["error"]["message"].lower() assert not mock_usage_tracker.record_usage.called diff --git a/tests/integration/test_openai_passthrough/test_responses.py b/tests/integration/test_openai_passthrough/test_responses.py index 8190a98..e791651 100644 --- a/tests/integration/test_openai_passthrough/test_responses.py +++ b/tests/integration/test_openai_passthrough/test_responses.py @@ -113,3 +113,26 @@ def test_responses_upstream_error_returned_verbatim(client, respx_mock, mock_usa assert r.status_code == 400 assert r.json()["error"]["message"] == "bad input" assert not mock_usage_tracker.record_usage.called + + +def test_streaming_responses_upstream_4xx_returns_json_not_sse( + client, respx_mock, mock_usage_tracker +): + """When upstream rejects a streaming request with 4xx, the proxy must + surface a real JSON 4xx response — NOT a fake 200 text/event-stream + that wraps the error body. Strict SSE clients (codex) hang waiting + for response.completed if we send the error as event-stream. 
+ """ + err = {"error": {"message": "tools[13].type=namespace not allowed", "type": "validation_error"}} + respx_mock.post("/responses").mock(return_value=httpx.Response(400, json=err)) + + r = client.post( + "/openai/v1/responses", + headers={"Authorization": "Bearer sk-test"}, + json={"model": "m", "input": [], "stream": True, "tools": [{"type": "namespace"}]}, + ) + assert r.status_code == 400 + assert r.headers["content-type"].startswith("application/json"), \ + f"expected JSON content-type, got {r.headers['content-type']}" + assert r.json() == err + assert not mock_usage_tracker.record_usage.called