From 3e152cfac863faf08808a55c410857bcfa5e5441 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 05:16:04 +0000
Subject: [PATCH 01/22] docs(openai-passthrough): add implementation plan

16 tasks covering feature flag, auth middleware extension, usage extraction,
httpx passthrough client, /chat/completions and /responses endpoints with
streaming, full Responses CRUD, /models, guardrail header forwarding, and
documentation updates.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../plans/2026-05-25-openai-passthrough.md    | 1912 +++++++++++++++++
 1 file changed, 1912 insertions(+)
 create mode 100644 docs/superpowers/plans/2026-05-25-openai-passthrough.md

diff --git a/docs/superpowers/plans/2026-05-25-openai-passthrough.md b/docs/superpowers/plans/2026-05-25-openai-passthrough.md
new file mode 100644
index 0000000..b6bcd29
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-25-openai-passthrough.md
@@ -0,0 +1,1912 @@
+# OpenAI Passthrough Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Add `/openai/v1/*` endpoints that accept OpenAI-native Chat Completions and Responses API calls and forward them to AWS bedrock-mantle, while reusing the proxy's API key auth, rate limits, budgets, and usage tracking.
+
+**Architecture:** Raw httpx passthrough (no Pydantic schemas for OpenAI types) on a new APIRouter mounted at `/openai/v1`. Existing auth middleware extended to accept `Authorization: Bearer` in addition to `x-api-key`. Usage extracted from response bodies (non-streaming) or final SSE event (streaming) and normalized into the existing Anthropic-shaped DDB schema with new `api_surface` and `reasoning_tokens` columns. Independent of existing `ENABLE_OPENAI_COMPAT`.
+
+**Tech Stack:** Python 3.12, FastAPI, httpx (async), pytest + respx for HTTP mocking, AWS DynamoDB (boto3), uv for package management.
+
+**Reference design:** `docs/plans/2026-05-25-openai-passthrough-design.md`
+
+---
+
+## File Structure
+
+| File | Action | Purpose |
+|---|---|---|
+| `app/core/config.py` | Modify | Add `enable_openai_passthrough` flag |
+| `app/middleware/auth.py` | Modify | Accept `Authorization: Bearer` header |
+| `app/api/openai_passthrough/__init__.py` | Create | Export `router` |
+| `app/api/openai_passthrough/client.py` | Create | httpx singleton client |
+| `app/api/openai_passthrough/usage_extractor.py` | Create | Normalize OpenAI usage → Anthropic-shaped dict |
+| `app/api/openai_passthrough/streaming.py` | Create | SSE passthrough + usage tee |
+| `app/api/openai_passthrough/router.py` | Create | FastAPI routes |
+| `app/db/dynamodb.py` | Modify | Extend `UsageTracker.record_usage` with `api_surface` and `reasoning_tokens` |
+| `app/main.py` | Modify | Conditionally mount the new router |
+| `env.example` | Modify | Document the new flag |
+| `CLAUDE.md` | Modify | Add feature description |
+| `docs/architecture/features.md` | Modify | Detailed feature doc |
+| `tests/unit/test_openai_passthrough/test_usage_extractor.py` | Create | Unit tests for normalization |
+| `tests/unit/test_openai_passthrough/test_auth.py` | Create | Unit tests for header acceptance |
+| `tests/unit/test_openai_passthrough/test_model_mapping.py` | Create | Unit tests for mapping resolution |
+| `tests/unit/test_openai_passthrough/__init__.py` | Create | Test package marker |
+| `tests/integration/test_openai_passthrough/test_chat_completions.py` | Create | Chat completions integration |
+| `tests/integration/test_openai_passthrough/test_responses.py` | Create | Responses API integration |
+| `tests/integration/test_openai_passthrough/test_responses_crud.py` | Create | Responses CRUD passthrough |
+| `tests/integration/test_openai_passthrough/test_models.py` | Create | /models endpoint |
+| `tests/integration/test_openai_passthrough/conftest.py` | Create | Shared fixtures (FastAPI client, respx) |
+| `tests/integration/test_openai_passthrough/__init__.py` | Create | Test package marker |
+
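+**Tooling:** add `respx>=0.21.0` to `[project.optional-dependencies].dev` in `pyproject.toml`. Tasks 4 and 5 additionally create `app/api/openai_passthrough/model_mapping.py` and `tests/unit/test_openai_passthrough/test_usage_tracker_extended.py`, which the table above does not list.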
+
+---
+
+## Task 1: Add feature flag and respx dependency
+
+**Files:**
+- Modify: `app/core/config.py`
+- Modify: `pyproject.toml`
+
+- [ ] **Step 1: Add the feature flag to settings**
+
+In `app/core/config.py`, find the existing OpenAI-Compat block (around line 379–406) and add a new field immediately after `openai_compat_thinking_medium_threshold`:
+
+```python
+    enable_openai_passthrough: bool = Field(
+        default=False,
+        alias="ENABLE_OPENAI_PASSTHROUGH",
+        description="Mount /openai/v1/* endpoints (Chat Completions + Responses passthrough to bedrock-mantle)"
+    )
+```
+
+- [ ] **Step 2: Add respx to dev dependencies**
+
+In `pyproject.toml`, locate the `dev = [...]` list under `[project.optional-dependencies]` (around line 80–95) and add `"respx>=0.21.0",` after `"pytest-mock>=3.12.0",`.
+
+- [ ] **Step 3: Sync dependencies**
+
+Run: `unset VIRTUAL_ENV && uv sync --active --extra dev`
+Expected: `respx` and its deps resolved and installed.
+
+- [ ] **Step 4: Verify the setting loads**
+
+Run:
+```bash
+unset VIRTUAL_ENV && uv run --active python -c "from app.core.config import settings; print(settings.enable_openai_passthrough)"
+```
+Expected output: `False`
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add app/core/config.py pyproject.toml uv.lock
+git commit -m "feat(openai-passthrough): add ENABLE_OPENAI_PASSTHROUGH flag and respx dev dep"
+```
+
+---
+
+## Task 2: Extend auth middleware to accept Authorization: Bearer
+
+**Files:**
+- Modify: `app/middleware/auth.py:62-77`
+- Test: `tests/unit/test_openai_passthrough/test_auth.py`
+
+- [ ] **Step 1: Create the test package structure**
+
+Run:
+```bash
+mkdir -p tests/unit/test_openai_passthrough
+touch tests/unit/test_openai_passthrough/__init__.py
+```
+
+- [ ] **Step 2: Write the failing test**
+
+Create `tests/unit/test_openai_passthrough/test_auth.py`:
+
+```python
+"""Tests for the auth middleware's Authorization: Bearer support."""
+from unittest.mock import MagicMock, patch
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from app.middleware.auth import AuthMiddleware
+
+
+@pytest.fixture
+def make_app():
+    """Build a minimal FastAPI app wired to AuthMiddleware with a mocked validator."""
+    def _factory(api_key_info):
+        app = FastAPI()
+
+        ddb_client = MagicMock()
+        manager = MagicMock()
+        manager.validate_api_key.return_value = api_key_info
+
+        with patch("app.middleware.auth.APIKeyManager", return_value=manager):
+            app.add_middleware(AuthMiddleware, dynamodb_client=ddb_client)
+
+        from fastapi import Request
+
+        @app.get("/test")
+        async def test_endpoint(request: Request):
+            info = request.state.api_key_info
+            return {"user_id": info["user_id"]}
+
+        return app, manager
+    return _factory
+
+
+def test_authorization_bearer_resolves_when_xapikey_missing(make_app, monkeypatch):
+    """Authorization: Bearer <key> should authenticate when x-api-key is absent."""
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+
+    app, manager = make_app({"user_id": "u1", "api_key": "sk-abc"})
+    client = TestClient(app)
+
+    r = client.get("/test", headers={"Authorization": "Bearer sk-abc"})
+
+    assert r.status_code == 200
+    assert r.json() == {"user_id": "u1"}
+    manager.validate_api_key.assert_called_once_with("sk-abc")
+
+
+def test_xapikey_takes_precedence_when_both_present(make_app, monkeypatch):
+    """If both headers are present, x-api-key wins."""
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+
+    app, manager = make_app({"user_id": "u1", "api_key": "sk-from-xapikey"})
+    client = TestClient(app)
+
+    client.get(
+        "/test",
+        headers={"x-api-key": "sk-from-xapikey", "Authorization": "Bearer sk-from-bearer"},
+    )
+
+    manager.validate_api_key.assert_called_once_with("sk-from-xapikey")
+
+
+def test_missing_both_headers_returns_401(make_app, monkeypatch):
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+
+    app, _ = make_app(None)
+    client = TestClient(app)
+
+    r = client.get("/test")
+    assert r.status_code == 401
+
+
+def test_authorization_non_bearer_is_ignored(make_app, monkeypatch):
+    """Authorization: Basic ... (or anything not 'Bearer ') should not be treated as an API key."""
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+
+    app, _ = make_app(None)
+    client = TestClient(app)
+
+    r = client.get("/test", headers={"Authorization": "Basic dXNlcjpwYXNz"})
+    assert r.status_code == 401
+```
+
+- [ ] **Step 3: Run the tests to verify they fail**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_auth.py -v --no-cov`
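+Expected: `test_authorization_bearer_resolves_when_xapikey_missing` FAILS with a 401 because the middleware doesn't yet read Authorization. The other three tests exercise existing behavior (x-api-key handling and the missing-key 401) and are expected to pass already.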
+
+- [ ] **Step 4: Modify the middleware to read Authorization: Bearer**
+
+In `app/middleware/auth.py`, replace lines 62–77 (the API key extraction + missing-key 401 block) with:
+
+```python
+        # Extract API key from header (x-api-key first, fall back to Authorization: Bearer)
+        api_key = request.headers.get(settings.api_key_header)
+        if not api_key:
+            authz = request.headers.get("Authorization") or request.headers.get("authorization")
+            if authz and authz.startswith("Bearer "):
+                api_key = authz[len("Bearer "):].strip()
+
+        if not api_key:
+            print(f"[AUTH] Missing API key for {request.url.path}")
+            from fastapi.responses import JSONResponse
+            return JSONResponse(
+                status_code=status.HTTP_401_UNAUTHORIZED,
+                content={
+                    "type": "error",
+                    "error": {
+                        "type": "authentication_error",
+                        "message": f"Missing API key in {settings.api_key_header} or Authorization: Bearer header",
+                    },
+                },
+            )
+```
+
+- [ ] **Step 5: Run the tests to verify they pass**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_auth.py -v --no-cov`
+Expected: All four tests PASS.
+
+- [ ] **Step 6: Run the full unit test suite to ensure no regression**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit -q --no-cov`
+Expected: All previously-passing tests still pass.
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add app/middleware/auth.py tests/unit/test_openai_passthrough/test_auth.py tests/unit/test_openai_passthrough/__init__.py
+git commit -m "feat(auth): accept Authorization: Bearer alongside x-api-key"
+```
+
+---
+
+## Task 3: Add usage normalization function
+
+**Files:**
+- Create: `app/api/openai_passthrough/__init__.py`
+- Create: `app/api/openai_passthrough/usage_extractor.py`
+- Test: `tests/unit/test_openai_passthrough/test_usage_extractor.py`
+
+- [ ] **Step 1: Create the package directories**
+
+Run:
+```bash
+mkdir -p app/api/openai_passthrough
+touch app/api/openai_passthrough/__init__.py
+```
+
+- [ ] **Step 2: Write the failing tests**
+
+Create `tests/unit/test_openai_passthrough/test_usage_extractor.py`:
+
+```python
+"""Tests for normalize_usage and try_extract_usage_from_sse."""
+import json
+
+from app.api.openai_passthrough.usage_extractor import (
+    normalize_usage,
+    try_extract_usage_from_sse,
+)
+
+
+def test_normalize_chat_completions_basic():
+    raw = {"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150}
+    result = normalize_usage(raw, "chat_completions")
+    assert result == {
+        "input_tokens": 100,
+        "output_tokens": 50,
+        "cache_read_input_tokens": 0,
+        "cache_creation_input_tokens": 0,
+        "reasoning_tokens": 0,
+    }
+
+
+def test_normalize_chat_completions_with_cache_and_reasoning():
+    raw = {
+        "prompt_tokens": 100,
+        "completion_tokens": 50,
+        "prompt_tokens_details": {"cached_tokens": 30},
+        "completion_tokens_details": {"reasoning_tokens": 20},
+    }
+    result = normalize_usage(raw, "chat_completions")
+    # cache hits subtracted from input
+    assert result["input_tokens"] == 70
+    assert result["output_tokens"] == 50
+    assert result["cache_read_input_tokens"] == 30
+    assert result["reasoning_tokens"] == 20
+
+
+def test_normalize_responses_basic():
+    raw = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}
+    result = normalize_usage(raw, "responses")
+    assert result["input_tokens"] == 100
+    assert result["output_tokens"] == 50
+    assert result["cache_read_input_tokens"] == 0
+    assert result["reasoning_tokens"] == 0
+
+
+def test_normalize_responses_with_cache_and_reasoning():
+    raw = {
+        "input_tokens": 100,
+        "output_tokens": 50,
+        "input_tokens_details": {"cached_tokens": 25},
+        "output_tokens_details": {"reasoning_tokens": 15},
+    }
+    result = normalize_usage(raw, "responses")
+    assert result["input_tokens"] == 75
+    assert result["output_tokens"] == 50
+    assert result["cache_read_input_tokens"] == 25
+    assert result["reasoning_tokens"] == 15
+
+
+def test_normalize_handles_missing_fields():
+    """An empty usage dict should normalize to all-zeros, not crash."""
+    result = normalize_usage({}, "chat_completions")
+    assert result == {
+        "input_tokens": 0, "output_tokens": 0,
+        "cache_read_input_tokens": 0, "cache_creation_input_tokens": 0,
+        "reasoning_tokens": 0,
+    }
+
+
+def test_extract_chat_completions_usage_from_sse_chunk():
+    """Final chat-completions chunk with usage should be picked up."""
+    line = "data: " + json.dumps({
+        "id": "chatcmpl-1", "choices": [],
+        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
+    })
+    holder: dict = {}
+    try_extract_usage_from_sse(line, holder, "chat_completions")
+    assert holder == {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
+
+
+def test_extract_responses_usage_from_response_completed_event():
+    line = "data: " + json.dumps({
+        "type": "response.completed",
+        "response": {
+            "id": "resp-1",
+            "usage": {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28},
+        },
+    })
+    holder: dict = {}
+    try_extract_usage_from_sse(line, holder, "responses")
+    assert holder == {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28}
+
+
+def test_extract_ignores_non_data_lines():
+    holder: dict = {}
+    try_extract_usage_from_sse("event: response.completed", holder, "responses")
+    try_extract_usage_from_sse("", holder, "responses")
+    try_extract_usage_from_sse(": keepalive", holder, "responses")
+    assert holder == {}
+
+
+def test_extract_ignores_data_done():
+    holder: dict = {}
+    try_extract_usage_from_sse("data: [DONE]", holder, "chat_completions")
+    assert holder == {}
+
+
+def test_extract_ignores_chunks_without_usage():
+    line = "data: " + json.dumps({"choices": [{"delta": {"content": "hi"}}]})
+    holder: dict = {}
+    try_extract_usage_from_sse(line, holder, "chat_completions")
+    assert holder == {}
+
+
+def test_extract_ignores_malformed_json():
+    holder: dict = {}
+    try_extract_usage_from_sse("data: not-json", holder, "chat_completions")
+    assert holder == {}
+```
+
+- [ ] **Step 3: Run the tests to verify they fail**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_usage_extractor.py -v --no-cov`
+Expected: ImportError — module doesn't exist yet.
+
+- [ ] **Step 4: Implement usage_extractor**
+
+Create `app/api/openai_passthrough/usage_extractor.py`:
+
+```python
+"""Usage extraction and normalization for OpenAI-format responses.
+
+normalize_usage() converts an OpenAI Chat Completions or Responses API usage
+dict into the Anthropic-shaped dict that UsageTracker.record_usage expects,
+plus a separate reasoning_tokens field.
+
+try_extract_usage_from_sse() peeks at SSE lines during streaming and stashes
+the most recent usage dict it encounters (raw OpenAI shape). The caller
+later passes that dict through normalize_usage().
+"""
+from __future__ import annotations
+
+import json
+from typing import Any, Dict
+
+
+def normalize_usage(raw: Dict[str, Any], api_surface: str) -> Dict[str, int]:
+    """Normalize OpenAI-shaped usage into Anthropic-shaped fields.
+
+    api_surface: "chat_completions" or "responses"
+    """
+    if api_surface == "chat_completions":
+        in_tok = int(raw.get("prompt_tokens", 0) or 0)
+        out_tok = int(raw.get("completion_tokens", 0) or 0)
+        cached = int((raw.get("prompt_tokens_details") or {}).get("cached_tokens", 0) or 0)
+        reasoning = int(
+            (raw.get("completion_tokens_details") or {}).get("reasoning_tokens", 0) or 0
+        )
+    else:  # responses
+        in_tok = int(raw.get("input_tokens", 0) or 0)
+        out_tok = int(raw.get("output_tokens", 0) or 0)
+        cached = int((raw.get("input_tokens_details") or {}).get("cached_tokens", 0) or 0)
+        reasoning = int(
+            (raw.get("output_tokens_details") or {}).get("reasoning_tokens", 0) or 0
+        )
+
+    # Cache-read tokens are billed separately, so subtract them from input_tokens
+    # to mirror how the Anthropic flow accounts for cache hits.
+    return {
+        "input_tokens": max(in_tok - cached, 0),
+        "output_tokens": out_tok,
+        "cache_read_input_tokens": cached,
+        "cache_creation_input_tokens": 0,  # Not exposed by OpenAI-format APIs
+        "reasoning_tokens": reasoning,
+    }
+
+
+def try_extract_usage_from_sse(
+    raw_line: str, holder: Dict[str, Any], api_surface: str
+) -> None:
+    """Inspect an SSE line and, if it carries usage info, store it in holder.
+
+    Mutates `holder` in place. Idempotent: subsequent calls overwrite, so the
+    last-seen usage event wins (which is what we want — both APIs put usage
+    on the terminal event).
+    """
+    line = raw_line.strip()
+    if not line.startswith("data:"):
+        return
+
+    payload = line[len("data:"):].strip()
+    if not payload or payload == "[DONE]":
+        return
+
+    try:
+        obj = json.loads(payload)
+    except (ValueError, TypeError):
+        return
+
+    if api_surface == "chat_completions":
+        usage = obj.get("usage")
+        if isinstance(usage, dict):
+            holder.clear()
+            holder.update(usage)
+    else:  # responses
+        # Usage lives on the `response.completed` event under
+        # event.response.usage. Usage carried by any other event type is
+        # ignored; only `response.completed` is read.
+        if obj.get("type") == "response.completed":
+            response_obj = obj.get("response") or {}
+            usage = response_obj.get("usage")
+            if isinstance(usage, dict):
+                holder.clear()
+                holder.update(usage)
+```
+
+- [ ] **Step 5: Run the tests to verify they pass**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_usage_extractor.py -v --no-cov`
+Expected: All 11 tests PASS.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add app/api/openai_passthrough/__init__.py app/api/openai_passthrough/usage_extractor.py tests/unit/test_openai_passthrough/test_usage_extractor.py
+git commit -m "feat(openai-passthrough): add usage normalization and SSE extraction helpers"
+```
+
+---
+
+## Task 4: Add model mapping resolver helper
+
+**Files:**
+- Create: `app/api/openai_passthrough/model_mapping.py`
+- Test: `tests/unit/test_openai_passthrough/test_model_mapping.py`
+
+- [ ] **Step 1: Write the failing test**
+
+Create `tests/unit/test_openai_passthrough/test_model_mapping.py`:
+
+```python
+"""Tests for resolve_model_id."""
+from unittest.mock import MagicMock
+
+from app.api.openai_passthrough.model_mapping import resolve_model_id
+
+
+def test_returns_mapped_id_when_mapping_exists():
+    manager = MagicMock()
+    manager.get_mapping.return_value = "openai.gpt-oss-120b"
+
+    out = resolve_model_id("gpt-4", manager)
+    assert out == "openai.gpt-oss-120b"
+    manager.get_mapping.assert_called_once_with("gpt-4")
+
+
+def test_passes_through_when_no_mapping_exists():
+    manager = MagicMock()
+    manager.get_mapping.return_value = None
+
+    out = resolve_model_id("openai.gpt-oss-120b", manager)
+    assert out == "openai.gpt-oss-120b"
+
+
+def test_passes_through_empty_string():
+    manager = MagicMock()
+    manager.get_mapping.return_value = None
+
+    assert resolve_model_id("", manager) == ""
+
+
+def test_handles_lookup_exception_by_passing_through():
+    """If DDB lookup raises, fall back to the original ID rather than crashing the request."""
+    manager = MagicMock()
+    manager.get_mapping.side_effect = RuntimeError("ddb down")
+
+    out = resolve_model_id("gpt-4", manager)
+    assert out == "gpt-4"
+```
+
+- [ ] **Step 2: Run the test to verify it fails**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_model_mapping.py -v --no-cov`
+Expected: ImportError.
+
+- [ ] **Step 3: Implement resolve_model_id**
+
+Create `app/api/openai_passthrough/model_mapping.py`:
+
+```python
+"""Model ID resolution for the OpenAI passthrough endpoints.
+
+Looks up the client-supplied model in the existing model_mapping table; if a
+mapping exists, substitute it. Otherwise, pass through unchanged so callers
+can use Bedrock-native IDs (e.g. ``openai.gpt-oss-120b``) directly without
+needing to register them.
+"""
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_model_id(model: str, model_mapping_manager) -> str:
+    """Resolve a client-supplied model ID via the mapping table, with fallback.
+
+    Args:
+        model: The ``model`` field from the client request.
+        model_mapping_manager: An app.db.dynamodb.ModelMappingManager instance.
+
+    Returns:
+        The resolved Bedrock model ID, or the original string if no mapping
+        exists or the lookup fails.
+    """
+    if not model:
+        return model
+    try:
+        mapped = model_mapping_manager.get_mapping(model)
+    except Exception as exc:
+        logger.warning("[OPENAI-PASSTHROUGH] model mapping lookup failed for %r: %s", model, exc)
+        return model
+    return mapped or model
+```
+
+- [ ] **Step 4: Run the tests to verify they pass**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_model_mapping.py -v --no-cov`
+Expected: All four tests PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add app/api/openai_passthrough/model_mapping.py tests/unit/test_openai_passthrough/test_model_mapping.py
+git commit -m "feat(openai-passthrough): add model mapping resolver with passthrough fallback"
+```
+
+---
+
+## Task 5: Extend UsageTracker.record_usage with api_surface and reasoning_tokens
+
+**Files:**
+- Modify: `app/db/dynamodb.py:908-970`
+- Test: `tests/unit/test_openai_passthrough/test_usage_tracker_extended.py`
+
+- [ ] **Step 1: Write the failing test**
+
+Create `tests/unit/test_openai_passthrough/test_usage_tracker_extended.py`:
+
+```python
+"""Tests for the api_surface and reasoning_tokens additions to UsageTracker."""
+from unittest.mock import MagicMock
+
+from app.db.dynamodb import UsageTracker
+
+
+def _make_tracker():
+    ddb_client = MagicMock()
+    ddb_client.usage_table_name = "anthropic-proxy-usage"
+    tracker = UsageTracker(ddb_client)
+    tracker.table = MagicMock()
+    return tracker
+
+
+def test_record_usage_writes_api_surface_when_provided():
+    tracker = _make_tracker()
+    tracker.record_usage(
+        api_key="sk-x",
+        request_id="req-1",
+        model="openai.gpt-oss-120b",
+        input_tokens=100,
+        output_tokens=50,
+        api_surface="chat_completions",
+    )
+    item = tracker.table.put_item.call_args.kwargs["Item"]
+    assert item["api_surface"] == "chat_completions"
+
+
+def test_record_usage_writes_reasoning_tokens_when_provided():
+    tracker = _make_tracker()
+    tracker.record_usage(
+        api_key="sk-x", request_id="req-1", model="m",
+        input_tokens=10, output_tokens=5, reasoning_tokens=3,
+    )
+    item = tracker.table.put_item.call_args.kwargs["Item"]
+    assert item["reasoning_tokens"] == 3
+
+
+def test_record_usage_omits_new_fields_when_default():
+    tracker = _make_tracker()
+    tracker.record_usage(
+        api_key="sk-x", request_id="req-1", model="m",
+        input_tokens=10, output_tokens=5,
+    )
+    item = tracker.table.put_item.call_args.kwargs["Item"]
+    # Sparse: not written when caller didn't specify them
+    assert "api_surface" not in item
+    assert "reasoning_tokens" not in item
+```
+
+- [ ] **Step 2: Run the test to verify it fails**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_usage_tracker_extended.py -v --no-cov`
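+Expected: TypeError in the two tests that pass `api_surface` / `reasoning_tokens` — record_usage doesn't accept the new kwargs yet. The defaults test does not use them and may already pass.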
+
+- [ ] **Step 3: Modify record_usage**
+
+In `app/db/dynamodb.py`, find `UsageTracker.record_usage` (line 908). Add two new optional parameters to its signature, after `cache_ttl`:
+
+```python
+        cache_ttl: Optional[str] = None,
+        api_surface: Optional[str] = None,
+        reasoning_tokens: int = 0,
+    ):
+```
+
+Update the docstring's Args block to document them:
+
+```
+            api_surface: Source endpoint family ("messages", "chat_completions", or "responses")
+            reasoning_tokens: Reasoning tokens (already counted in output_tokens; stored separately for visibility)
+```
+
+Then, after the existing `if cache_ttl:` block (around line 962–963), add:
+
+```python
+        if api_surface:
+            item["api_surface"] = api_surface
+        if reasoning_tokens:
+            item["reasoning_tokens"] = reasoning_tokens
+```
+
+- [ ] **Step 4: Run the tests to verify they pass**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough/test_usage_tracker_extended.py -v --no-cov`
+Expected: All three tests PASS.
+
+- [ ] **Step 5: Run the full unit suite to check nothing regressed**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit -q --no-cov`
+Expected: All previously-passing tests still pass.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add app/db/dynamodb.py tests/unit/test_openai_passthrough/test_usage_tracker_extended.py
+git commit -m "feat(usage): record api_surface and reasoning_tokens on usage rows"
+```
+
+---
+
+## Task 6: Implement httpx client singleton
+
+**Files:**
+- Create: `app/api/openai_passthrough/client.py`
+
+This is a small helper without business logic, so we test it indirectly through the integration tests (Tasks 8–11). No standalone unit tests.
+
+- [ ] **Step 1: Write the client module**
+
+Create `app/api/openai_passthrough/client.py`:
+
+```python
+"""Async httpx client to bedrock-mantle, lazily constructed and reused.
+
+Headers are NOT set on the client itself; they're added per-request in the
+router so we can include the proxy's Bedrock API key in Authorization.
+"""
+from __future__ import annotations
+
+import httpx
+
+from app.core.config import settings
+
+_client: httpx.AsyncClient | None = None
+
+
+def get_client() -> httpx.AsyncClient:
+    global _client
+    if _client is None:
+        _client = httpx.AsyncClient(
+            base_url=settings.openai_base_url,
+            timeout=httpx.Timeout(settings.bedrock_timeout, connect=10.0),
+            limits=httpx.Limits(max_connections=200, max_keepalive_connections=50),
+        )
+    return _client
+
+
+def reset_client_for_testing() -> None:
+    """Reset the singleton — only call this from test fixtures."""
+    global _client
+    if _client is not None:
+        # AsyncClient.aclose() is async; tests will close the loop after, so we
+        # null it here and let the GC clean up the underlying transport.
+        _client = None
+
+
+def upstream_headers(extra: dict[str, str] | None = None) -> dict[str, str]:
+    """Build the Authorization + standard headers for an upstream call."""
+    headers = {
+        "Authorization": f"Bearer {settings.openai_api_key}",
+        "Content-Type": "application/json",
+        "User-Agent": "bedrock-api-proxy/openai-passthrough",
+    }
+    if extra:
+        headers.update(extra)
+    return headers
+```
+
+- [ ] **Step 2: Smoke-test the import**
+
+Run:
+```bash
+unset VIRTUAL_ENV && uv run --active python -c "from app.api.openai_passthrough.client import get_client, upstream_headers; print(upstream_headers())"
+```
+Expected: prints a dict with `Authorization: Bearer ` (the configured key, possibly empty), `Content-Type`, and `User-Agent`.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add app/api/openai_passthrough/client.py
+git commit -m "feat(openai-passthrough): add httpx singleton client and header helper"
+```
+
+---
+
+## Task 7: Implement streaming passthrough helper
+
+**Files:**
+- Create: `app/api/openai_passthrough/streaming.py`
+
+Tested indirectly through integration tests in Task 9.
+
+- [ ] **Step 1: Write the streaming module**
+
+Create `app/api/openai_passthrough/streaming.py`:
+
+```python
+"""SSE passthrough with usage tee.
+
+The async generator re-emits each upstream SSE line as UTF-8 bytes (with the
+trailing newline restored) so the FastAPI StreamingResponse can forward it.
+After the upstream stream ends, it calls the supplied on_complete callback with
+the captured usage dict (skipped if none was seen) to record usage to DynamoDB.
+"""
+from __future__ import annotations
+
+import logging
+from typing import Any, AsyncIterator, Awaitable, Callable, Dict
+
+from app.api.openai_passthrough.client import get_client, upstream_headers
+from app.api.openai_passthrough.usage_extractor import try_extract_usage_from_sse
+
+logger = logging.getLogger(__name__)
+
+
+async def stream_passthrough(
+    method: str,
+    path: str,
+    body: Dict[str, Any] | None,
+    api_surface: str,
+    on_complete: Callable[[Dict[str, Any]], Awaitable[None] | None],
+    extra_headers: Dict[str, str] | None = None,
+) -> AsyncIterator[bytes]:
+    """Stream upstream response bytes line-by-line; capture usage; trigger callback."""
+    usage: Dict[str, Any] = {}
+
+    client = get_client()
+    headers = upstream_headers(extra_headers)
+
+    try:
+        async with client.stream(method, path, json=body, headers=headers) as resp:
+            async for raw_line in resp.aiter_lines():
+                # Upstream gives us SSE lines without trailing newlines; restore the
+                # framing byte so the SSE body is well-formed for the downstream client.
+                yield (raw_line + "\n").encode("utf-8")
+                try_extract_usage_from_sse(raw_line, usage, api_surface)
+    except Exception as exc:
+        logger.error("[OPENAI-PASSTHROUGH] upstream stream error: %s", exc)
+        # Re-raise so the error propagates; if the response has already started, the client just sees the stream end (no 500).
+        raise
+
+    if usage:
+        result = on_complete(usage)
+        # Support both sync and async callbacks
+        if hasattr(result, "__await__"):
+            await result  # type: ignore[func-returns-value]
+```
+
+- [ ] **Step 2: Smoke-test the import**
+
+Run:
+```bash
+unset VIRTUAL_ENV && uv run --active python -c "from app.api.openai_passthrough.streaming import stream_passthrough; print('ok')"
+```
+Expected: prints `ok`.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add app/api/openai_passthrough/streaming.py
+git commit -m "feat(openai-passthrough): add SSE passthrough generator with usage tee"
+```
+
+---
+
+## Task 8: Implement router skeleton + chat/completions (non-streaming)
+
+**Files:**
+- Create: `app/api/openai_passthrough/router.py`
+- Modify: `app/main.py:298-314`
+- Create: `tests/integration/test_openai_passthrough/__init__.py`
+- Create: `tests/integration/test_openai_passthrough/conftest.py`
+- Test: `tests/integration/test_openai_passthrough/test_chat_completions.py`
+
+- [ ] **Step 1: Create the integration test scaffolding**
+
+Run:
+```bash
+mkdir -p tests/integration/test_openai_passthrough
+touch tests/integration/test_openai_passthrough/__init__.py
+```
+
+Create `tests/integration/test_openai_passthrough/conftest.py`:
+
+```python
+"""Shared fixtures for openai-passthrough integration tests."""
+from unittest.mock import MagicMock, patch
+
+import pytest
+import respx
+from fastapi.testclient import TestClient
+
+
+@pytest.fixture
+def mock_settings(monkeypatch):
+    """Set the env so the passthrough router mounts and points at a fake mantle."""
+    monkeypatch.setattr("app.core.config.settings.enable_openai_passthrough", True)
+    monkeypatch.setattr("app.core.config.settings.openai_api_key", "bedrock-key-test")
+    monkeypatch.setattr("app.core.config.settings.openai_base_url", "https://mantle.test/v1")
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+
+
+@pytest.fixture
+def mock_api_key_manager():
+    """Patch APIKeyManager so any non-empty key validates as user 'u1'."""
+    manager = MagicMock()
+    manager.validate_api_key.return_value = {
+        "api_key": "sk-test", "user_id": "u1", "is_master": False,
+        "rate_limit": None, "cache_ttl": None,
+    }
+    with patch("app.middleware.auth.APIKeyManager", return_value=manager):
+        yield manager
+
+
+@pytest.fixture
+def mock_model_mapping_manager():
+    """Patch ModelMappingManager to return None (no mapping) by default."""
+    manager = MagicMock()
+    manager.get_mapping.return_value = None
+    with patch("app.db.dynamodb.ModelMappingManager", return_value=manager):
+        yield manager
+
+
+@pytest.fixture
+def mock_usage_tracker():
+    tracker = MagicMock()
+    with patch("app.db.dynamodb.UsageTracker", return_value=tracker):
+        yield tracker
+
+
+@pytest.fixture
+def respx_mock():
+    """respx mock router for httpx calls."""
+    with respx.mock(base_url="https://mantle.test/v1", assert_all_called=False) as router:
+        yield router
+
+
+@pytest.fixture
+def client(mock_settings, mock_api_key_manager, mock_model_mapping_manager, mock_usage_tracker):
+    """FastAPI TestClient with all mocks wired in.
+
+    Imports inside the fixture so module-level settings reads happen after
+    monkeypatching.
+    """
+    # Reset httpx singleton so it picks up the patched base URL
+    from app.api.openai_passthrough.client import reset_client_for_testing
+    reset_client_for_testing()
+
+    from app.main import app
+    return TestClient(app)
+```
+
+- [ ] **Step 2: Write the failing test**
+
+Create `tests/integration/test_openai_passthrough/test_chat_completions.py`:
+
+```python
+"""Integration tests for POST /openai/v1/chat/completions."""
+import json
+
+import respx
+import httpx
+
+
+def test_non_streaming_chat_completions_forwards_and_logs_usage(
+    client, respx_mock, mock_usage_tracker, mock_model_mapping_manager
+):
+    upstream_resp = {
+        "id": "chatcmpl-1",
+        "object": "chat.completion",
+        "model": "openai.gpt-oss-120b",
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": "hi"}, "finish_reason": "stop"}],
+        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
+    }
+    route = respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(200, json=upstream_resp)
+    )
+
+    r = client.post(
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={
+            "model": "openai.gpt-oss-120b",
+            "messages": [{"role": "user", "content": "hi"}],
+        },
+    )
+
+    assert r.status_code == 200
+    assert r.json() == upstream_resp
+    assert route.called
+    # Upstream got proxy's Bedrock API key, not the client's proxy key
+    sent = route.calls[0].request
+    assert sent.headers["authorization"] == "Bearer bedrock-key-test"
+    sent_body = json.loads(sent.content)
+    assert sent_body["model"] == "openai.gpt-oss-120b"
+    # Usage was recorded
+    assert mock_usage_tracker.record_usage.called
+    kwargs = mock_usage_tracker.record_usage.call_args.kwargs
+    assert kwargs["input_tokens"] == 10
+    assert kwargs["output_tokens"] == 5
+    assert kwargs["api_surface"] == "chat_completions"
+    assert kwargs["model"] == "openai.gpt-oss-120b"
+
+
+def test_model_mapping_is_applied(
+    client, respx_mock, mock_model_mapping_manager
+):
+    mock_model_mapping_manager.get_mapping.return_value = "openai.gpt-oss-120b"
+    route = respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(200, json={
+            "id": "x", "choices": [], "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2}
+        })
+    )
+
+    client.post(
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "gpt-4", "messages": [{"role": "user", "content": "hi"}]},
+    )
+
+    sent = json.loads(route.calls[0].request.content)
+    assert sent["model"] == "openai.gpt-oss-120b"
+
+
+def test_upstream_4xx_returned_verbatim(client, respx_mock, mock_usage_tracker):
+    err_body = {"error": {"message": "model not found", "type": "invalid_request_error"}}
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(404, json=err_body)
+    )
+
+    r = client.post(
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "no-such-model", "messages": []},
+    )
+    assert r.status_code == 404
+    assert r.json() == err_body
+    assert not mock_usage_tracker.record_usage.called  # Don't log usage on errors
+
+
+def test_missing_auth_returns_401(client):
+    r = client.post(
+        "/openai/v1/chat/completions",
+        json={"model": "x", "messages": []},
+    )
+    assert r.status_code == 401
+```
+
+- [ ] **Step 3: Run the test to verify it fails**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_chat_completions.py -v --no-cov`
+Expected: tests fail — endpoint doesn't exist yet (404 from FastAPI).
+
+- [ ] **Step 4: Implement the router**
+
+Create `app/api/openai_passthrough/router.py`:
+
+```python
+"""FastAPI routes for the OpenAI passthrough endpoints.
+
+Mounted at /openai/v1 only when settings.enable_openai_passthrough is True.
+"""
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict
+from uuid import uuid4
+
+from fastapi import APIRouter, Depends, Request, Response
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from app.api.openai_passthrough.client import get_client, upstream_headers
+from app.api.openai_passthrough.model_mapping import resolve_model_id
+from app.api.openai_passthrough.streaming import stream_passthrough
+from app.api.openai_passthrough.usage_extractor import normalize_usage
+from app.db.dynamodb import DynamoDBClient, ModelMappingManager, UsageTracker
+from app.middleware.auth import get_api_key_info
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+_ddb: DynamoDBClient | None = None
+_mapping: ModelMappingManager | None = None
+_usage: UsageTracker | None = None
+
+
+def _managers():
+    """Lazily build DDB managers — keeps import-time side effects out of tests."""
+    global _ddb, _mapping, _usage
+    if _ddb is None:
+        _ddb = DynamoDBClient()
+        _mapping = ModelMappingManager(_ddb)
+        _usage = UsageTracker(_ddb)
+    return _mapping, _usage
+
+
+def _record_usage(api_key_info: Dict[str, Any], raw_usage: Dict[str, Any], model: str, api_surface: str) -> None:
+    _, usage = _managers()
+    norm = normalize_usage(raw_usage, api_surface)
+    try:
+        usage.record_usage(
+            api_key=api_key_info.get("api_key", ""),
+            request_id=str(uuid4()),
+            model=model,
+            input_tokens=norm["input_tokens"],
+            output_tokens=norm["output_tokens"],
+            cached_tokens=norm["cache_read_input_tokens"],
+            cache_write_input_tokens=norm["cache_creation_input_tokens"],
+            api_surface=api_surface,
+            reasoning_tokens=norm["reasoning_tokens"],
+        )
+    except Exception as exc:
+        logger.warning("[OPENAI-PASSTHROUGH] usage recording failed: %s", exc)
+
+
+def _passthrough_extra_headers(request: Request) -> Dict[str, str]:
+    """Forward Bedrock-specific headers from the client to upstream (e.g. guardrails)."""
+    extra: Dict[str, str] = {}
+    for name, value in request.headers.items():
+        if name.lower().startswith("x-amzn-bedrock-"):
+            extra[name] = value
+    return extra
+
+
+@router.post("/chat/completions")
+async def chat_completions(
+    request: Request,
+    api_key_info: Dict[str, Any] = Depends(get_api_key_info),
+):
+    body = await request.json()
+    mapping, _ = _managers()
+    body["model"] = resolve_model_id(body.get("model", ""), mapping)
+    extra = _passthrough_extra_headers(request)
+
+    if body.get("stream"):
+        async def on_complete(usage: Dict[str, Any]) -> None:
+            _record_usage(api_key_info, usage, body["model"], "chat_completions")
+        return StreamingResponse(
+            stream_passthrough(
+                "POST", "/chat/completions", body, "chat_completions", on_complete, extra
+            ),
+            media_type="text/event-stream",
+        )
+
+    resp = await get_client().post(
+        "/chat/completions", json=body, headers=upstream_headers(extra)
+    )
+    if resp.status_code >= 400:
+        return JSONResponse(_safe_json(resp), status_code=resp.status_code)
+
+    data = resp.json()
+    if isinstance(data, dict) and isinstance(data.get("usage"), dict):
+        _record_usage(api_key_info, data["usage"], body["model"], "chat_completions")
+    return JSONResponse(data, status_code=resp.status_code)
+
+
+def _safe_json(resp) -> Dict[str, Any]:
+    try:
+        return resp.json()
+    except ValueError:
+        return {"error": {"message": resp.text, "type": "upstream_error"}}
+```
+
+- [ ] **Step 5: Wire up `__init__.py`**
+
+In `app/api/openai_passthrough/__init__.py`, replace the empty file with:
+
+```python
+"""OpenAI Passthrough — accepts OpenAI Chat Completions and Responses API
+calls from clients and forwards them to AWS bedrock-mantle.
+"""
+from app.api.openai_passthrough.router import router
+
+__all__ = ["router"]
+```
+
+- [ ] **Step 6: Mount the router in main.py**
+
+In `app/main.py`, after the existing `app.include_router(models.router, ...)` block (around line 314), add:
+
+```python
+if settings.enable_openai_passthrough:
+    from app.api.openai_passthrough import router as openai_passthrough_router
+    app.include_router(
+        openai_passthrough_router,
+        prefix="/openai/v1",
+        tags=["OpenAI Passthrough"],
+    )
+```
+
+- [ ] **Step 7: Run the integration tests**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_chat_completions.py -v --no-cov`
+Expected: All four tests PASS.
+
+- [ ] **Step 8: Run the full unit suite to check no regression**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit -q --no-cov`
+Expected: All previously-passing tests still pass.
+
+- [ ] **Step 9: Commit**
+
+```bash
+git add app/api/openai_passthrough/router.py app/api/openai_passthrough/__init__.py app/main.py tests/integration/test_openai_passthrough/__init__.py tests/integration/test_openai_passthrough/conftest.py tests/integration/test_openai_passthrough/test_chat_completions.py
+git commit -m "feat(openai-passthrough): non-streaming /chat/completions endpoint"
+```
+
+---
+
+## Task 9: Add streaming support to chat/completions
+
+**Files:**
+- Test: `tests/integration/test_openai_passthrough/test_chat_completions.py` (extend)
+
+The router already sends requests whose body sets `"stream": true` through `stream_passthrough`; this task validates the path end-to-end and adds a test for the case where the client omits `stream_options.include_usage`.
+
+- [ ] **Step 1: Append failing tests**
+
+Append the following to `tests/integration/test_openai_passthrough/test_chat_completions.py`:
+
+```python
+def test_streaming_chat_completions_forwards_sse_and_records_usage(
+    client, respx_mock, mock_usage_tracker
+):
+    """Stream three SSE chunks; the second-to-last carries usage."""
+    sse_lines = [
+        'data: {"id":"x","choices":[{"index":0,"delta":{"role":"assistant"}}]}',
+        'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}',
+        'data: {"id":"x","choices":[],"usage":{"prompt_tokens":7,"completion_tokens":2,"total_tokens":9}}',
+        'data: [DONE]',
+    ]
+    body = "\n".join(sse_lines).encode()
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(
+            200, headers={"content-type": "text/event-stream"}, content=body
+        )
+    )
+
+    with client.stream(
+        "POST",
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={
+            "model": "openai.gpt-oss-120b",
+            "messages": [{"role": "user", "content": "hi"}],
+            "stream": True,
+            "stream_options": {"include_usage": True},
+        },
+    ) as r:
+        assert r.status_code == 200
+        out = b"".join(r.iter_bytes())
+
+    # All four lines forwarded
+    assert b'"delta":{"role":"assistant"}' in out
+    assert b'[DONE]' in out
+    # Usage recorded from the chunk that had it
+    assert mock_usage_tracker.record_usage.called
+    kw = mock_usage_tracker.record_usage.call_args.kwargs
+    assert kw["input_tokens"] == 7
+    assert kw["output_tokens"] == 2
+    assert kw["api_surface"] == "chat_completions"
+
+
+def test_streaming_chat_completions_without_include_usage_does_not_log(
+    client, respx_mock, mock_usage_tracker
+):
+    """If client doesn't request usage, no usage chunk arrives → no usage logged."""
+    sse_lines = [
+        'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}',
+        'data: [DONE]',
+    ]
+    body = "\n".join(sse_lines).encode()
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(
+            200, headers={"content-type": "text/event-stream"}, content=body
+        )
+    )
+
+    with client.stream(
+        "POST", "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "m", "messages": [], "stream": True},
+    ) as r:
+        list(r.iter_bytes())  # drain
+
+    assert not mock_usage_tracker.record_usage.called
+```
+
+- [ ] **Step 2: Run the tests**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_chat_completions.py -v --no-cov`
+Expected: All six tests PASS (including the two new streaming tests).
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add tests/integration/test_openai_passthrough/test_chat_completions.py
+git commit -m "test(openai-passthrough): streaming /chat/completions integration tests"
+```
+
+---
+
+## Task 10: Add Responses API POST endpoint (streaming + non-streaming)
+
+**Files:**
+- Modify: `app/api/openai_passthrough/router.py`
+- Test: `tests/integration/test_openai_passthrough/test_responses.py`
+
+- [ ] **Step 1: Write the failing tests**
+
+Create `tests/integration/test_openai_passthrough/test_responses.py`:
+
+```python
+"""Integration tests for POST /openai/v1/responses (streaming + non-streaming)."""
+import json
+
+import httpx
+
+
+def test_non_streaming_responses_forwards_and_logs_usage(
+    client, respx_mock, mock_usage_tracker
+):
+    upstream = {
+        "id": "resp-1",
+        "object": "response",
+        "model": "openai.gpt-oss-120b",
+        "output": [{"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "hi"}]}],
+        "usage": {"input_tokens": 11, "output_tokens": 4, "total_tokens": 15},
+    }
+    route = respx_mock.post("/responses").mock(return_value=httpx.Response(200, json=upstream))
+
+    r = client.post(
+        "/openai/v1/responses",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "openai.gpt-oss-120b", "input": [{"role": "user", "content": "hi"}]},
+    )
+
+    assert r.status_code == 200
+    assert r.json() == upstream
+    assert route.called
+    kw = mock_usage_tracker.record_usage.call_args.kwargs
+    assert kw["input_tokens"] == 11
+    assert kw["output_tokens"] == 4
+    assert kw["api_surface"] == "responses"
+
+
+def test_streaming_responses_records_usage_from_response_completed(
+    client, respx_mock, mock_usage_tracker
+):
+    sse_lines = [
+        'event: response.created',
+        'data: {"type":"response.created","response":{"id":"r-1"}}',
+        'event: response.output_text.delta',
+        'data: {"type":"response.output_text.delta","delta":"hi"}',
+        'event: response.completed',
+        'data: ' + json.dumps({
+            "type": "response.completed",
+            "response": {"id": "r-1", "usage": {"input_tokens": 12, "output_tokens": 3, "total_tokens": 15}},
+        }),
+    ]
+    body = "\n".join(sse_lines).encode()
+    respx_mock.post("/responses").mock(
+        return_value=httpx.Response(200, headers={"content-type": "text/event-stream"}, content=body)
+    )
+
+    with client.stream(
+        "POST", "/openai/v1/responses",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "openai.gpt-oss-120b", "input": [{"role": "user", "content": "hi"}], "stream": True},
+    ) as r:
+        out = b"".join(r.iter_bytes())
+
+    assert b"response.completed" in out
+    assert b"hi" in out
+    kw = mock_usage_tracker.record_usage.call_args.kwargs
+    assert kw["input_tokens"] == 12
+    assert kw["output_tokens"] == 3
+    assert kw["api_surface"] == "responses"
+
+
+def test_responses_upstream_error_returned_verbatim(client, respx_mock, mock_usage_tracker):
+    respx_mock.post("/responses").mock(
+        return_value=httpx.Response(400, json={"error": {"message": "bad input", "type": "invalid_request_error"}})
+    )
+    r = client.post(
+        "/openai/v1/responses",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "m", "input": []},
+    )
+    assert r.status_code == 400
+    assert r.json()["error"]["message"] == "bad input"
+    assert not mock_usage_tracker.record_usage.called
+```
+
+- [ ] **Step 2: Run the tests to verify they fail**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_responses.py -v --no-cov`
+Expected: 404s — endpoint doesn't exist yet.
+
+- [ ] **Step 3: Add the responses endpoint to the router**
+
+In `app/api/openai_passthrough/router.py`, immediately after the `chat_completions` function, add:
+
+```python
+@router.post("/responses")
+async def responses_create(
+    request: Request,
+    api_key_info: Dict[str, Any] = Depends(get_api_key_info),
+):
+    body = await request.json()
+    mapping, _ = _managers()
+    body["model"] = resolve_model_id(body.get("model", ""), mapping)
+    extra = _passthrough_extra_headers(request)
+
+    if body.get("stream"):
+        async def on_complete(usage: Dict[str, Any]) -> None:
+            _record_usage(api_key_info, usage, body["model"], "responses")
+        return StreamingResponse(
+            stream_passthrough("POST", "/responses", body, "responses", on_complete, extra),
+            media_type="text/event-stream",
+        )
+
+    resp = await get_client().post(
+        "/responses", json=body, headers=upstream_headers(extra)
+    )
+    if resp.status_code >= 400:
+        return JSONResponse(_safe_json(resp), status_code=resp.status_code)
+
+    data = resp.json()
+    if isinstance(data, dict) and isinstance(data.get("usage"), dict):
+        _record_usage(api_key_info, data["usage"], body["model"], "responses")
+    return JSONResponse(data, status_code=resp.status_code)
+```
+
+- [ ] **Step 4: Run the tests to verify they pass**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_responses.py -v --no-cov`
+Expected: All three tests PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add app/api/openai_passthrough/router.py tests/integration/test_openai_passthrough/test_responses.py
+git commit -m "feat(openai-passthrough): /responses endpoint (POST, streaming + non-streaming)"
+```
+
+---
+
+## Task 11: Add Responses CRUD passthrough (GET, DELETE, cancel, input_items)
+
+**Files:**
+- Modify: `app/api/openai_passthrough/router.py`
+- Test: `tests/integration/test_openai_passthrough/test_responses_crud.py`
+
+- [ ] **Step 1: Write the failing tests**
+
+Create `tests/integration/test_openai_passthrough/test_responses_crud.py`:
+
+```python
+"""Integration tests for the Responses CRUD endpoints — pure passthrough."""
+import httpx
+
+
+def test_get_response_forwards_and_returns_body(client, respx_mock, mock_usage_tracker):
+    body = {"id": "r-1", "model": "x", "status": "completed"}
+    respx_mock.get("/responses/r-1").mock(return_value=httpx.Response(200, json=body))
+
+    r = client.get("/openai/v1/responses/r-1", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 200
+    assert r.json() == body
+    # No usage logged for retrieval
+    assert not mock_usage_tracker.record_usage.called
+
+
+def test_delete_response_forwards(client, respx_mock):
+    respx_mock.delete("/responses/r-1").mock(
+        return_value=httpx.Response(200, json={"id": "r-1", "deleted": True})
+    )
+    r = client.delete("/openai/v1/responses/r-1", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 200
+    assert r.json() == {"id": "r-1", "deleted": True}
+
+
+def test_cancel_response_forwards(client, respx_mock):
+    respx_mock.post("/responses/r-1/cancel").mock(
+        return_value=httpx.Response(200, json={"id": "r-1", "status": "cancelled"})
+    )
+    r = client.post("/openai/v1/responses/r-1/cancel", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 200
+    assert r.json()["status"] == "cancelled"
+
+
+def test_list_input_items_forwards(client, respx_mock):
+    body = {"data": [{"id": "msg-1", "role": "user"}], "object": "list"}
+    respx_mock.get("/responses/r-1/input_items").mock(return_value=httpx.Response(200, json=body))
+    r = client.get(
+        "/openai/v1/responses/r-1/input_items",
+        headers={"Authorization": "Bearer sk-test"},
+    )
+    assert r.status_code == 200
+    assert r.json() == body
+
+
+def test_get_response_404_returned_verbatim(client, respx_mock):
+    respx_mock.get("/responses/missing").mock(
+        return_value=httpx.Response(404, json={"error": {"message": "not found"}})
+    )
+    r = client.get("/openai/v1/responses/missing", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 404
+    assert r.json()["error"]["message"] == "not found"
+```
+
+- [ ] **Step 2: Run the tests to verify they fail**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_responses_crud.py -v --no-cov`
+Expected: 404s — endpoints don't exist yet.
+
+- [ ] **Step 3: Add the CRUD endpoints**
+
+In `app/api/openai_passthrough/router.py`, add immediately after `responses_create`:
+
+```python
+async def _passthrough_request(request: Request, path: str) -> Response:
+    """Forward request to upstream and mirror the upstream response."""
+    extra = _passthrough_extra_headers(request)
+    body = None
+    if request.method in ("POST", "PUT", "PATCH"):
+        try:
+            body = await request.json()
+        except Exception:
+            body = None
+    resp = await get_client().request(
+        request.method, path, json=body, headers=upstream_headers(extra)
+    )
+    return Response(
+        content=resp.content,
+        status_code=resp.status_code,
+        media_type=resp.headers.get("content-type"),
+    )
+
+
+@router.api_route("/responses/{response_id}", methods=["GET", "DELETE"])
+async def responses_get_or_delete(
+    response_id: str,
+    request: Request,
+    _: Dict[str, Any] = Depends(get_api_key_info),
+):
+    return await _passthrough_request(request, f"/responses/{response_id}")
+
+
+@router.post("/responses/{response_id}/cancel")
+async def responses_cancel(
+    response_id: str,
+    request: Request,
+    _: Dict[str, Any] = Depends(get_api_key_info),
+):
+    return await _passthrough_request(request, f"/responses/{response_id}/cancel")
+
+
+@router.get("/responses/{response_id}/input_items")
+async def responses_input_items(
+    response_id: str,
+    request: Request,
+    _: Dict[str, Any] = Depends(get_api_key_info),
+):
+    return await _passthrough_request(request, f"/responses/{response_id}/input_items")
+```
+
+- [ ] **Step 4: Run the tests to verify they pass**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_responses_crud.py -v --no-cov`
+Expected: All five tests PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add app/api/openai_passthrough/router.py tests/integration/test_openai_passthrough/test_responses_crud.py
+git commit -m "feat(openai-passthrough): /responses CRUD passthrough (GET, DELETE, cancel, input_items)"
+```
+
+---
+
+## Task 12: Add /models passthrough endpoint
+
+**Files:**
+- Modify: `app/api/openai_passthrough/router.py`
+- Test: `tests/integration/test_openai_passthrough/test_models.py`
+
+- [ ] **Step 1: Write the failing test**
+
+Create `tests/integration/test_openai_passthrough/test_models.py`:
+
+```python
+"""Integration test for GET /openai/v1/models — pure passthrough."""
+import httpx
+
+
+def test_list_models_forwards(client, respx_mock):
+    upstream = {
+        "object": "list",
+        "data": [
+            {"id": "openai.gpt-oss-120b", "object": "model"},
+            {"id": "us.anthropic.claude-sonnet-4-6", "object": "model"},
+        ],
+    }
+    respx_mock.get("/models").mock(return_value=httpx.Response(200, json=upstream))
+
+    r = client.get("/openai/v1/models", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 200
+    assert r.json() == upstream
+```
+
+- [ ] **Step 2: Run the test to verify it fails**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_models.py -v --no-cov`
+Expected: 404 — endpoint doesn't exist.
+
+- [ ] **Step 3: Add the endpoint**
+
+In `app/api/openai_passthrough/router.py`, add at the end:
+
+```python
+@router.get("/models")
+async def list_models(
+    request: Request,
+    _: Dict[str, Any] = Depends(get_api_key_info),
+):
+    return await _passthrough_request(request, "/models")
+```
+
+- [ ] **Step 4: Run the test to verify it passes**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_models.py -v --no-cov`
+Expected: PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add app/api/openai_passthrough/router.py tests/integration/test_openai_passthrough/test_models.py
+git commit -m "feat(openai-passthrough): /models endpoint passthrough"
+```
+
+---
+
+## Task 13: Add Bedrock guardrail header passthrough
+
+The router's `_passthrough_extra_headers` already forwards `X-Amzn-Bedrock-*` headers. This task adds an explicit test so the behavior is locked in.
+
+**Files:**
+- Test: `tests/integration/test_openai_passthrough/test_chat_completions.py` (extend)
+
+- [ ] **Step 1: Append the test**
+
+Append to `tests/integration/test_openai_passthrough/test_chat_completions.py`:
+
+```python
+def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock):
+    """X-Amzn-Bedrock-* headers from the client should reach the upstream call."""
+    route = respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(200, json={
+            "id": "x", "choices": [],
+            "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
+        })
+    )
+    client.post(
+        "/openai/v1/chat/completions",
+        headers={
+            "Authorization": "Bearer sk-test",
+            "X-Amzn-Bedrock-GuardrailIdentifier": "GR12345",
+            "X-Amzn-Bedrock-GuardrailVersion": "DRAFT",
+        },
+        json={"model": "m", "messages": [{"role": "user", "content": "hi"}]},
+    )
+    sent = route.calls[0].request
+    assert sent.headers["x-amzn-bedrock-guardrailidentifier"] == "GR12345"
+    assert sent.headers["x-amzn-bedrock-guardrailversion"] == "DRAFT"
+```
+
+- [ ] **Step 2: Run the test**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/integration/test_openai_passthrough/test_chat_completions.py::test_bedrock_guardrail_headers_are_forwarded -v --no-cov`
+Expected: PASS (the router already forwards these).
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add tests/integration/test_openai_passthrough/test_chat_completions.py
+git commit -m "test(openai-passthrough): pin guardrail header forwarding behavior"
+```
+
+---
+
+## Task 14: Final integration verification + full test suite
+
+**Files:** none
+
+- [ ] **Step 1: Run the full openai_passthrough unit + integration suites**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit/test_openai_passthrough tests/integration/test_openai_passthrough -v --no-cov`
+Expected: All tests PASS (~30 tests).
+
+- [ ] **Step 2: Run the entire unit test suite**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit -q --no-cov`
+Expected: All previously-passing tests still pass.
+
+- [ ] **Step 3: Lint check**
+
+Run: `unset VIRTUAL_ENV && uv run --active ruff check app/api/openai_passthrough app/middleware/auth.py app/db/dynamodb.py`
+Expected: No errors. Fix any issues with `ruff check --fix`.
+
+- [ ] **Step 4: Type check**
+
+Run: `unset VIRTUAL_ENV && uv run --active mypy app/api/openai_passthrough 2>&1 | tail -20`
+Expected: No new errors introduced. Pre-existing project-wide errors are fine — focus only on the new module.
+
+- [ ] **Step 5: If lint/type fixes were needed, commit**
+
+```bash
+git add app/api/openai_passthrough
+git commit -m "chore(openai-passthrough): lint and type cleanup"
+```
+
+(Skip this step if Steps 3 and 4 were already clean.)
+
+---
+
+## Task 15: Documentation updates
+
+**Files:**
+- Modify: `env.example`
+- Modify: `CLAUDE.md`
+- Modify: `docs/architecture/features.md`
+
+- [ ] **Step 1: Update env.example**
+
+Find the existing `ENABLE_OPENAI_COMPAT` block in `env.example` and add a new entry below it:
+
+```
+# OpenAI Passthrough — mount /openai/v1/* endpoints accepting native OpenAI
+# Chat Completions and Responses API requests, forwarded to bedrock-mantle.
+# Independent of ENABLE_OPENAI_COMPAT (the two flags can be enabled together).
+# Reuses OPENAI_API_KEY and OPENAI_BASE_URL.
+ENABLE_OPENAI_PASSTHROUGH=False
+```
+
+- [ ] **Step 2: Update CLAUDE.md**
+
+In `CLAUDE.md`, find the "Features" section (around line 95–110, after "OpenAI-Compatible API"). Add a new bullet:
+
+```
+- **OpenAI Passthrough**: New `/openai/v1/*` endpoints accept OpenAI-native Chat Completions and Responses API requests and forward them to bedrock-mantle. Distinct from `ENABLE_OPENAI_COMPAT` (which routes Anthropic-format requests on `/v1/messages`). Reuses proxy API key auth, rate limits, budgets, and usage tracking. Controlled by `ENABLE_OPENAI_PASSTHROUGH`.
+```
+
+In the "Dual API Mode" section, add a third bullet:
+
+```
+- **OpenAI Passthrough** (any model bedrock-mantle accepts, optional): When `ENABLE_OPENAI_PASSTHROUGH=True`, mounts `/openai/v1/{chat/completions,responses,responses/{id},responses/{id}/input_items,responses/{id}/cancel,models}` for clients that speak the OpenAI format directly.
+```
+
+- [ ] **Step 3: Add detailed feature doc**
+
+Append to `docs/architecture/features.md`:
+
+```markdown
+## OpenAI Passthrough
+
+Adds new `/openai/v1/*` endpoints that accept OpenAI-native API formats and forward them to `bedrock-mantle`. Distinct from `ENABLE_OPENAI_COMPAT` (which converts Anthropic-format requests on `/v1/messages` into OpenAI calls).
+
+### When to use it
+
+- You have client code using the OpenAI Python/JS SDK and want to point it at Bedrock without rewriting.
+- You want stateful conversation chaining via the Responses API (`previous_response_id`, `store=true`).
+- You want the proxy's API key auth, rate limits, budgets, and usage analytics for OpenAI-format traffic too.
+
+### Configuration
+
+```bash
+ENABLE_OPENAI_PASSTHROUGH=True
+OPENAI_API_KEY=<your-bedrock-api-key>
+OPENAI_BASE_URL=https://bedrock-mantle.us-east-1.api.aws/v1
+```
+
+### Endpoints
+
+| Method | Path | Purpose |
+|---|---|---|
+| POST | `/openai/v1/chat/completions` | Chat Completions (streaming + non-streaming) |
+| POST | `/openai/v1/responses` | Responses API (streaming + non-streaming) |
+| GET | `/openai/v1/responses/{id}` | Retrieve stored response |
+| DELETE | `/openai/v1/responses/{id}` | Delete stored response |
+| GET | `/openai/v1/responses/{id}/input_items` | List input items |
+| POST | `/openai/v1/responses/{id}/cancel` | Cancel background response |
+| GET | `/openai/v1/models` | List available models |
+
+### OpenAI SDK example
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="<your-proxy-api-key>",
+    base_url="https://your-proxy.example.com/openai/v1",
+)
+resp = client.chat.completions.create(
+    model="openai.gpt-oss-120b",
+    messages=[{"role": "user", "content": "Hello!"}],
+)
+```
+
+### Auth
+
+Either `Authorization: Bearer <proxy-key>` (OpenAI SDK default) or `x-api-key: <proxy-key>` works. The proxy uses its configured `OPENAI_API_KEY` (Bedrock API key) for the upstream call.
+
+### Model mapping
+
+The existing `anthropic-proxy-model-mapping` table is consulted. If a mapping exists, the client-supplied `model` is replaced before forwarding. If no mapping exists, the model ID is passed through unchanged — so Bedrock-native IDs like `openai.gpt-oss-120b` work without registration.
+
+### Usage tracking
+
+Usage is normalized into the existing `anthropic-proxy-usage` schema. Two new sparse columns are written:
+
+- `api_surface` ∈ `{"messages", "chat_completions", "responses"}`
+- `reasoning_tokens` (already counted in `output_tokens`; stored separately for visibility)
+
+For streaming Chat Completions, clients must set `stream_options: {"include_usage": true}` for usage to be captured. Without it, usage is logged as zero. The Responses API always emits `response.completed` with usage.
+
+### Guardrails
+
+`X-Amzn-Bedrock-*` headers from the client (e.g. `X-Amzn-Bedrock-GuardrailIdentifier`) are forwarded to bedrock-mantle.
+```
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add env.example CLAUDE.md docs/architecture/features.md
+git commit -m "docs(openai-passthrough): document new feature in env.example, CLAUDE.md, and features.md"
+```
+
+---
+
+## Task 16: Final verification
+
+- [ ] **Step 1: Sanity import the app with the flag enabled**
+
+Run:
+```bash
+unset VIRTUAL_ENV && ENABLE_OPENAI_PASSTHROUGH=True uv run --active python -c "
+from app.main import app
+paths = sorted({r.path for r in app.routes})
+expected = [
+    '/openai/v1/chat/completions',
+    '/openai/v1/models',
+    '/openai/v1/responses',
+    '/openai/v1/responses/{response_id}',
+    '/openai/v1/responses/{response_id}/cancel',
+    '/openai/v1/responses/{response_id}/input_items',
+]
+for p in expected:
+    assert p in paths, f'missing {p}; got {paths}'
+print('all routes registered')
+"
+```
+Expected output: `all routes registered`
+
+- [ ] **Step 2: Sanity import with the flag disabled**
+
+Run:
+```bash
+unset VIRTUAL_ENV && ENABLE_OPENAI_PASSTHROUGH=False uv run --active python -c "
+from app.main import app
+paths = {r.path for r in app.routes}
+assert not any(p.startswith('/openai/v1') for p in paths), f'unexpected: {[p for p in paths if p.startswith(\"/openai/v1\")]}'
+print('flag-off cleanly excludes routes')
+"
+```
+Expected output: `flag-off cleanly excludes routes`
+
+- [ ] **Step 3: Final full test suite**
+
+Run: `unset VIRTUAL_ENV && uv run --active pytest tests/unit tests/integration/test_openai_passthrough -q --no-cov`
+Expected: All tests PASS, no failures or errors.
+
+- [ ] **Step 4: Show final git log to confirm commit shape**
+
+Run: `git log --oneline main..HEAD`
+Expected: ~13 commits with `feat(...)`, `test(...)`, `docs(...)`, and possibly `chore(...)` prefixes.
+
+---
+
+## Self-Review Notes
+
+Items I verified before finalizing:
+
+- **Spec coverage:** All 8 implementation steps from the design doc are covered (config flag → auth → client → non-streaming chat → streaming chat → responses POST → responses CRUD → docs). Plus tasks for usage extension, model mapping, /models, guardrails, and final verification.
+- **Type/name consistency:** `normalize_usage`, `try_extract_usage_from_sse`, `resolve_model_id`, `stream_passthrough`, `upstream_headers`, `_passthrough_extra_headers`, `_passthrough_request`, `_record_usage` — all introduced once and referenced consistently.
+- **No placeholders:** Every code step has full code, every test has assertions, every command has expected output.
+- **TDD throughout:** Each task that introduces logic starts with a failing test.
+- **Frequent commits:** 13–14 separate commits, one per task, with conventional-commit prefixes matching the project's existing style.
+- **Open items from design doc:**
+  - OTEL tracing — explicitly deferred (not in any task).
+  - Admin portal `api_surface` filter — explicitly deferred (not in any task).
+  - Guardrails passthrough — included in Task 13 (test pinning the behavior already implemented in Task 8's `_passthrough_extra_headers`).

From 14a0b8fdb1a60deea71e7f9144b18a3ba03dce81 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 05:07:59 +0000
Subject: [PATCH 02/22] docs(design): add OpenAI passthrough endpoints design
 doc

Mounts /openai/v1/* (chat/completions, responses + CRUD, models) as raw
httpx passthrough to bedrock-mantle. Reuses proxy API key auth, rate
limits, budgets, and usage tracking. Independent of ENABLE_OPENAI_COMPAT.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../2026-05-25-openai-passthrough-design.md   | 354 ++++++++++++++++++
 1 file changed, 354 insertions(+)
 create mode 100644 docs/plans/2026-05-25-openai-passthrough-design.md

diff --git a/docs/plans/2026-05-25-openai-passthrough-design.md b/docs/plans/2026-05-25-openai-passthrough-design.md
new file mode 100644
index 0000000..5050c9e
--- /dev/null
+++ b/docs/plans/2026-05-25-openai-passthrough-design.md
@@ -0,0 +1,354 @@
+# OpenAI Passthrough — Design Document
+
+**Status:** Approved (design)
+**Author:** River Xie
+**Date:** 2026-05-25
+
+## Summary
+
+Add new client-facing endpoints that accept OpenAI-native API formats (Chat Completions and Responses) and forward them to AWS Bedrock's `bedrock-mantle` endpoint. Existing Anthropic-format endpoints (`/v1/messages`) are untouched.
+
+This is **distinct from** the existing `ENABLE_OPENAI_COMPAT` feature, which converts Anthropic-format requests on `/v1/messages` into OpenAI calls. The new feature exposes OpenAI-format directly so OpenAI SDK clients can hit the proxy without translation.
+
+## Motivation
+
+- OpenAI SDK users want to access non-Claude Bedrock models (gpt-oss-120b, etc.) through their existing OpenAI SDK code with minimal changes.
+- The Responses API offers stateful conversation chaining (`previous_response_id`, `store=true`) that has no Anthropic equivalent and is awkward to expose through `/v1/messages`.
+- Centralizing all model traffic through one proxy gives unified API key auth, budget tracking, rate limits, usage analytics, and pricing — regardless of wire format.
+
+## Non-Goals
+
+- Cross-format translation: OpenAI-in → Anthropic-out is not a goal. Both directions are OpenAI-format end-to-end on these new endpoints.
+- OpenAI features that bedrock-mantle doesn't support (e.g. assistants API).
+- OTEL tracing for the new endpoints (deferred to v2).
+
+## Design Decisions
+
+The following were resolved during brainstorming:
+
+| # | Decision | Choice |
+|---|---|---|
+| 1 | Integration depth | **Full integration** — same proxy API key, budget, rate limit, usage tracking |
+| 2 | Model scope | **Allow any model bedrock-mantle accepts** (no Claude-block) |
+| 3 | Responses API surface | **Full CRUD** (POST + GET + DELETE + cancel + list_input_items) |
+| 4 | URL routing | **`/openai/v1/...` prefix** (matches AWS bedrock-runtime convention) |
+| 5 | Request handling | **Raw httpx passthrough** (no Pydantic schemas for OpenAI types) |
+| 6 | Model ID mapping | **Apply mapping if exists, else passthrough** |
+| 7 | Usage tracking | **Normalize into existing Anthropic-shaped schema** + new `api_surface` and `reasoning_tokens` columns |
+
+## High-Level Architecture
+
+### Module Layout
+
+```
+app/api/openai_passthrough/
+├── __init__.py          # exposes APIRouter
+├── router.py            # FastAPI routes (chat, responses, models, CRUD)
+├── client.py            # httpx async client to bedrock-mantle (singleton)
+├── usage_extractor.py   # parse usage from JSON body or final SSE event
+└── streaming.py         # SSE passthrough + usage extraction tee
+```
+
+### Mounting
+
+The router mounts at `/openai/v1` only when the feature flag is enabled, in `app/main.py`:
+
+```python
+if settings.enable_openai_passthrough:
+    from app.api.openai_passthrough import router as openai_router
+    app.include_router(openai_router, prefix="/openai/v1", tags=["OpenAI Passthrough"])
+```
+
+### Endpoints
+
+| Method | Path | Notes |
+|---|---|---|
+| POST | `/chat/completions` | Streaming + non-streaming |
+| POST | `/responses` | Streaming + non-streaming + background |
+| GET | `/responses/{response_id}` | Retrieve stored response |
+| DELETE | `/responses/{response_id}` | Delete stored response |
+| GET | `/responses/{response_id}/input_items` | List input items |
+| POST | `/responses/{response_id}/cancel` | Cancel background response |
+| GET | `/models` | List models from Mantle |
+
+### Request Flow (POST chat/completions or responses)
+
+1. `verify_api_key` middleware (extended to read `Authorization: Bearer` + existing `x-api-key`)
+2. Rate limit check (existing token bucket per API key)
+3. Budget check (existing)
+4. Parse request body as `dict` (no Pydantic validation)
+5. Apply model mapping if exists
+6. Forward via httpx to `{OPENAI_BASE_URL}/{path}` with proxy's Bedrock API key in `Authorization`
+7. Stream/return response
+8. Extract usage → log to `anthropic-proxy-usage` + `anthropic-proxy-usage-stats` with `api_surface` column
+
+### Request Flow (CRUD on /responses/{id})
+
+1. `verify_api_key`
+2. Rate limit check (shared bucket with POST)
+3. **Skip** budget check and usage logging (no tokens consumed)
+4. Forward verbatim to Mantle
+5. Return verbatim
+
+## Auth & Middleware Changes
+
+### Header Acceptance
+
+`app/middleware/auth.py::verify_api_key` is extended to accept either `x-api-key` or `Authorization: Bearer`:
+
+```python
+async def verify_api_key(
+    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
+    authorization: Optional[str] = Header(None, alias="Authorization"),
+) -> ApiKeyInfo:
+    api_key = x_api_key
+    if not api_key and authorization and authorization.startswith("Bearer "):
+        api_key = authorization[7:].strip()
+    if not api_key:
+        raise HTTPException(401, "Missing API key (x-api-key or Authorization: Bearer)")
+    # ... existing lookup logic unchanged
+```
+
+This is **backwards compatible**. If both headers are present, `x-api-key` wins (deterministic).
+
+### Rate Limiting
+
+No change. The existing rate limiter is keyed by `api_key_id`; once auth resolves, all endpoints (Anthropic and OpenAI) share the same per-key bucket. A client cannot dodge limits by switching API surfaces.
+
+### Budget
+
+No change. The budget check is per-key and surface-agnostic. The token-consuming POST endpoints (`/chat/completions`, `/responses`) check and update the budget; the CRUD endpoints (GET, DELETE, list input items, and POST cancel) skip both since they consume no tokens.
+
+### Bedrock-mantle Auth (Proxy → AWS)
+
+The proxy uses `OPENAI_API_KEY` (the Bedrock API key, already configured for the existing `ENABLE_OPENAI_COMPAT` feature) as `Authorization: Bearer` to bedrock-mantle:
+
+```python
+headers = {
+    "Authorization": f"Bearer {settings.openai_api_key}",
+    "Content-Type": "application/json",
+}
+```
+
+### Error Contract
+
+Mantle errors (4xx/5xx) are returned to the client **as-is** — same status code, same JSON body. No wrapping or rewriting. This preserves OpenAI-SDK error semantics so `OpenAIError` subclasses raise correctly client-side.
+
+The only proxy-injected errors are:
+- `401` — bad proxy API key
+- `429` — proxy rate limit
+- `402` — budget exceeded
+
+### Proxy-Injected Headers (upstream)
+
+- `User-Agent: bedrock-api-proxy/<version>`
+- `X-Proxy-Request-ID: <uuid>` for log correlation
+
+Both are zero-cost, useful for debugging, and ignored by Mantle.
+
+## Passthrough Client & Streaming
+
+### httpx Client (Singleton)
+
+```python
+# app/api/openai_passthrough/client.py
+import httpx
+from app.core.config import settings
+
+_client: httpx.AsyncClient | None = None
+
+def get_client() -> httpx.AsyncClient:
+    global _client
+    if _client is None:
+        _client = httpx.AsyncClient(
+            base_url=settings.openai_base_url,
+            timeout=httpx.Timeout(settings.bedrock_timeout, connect=10.0),
+            limits=httpx.Limits(max_connections=200, max_keepalive_connections=50),
+        )
+    return _client
+```
+
+### Non-Streaming POST
+
+```python
+async def chat_completions(request: Request, api_key_info: ApiKeyInfo = Depends(verify_api_key)):
+    body = await request.json()
+    body["model"] = resolve_model_id(body.get("model", ""))
+
+    if body.get("stream"):
+        return StreamingResponse(
+            stream_passthrough("/chat/completions", body, api_key_info, api_surface="chat_completions"),
+            media_type="text/event-stream",
+        )
+
+    resp = await get_client().post(
+        "/chat/completions", json=body,
+        headers={"Authorization": f"Bearer {settings.openai_api_key}"},
+    )
+    if resp.status_code >= 400:
+        return JSONResponse(resp.json(), status_code=resp.status_code)
+
+    data = resp.json()
+    log_usage_async(api_key_info, data.get("usage", {}), body["model"], "chat_completions")
+    return JSONResponse(data)
+```
+
+### Streaming Passthrough
+
+For SSE, we forward bytes line-by-line and *tee* to extract the final `usage` chunk.
+
+- **Chat Completions stream**: requires `stream_options: {"include_usage": true}` from the client. If sent, the last data chunk before `data: [DONE]` carries `usage`. If not, no usage is extracted (proxy logs zero — documented behavior).
+- **Responses API stream**: usage is on the `response.completed` event. Always present.
+
+```python
+async def stream_passthrough(path, body, api_key_info, api_surface):
+    usage_holder: dict = {}
+    async with get_client().stream(
+        "POST", path, json=body,
+        headers={"Authorization": f"Bearer {settings.openai_api_key}"},
+    ) as resp:
+        async for raw_line in resp.aiter_lines():
+            yield (raw_line + "\n").encode()
+            try_extract_usage(raw_line, usage_holder, api_surface)
+    if usage_holder:
+        log_usage_async(api_key_info, usage_holder, body["model"], api_surface)
+```
+
+`try_extract_usage` is small (~30 LOC) — pattern matches `data: {...}` lines, JSON-parses, looks for `usage` field on completion events.
+
+### CRUD Endpoints
+
+Pure passthrough — forward method, path, body, query params; return status + body unchanged. ~20 LOC for all four combined (the GET/DELETE pair is shown; `input_items` and `cancel` follow the same pattern):
+
+```python
+@router.api_route("/responses/{response_id}", methods=["GET", "DELETE"])
+async def responses_crud(response_id: str, request: Request, _=Depends(verify_api_key)):
+    resp = await get_client().request(
+        request.method, f"/responses/{response_id}",
+        headers={"Authorization": f"Bearer {settings.openai_api_key}"},
+    )
+    return Response(content=resp.content, status_code=resp.status_code,
+                    media_type=resp.headers.get("content-type"))
+```
+
+### Edge Case — `store=true` and Pricing
+
+Mantle stores conversations for 30 days for free per the docs (no separate storage cost). We do not bill for it. If AWS adds a storage charge later, a feature flag can force `store=false`.
+
+## Usage Tracking
+
+### Normalization
+
+```python
+# app/api/openai_passthrough/usage_extractor.py
+def normalize_usage(raw: dict, api_surface: str) -> dict:
+    """Return Anthropic-shaped usage dict + reasoning_tokens."""
+    if api_surface == "chat_completions":
+        in_tok = raw.get("prompt_tokens", 0)
+        out_tok = raw.get("completion_tokens", 0)
+        cached = raw.get("prompt_tokens_details", {}).get("cached_tokens", 0)
+        reasoning = raw.get("completion_tokens_details", {}).get("reasoning_tokens", 0)
+    else:  # responses
+        in_tok = raw.get("input_tokens", 0)
+        out_tok = raw.get("output_tokens", 0)
+        cached = raw.get("input_tokens_details", {}).get("cached_tokens", 0)
+        reasoning = raw.get("output_tokens_details", {}).get("reasoning_tokens", 0)
+    return {
+        "input_tokens": in_tok - cached,         # subtract: cache hits billed separately
+        "output_tokens": out_tok,                # reasoning already included per spec
+        "cache_read_input_tokens": cached,
+        "cache_creation_input_tokens": 0,        # OpenAI APIs don't expose this
+        "reasoning_tokens": reasoning,           # new optional column
+    }
+```
+
+### DDB Schema Additions
+
+Added to `anthropic-proxy-usage`:
+
+| Field | Type | Default | Notes |
+|---|---|---|---|
+| `api_surface` | string | `"messages"` | One of `messages`, `chat_completions`, `responses` |
+| `reasoning_tokens` | integer | `0` | Optional, sparse |
+
+Both are sparse, non-key attributes. DynamoDB enforces no schema on non-key attributes, so existing rows remain valid. **No migration required**; old rows simply will not have these fields when read.
+
+### Pricing Lookup
+
+Existing `anthropic-proxy-model-pricing` is keyed by Bedrock model ID. After model mapping, we have the Bedrock ID, so pricing works unchanged. Models missing from the pricing table log usage with `cost=0` and emit a warning (existing behavior).
+
+## Configuration
+
+### New Env Var
+
+Added to `app/core/config.py`:
+
+```python
+enable_openai_passthrough: bool = Field(
+    default=False, alias="ENABLE_OPENAI_PASSTHROUGH",
+    description="Mount /openai/v1/* endpoints (Chat Completions + Responses passthrough to bedrock-mantle)"
+)
+```
+
+### Reused Vars
+
+- `OPENAI_API_KEY` — Bedrock API key for bedrock-mantle (already exists)
+- `OPENAI_BASE_URL` — Mantle endpoint URL (already exists)
+
+### Flag Interaction
+
+`ENABLE_OPENAI_COMPAT` (existing) and `ENABLE_OPENAI_PASSTHROUGH` (new) are **independent** and can be enabled together. They affect different endpoints:
+
+- `ENABLE_OPENAI_COMPAT=True`: routes non-Claude traffic on `/v1/messages` through bedrock-mantle (Anthropic↔OpenAI conversion)
+- `ENABLE_OPENAI_PASSTHROUGH=True`: mounts `/openai/v1/*` endpoints (no conversion, raw forward)
+
+## Testing Strategy
+
+### Unit Tests (`tests/unit/test_openai_passthrough/`)
+
+- `test_usage_extractor.py` — normalize chat_completions and responses usage shapes (incl. missing/zero fields, cached tokens, reasoning tokens)
+- `test_model_mapping.py` — passthrough when no mapping exists, substitution when it does
+- `test_auth.py` — `Authorization: Bearer` resolves to API key correctly; both-headers precedence
+
+### Integration Tests (`tests/integration/test_openai_passthrough/`)
+
+`respx` mocks bedrock-mantle. Tests cover:
+
+- POST chat/completions non-streaming → usage logged correctly
+- POST chat/completions streaming with `include_usage=true` → usage logged from the last data chunk before `data: [DONE]`
+- POST chat/completions streaming **without** `include_usage` → request succeeds, usage logged as zero
+- POST responses streaming → usage logged from `response.completed` event
+- POST responses non-streaming → usage logged from response body
+- GET /responses/{id} forwards correctly
+- DELETE /responses/{id} forwards correctly
+- POST /responses/{id}/cancel forwards correctly
+- GET /responses/{id}/input_items forwards correctly
+- 4xx from Mantle returned verbatim (status + body)
+- Rate limit on shared bucket triggers across both surfaces (mix `/v1/messages` and `/openai/v1/chat/completions` traffic)
+- Budget exhaustion blocks POST endpoints but not CRUD endpoints
+
+## Documentation Updates
+
+- `CLAUDE.md` — new "Features" entry: "OpenAI Passthrough"
+- `docs/architecture/features.md` — detailed feature doc with examples
+- `env.example` — new flag with comment
+- `README.md` / `README_ZH.md` — usage example with OpenAI SDK pointing at `/openai/v1`
+
+## Open Items (Deferred)
+
+1. **OTEL tracing** — additive, deferred to v2.
+2. **Admin portal `api_surface` filter** — existing dashboards aggregate fine; add filter when needed.
+3. **Guardrails passthrough** — Mantle Chat Completions supports guardrails via `X-Amzn-Bedrock-GuardrailIdentifier` headers. Recommend whitelisting `X-Amzn-Bedrock-*` headers in the passthrough on initial implementation. Trivial addition (~5 LOC), high value for guardrail-using customers. **Confirm before implementation.**
+
+## Implementation Sequence
+
+Once approved:
+
+1. Schema/config skeleton: feature flag, DDB column additions to usage manager, normalization function with unit tests
+2. Auth middleware extension (`Authorization: Bearer` support) with unit tests
+3. httpx client singleton + non-streaming chat/completions endpoint + integration test
+4. Streaming chat/completions + usage tee + integration test
+5. Responses API POST (streaming + non-streaming) + integration tests
+6. Responses CRUD endpoints (GET, DELETE, cancel, list_input_items) + integration tests
+7. `/models` passthrough endpoint
+8. Documentation updates (CLAUDE.md, features.md, env.example, READMEs)

From b2cfff2551c23e8776675047b1304ca79a5b9afb Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 06:15:21 +0000
Subject: [PATCH 03/22] feat(openai-passthrough): add ENABLE_OPENAI_PASSTHROUGH
 flag and respx dev dep

---
 app/core/config.py |  5 +++++
 pyproject.toml     |  1 +
 uv.lock            | 15 +++++++++++++++
 3 files changed, 21 insertions(+)

diff --git a/app/core/config.py b/app/core/config.py
index 9527b2a..bb66386 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -404,6 +404,11 @@ class Settings(BaseSettings):
         alias="OPENAI_COMPAT_THINKING_MEDIUM_THRESHOLD",
         description="budget_tokens >= this → reasoning effort 'medium', below → 'low'"
     )
+    enable_openai_passthrough: bool = Field(
+        default=False,
+        alias="ENABLE_OPENAI_PASSTHROUGH",
+        description="Mount /openai/v1/* endpoints (Chat Completions + Responses passthrough to bedrock-mantle)"
+    )
 
     # === Multi-Provider Gateway Feature Flags ===
     multi_provider_enabled: bool = Field(
diff --git a/pyproject.toml b/pyproject.toml
index f04858f..800294e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,6 +58,7 @@ dev = [
     "pytest-asyncio>=0.23.0,<2.0.0",
     "pytest-cov>=4.1.0",
     "pytest-mock>=3.12.0",
+    "respx>=0.21.0",
     "httpx>=0.27.0",
     # Property-Based Testing
     "hypothesis>=6.100.0",
diff --git a/uv.lock b/uv.lock
index 9419635..6bf23ec 100644
--- a/uv.lock
+++ b/uv.lock
@@ -210,6 +210,7 @@ dev = [
     { name = "pytest-asyncio" },
     { name = "pytest-cov" },
     { name = "pytest-mock" },
+    { name = "respx" },
     { name = "ruff" },
     { name = "types-boto3" },
 ]
@@ -251,6 +252,7 @@ requires-dist = [
     { name = "python-dotenv", specifier = ">=1.0.0" },
     { name = "python-jose", extras = ["cryptography"], specifier = ">=3.3.0" },
     { name = "python-multipart", specifier = ">=0.0.22" },
+    { name = "respx", marker = "extra == 'dev'", specifier = ">=0.21.0" },
     { name = "routellm", marker = "extra == 'smart-routing'", specifier = ">=0.1.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.3.0" },
     { name = "sentry-sdk", extras = ["fastapi"], marker = "extra == 'monitoring'", specifier = ">=1.40.0" },
@@ -360,6 +362,7 @@ dependencies = [
     { name = "jmespath" },
     { name = "s3transfer" },
 ]
+sdist = { url = "https://files.pythonhosted.org/packages/f1/13/33c8b8704d677fcaf5555ba8c6cc39468fc7b9a0c6b6c496e008cd5557fc/boto3-1.42.76.tar.gz", hash = "sha256:aa2b1973eee8973a9475d24bb579b1dee7176595338d4e4f7880b5c6189b8814", size = 112789, upload-time = "2026-03-25T19:33:25.985Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f0/dc/21b3dfb135125eb7e3a46b9aab0aede847726f239fc8f39474742a87ebb0/boto3-1.42.76-py3-none-any.whl", hash = "sha256:63c6779c814847016b89ae1b72ed968f8a63d80e589ba337511aa6fc1b59585e", size = 140557, upload-time = "2026-03-25T19:33:23.289Z" },
 ]
@@ -2972,6 +2975,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl", hash = "sha256:03ec4409088cd5c66b71ecbbbd27fe2c58ddfad801c66203457b3e6a04868c37", size = 35099, upload-time = "2026-02-19T14:38:03.847Z" },
 ]
 
+[[package]]
+name = "respx"
+version = "0.23.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "httpx" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/43/98/4e55c9c486404ec12373708d015ebce157966965a5ebe7f28ff2c784d41b/respx-0.23.1.tar.gz", hash = "sha256:242dcc6ce6b5b9bf621f5870c82a63997e8e82bc7c947f9ffe272b8f3dd5a780", size = 29243, upload-time = "2026-04-08T14:37:16.008Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1d/4a/221da6ca167db45693d8d26c7dc79ccfc978a440251bf6721c9aaf251ac0/respx-0.23.1-py2.py3-none-any.whl", hash = "sha256:b18004b029935384bccfa6d7d9d74b4ec9af73a081cc28600fffc0447f4b8c1a", size = 25557, upload-time = "2026-04-08T14:37:14.613Z" },
+]
+
 [[package]]
 name = "rfc3339-validator"
 version = "0.1.4"

From e9f43fd2a1b6d9d9df4fd33cdfd6464f425e228f Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 06:20:27 +0000
Subject: [PATCH 04/22] feat(auth): accept Authorization: Bearer alongside
 x-api-key

---
 app/middleware/auth.py                        |   8 +-
 .../unit/test_openai_passthrough/__init__.py  |   0
 .../unit/test_openai_passthrough/test_auth.py | 138 ++++++++++++++++++
 3 files changed, 144 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit/test_openai_passthrough/__init__.py
 create mode 100644 tests/unit/test_openai_passthrough/test_auth.py

diff --git a/app/middleware/auth.py b/app/middleware/auth.py
index 71abb85..a758732 100644
--- a/app/middleware/auth.py
+++ b/app/middleware/auth.py
@@ -59,8 +59,12 @@ async def dispatch(self, request: Request, call_next: Callable):
             request.state.api_key_info = None
             return await call_next(request)
 
-        # Extract API key from header
+        # Extract API key from header (x-api-key first, fall back to Authorization: Bearer)
         api_key = request.headers.get(settings.api_key_header)
+        if not api_key:
+            authz = request.headers.get("Authorization")
+            if authz and authz.startswith("Bearer "):
+                api_key = authz[len("Bearer "):].strip()
 
         if not api_key:
             print(f"[AUTH] Missing API key for {request.url.path}")
@@ -71,7 +75,7 @@ async def dispatch(self, request: Request, call_next: Callable):
                     "type": "error",
                     "error": {
                         "type": "authentication_error",
-                        "message": f"Missing API key in {settings.api_key_header} header",
+                        "message": f"Missing API key in {settings.api_key_header} or Authorization: Bearer header",
                     },
                 },
             )
diff --git a/tests/unit/test_openai_passthrough/__init__.py b/tests/unit/test_openai_passthrough/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_openai_passthrough/test_auth.py b/tests/unit/test_openai_passthrough/test_auth.py
new file mode 100644
index 0000000..df03fdd
--- /dev/null
+++ b/tests/unit/test_openai_passthrough/test_auth.py
@@ -0,0 +1,138 @@
+"""Tests for the auth middleware's Authorization: Bearer support."""
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from starlette.datastructures import Headers
+from starlette.requests import Request
+
+from app.middleware.auth import AuthMiddleware
+
+
+@pytest.mark.asyncio
+async def test_authorization_bearer_resolves_when_xapikey_missing(monkeypatch):
+    """Authorization: Bearer <key> should authenticate when x-api-key is absent."""
+    # Patch settings
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+    monkeypatch.setattr("app.core.config.settings.api_key_header", "x-api-key")
+
+    # Create mock request with Authorization: Bearer header
+    request = MagicMock(spec=Request)
+    request.url.path = "/test"
+    request.headers = Headers({"Authorization": "Bearer sk-abc"})
+    request.state = MagicMock()
+
+    # Mock the API key manager
+    mock_manager = MagicMock()
+    mock_manager.validate_api_key.return_value = {"user_id": "u1", "api_key": "sk-abc"}
+
+    # Mock the call_next
+    mock_call_next = AsyncMock()
+    mock_call_next.return_value = MagicMock(status_code=200)
+
+    # Create middleware with mocked APIKeyManager
+    ddb_client = MagicMock()
+    with patch("app.middleware.auth.APIKeyManager", return_value=mock_manager):
+        middleware = AuthMiddleware(MagicMock(), dynamodb_client=ddb_client)
+
+    # Call dispatch
+    await middleware.dispatch(request, mock_call_next)
+
+    # Verify the API key was extracted and validated
+    mock_manager.validate_api_key.assert_called_once_with("sk-abc")
+    assert request.state.api_key_info == {"user_id": "u1", "api_key": "sk-abc"}
+
+
+@pytest.mark.asyncio
+async def test_xapikey_takes_precedence_when_both_present(monkeypatch):
+    """If both headers are present, x-api-key wins."""
+    # Patch settings
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+    monkeypatch.setattr("app.core.config.settings.api_key_header", "x-api-key")
+
+    # Create mock request with both headers
+    request = MagicMock(spec=Request)
+    request.url.path = "/test"
+    request.headers = Headers({
+        "x-api-key": "sk-from-xapikey",
+        "Authorization": "Bearer sk-from-bearer"
+    })
+    request.state = MagicMock()
+
+    # Mock the API key manager
+    mock_manager = MagicMock()
+    mock_manager.validate_api_key.return_value = {"user_id": "u1", "api_key": "sk-from-xapikey"}
+
+    # Mock the call_next
+    mock_call_next = AsyncMock()
+    mock_call_next.return_value = MagicMock(status_code=200)
+
+    # Create middleware with mocked APIKeyManager
+    ddb_client = MagicMock()
+    with patch("app.middleware.auth.APIKeyManager", return_value=mock_manager):
+        middleware = AuthMiddleware(MagicMock(), dynamodb_client=ddb_client)
+
+    # Call dispatch
+    await middleware.dispatch(request, mock_call_next)
+
+    # Verify x-api-key took precedence
+    mock_manager.validate_api_key.assert_called_once_with("sk-from-xapikey")
+
+
+@pytest.mark.asyncio
+async def test_missing_both_headers_returns_401(monkeypatch):
+    """Missing both headers should return 401."""
+    # Patch settings
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+    monkeypatch.setattr("app.core.config.settings.api_key_header", "x-api-key")
+
+    # Create mock request with no auth headers
+    request = MagicMock(spec=Request)
+    request.url.path = "/test"
+    request.headers = Headers({})
+    request.state = MagicMock()
+
+    # Mock the call_next
+    mock_call_next = AsyncMock()
+
+    # Create middleware with mocked APIKeyManager
+    ddb_client = MagicMock()
+    with patch("app.middleware.auth.APIKeyManager", return_value=MagicMock()):
+        middleware = AuthMiddleware(MagicMock(), dynamodb_client=ddb_client)
+
+    # Call dispatch
+    response = await middleware.dispatch(request, mock_call_next)
+
+    # Verify 401 response
+    assert response.status_code == 401
+
+
+@pytest.mark.asyncio
+async def test_authorization_non_bearer_is_ignored(monkeypatch):
+    """Authorization: Basic ... should not be treated as an API key."""
+    # Patch settings
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+    monkeypatch.setattr("app.core.config.settings.api_key_header", "x-api-key")
+
+    # Create mock request with Basic auth
+    request = MagicMock(spec=Request)
+    request.url.path = "/test"
+    request.headers = Headers({"Authorization": "Basic dXNlcjpwYXNz"})
+    request.state = MagicMock()
+
+    # Mock the call_next
+    mock_call_next = AsyncMock()
+
+    # Create middleware with mocked APIKeyManager
+    ddb_client = MagicMock()
+    with patch("app.middleware.auth.APIKeyManager", return_value=MagicMock()):
+        middleware = AuthMiddleware(MagicMock(), dynamodb_client=ddb_client)
+
+    # Call dispatch
+    response = await middleware.dispatch(request, mock_call_next)
+
+    # Verify 401 response
+    assert response.status_code == 401

From 24d91aee80a002d5e44f1fc0eed89997dcbcf454 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 06:28:21 +0000
Subject: [PATCH 05/22] feat(openai-passthrough): add usage normalization and
 SSE extraction helpers

---
 app/api/openai_passthrough/__init__.py        |   0
 app/api/openai_passthrough/usage_extractor.py |  84 +++++++++++++
 .../test_usage_extractor.py                   | 118 ++++++++++++++++++
 3 files changed, 202 insertions(+)
 create mode 100644 app/api/openai_passthrough/__init__.py
 create mode 100644 app/api/openai_passthrough/usage_extractor.py
 create mode 100644 tests/unit/test_openai_passthrough/test_usage_extractor.py

diff --git a/app/api/openai_passthrough/__init__.py b/app/api/openai_passthrough/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/api/openai_passthrough/usage_extractor.py b/app/api/openai_passthrough/usage_extractor.py
new file mode 100644
index 0000000..96a5f1e
--- /dev/null
+++ b/app/api/openai_passthrough/usage_extractor.py
@@ -0,0 +1,84 @@
+"""Usage extraction and normalization for OpenAI-format responses.
+
+normalize_usage() converts an OpenAI Chat Completions or Responses API usage
+dict into the Anthropic-shaped dict that UsageTracker.record_usage expects,
+plus a separate reasoning_tokens field.
+
+try_extract_usage_from_sse() peeks at SSE lines during streaming and stashes
+the usage dict (raw OpenAI shape) whenever it encounters one, so the last one
+seen wins. The caller later passes that dict through normalize_usage().
+"""
+from __future__ import annotations
+
+import json
+from typing import Any, Dict
+
+
+def normalize_usage(raw: Dict[str, Any], api_surface: str) -> Dict[str, int]:
+    """Map an OpenAI-shaped usage dict onto Anthropic-shaped usage fields.
+
+    api_surface: "chat_completions" reads the prompt_*/completion_* keys; any
+    other value is treated as "responses" and reads input_*/output_* keys.
+    Missing, null, or zero counts all normalize to 0.
+
+    Returns a dict with input_tokens (cache reads removed, floored at 0),
+    output_tokens, cache_read_input_tokens, cache_creation_input_tokens
+    (always 0) and reasoning_tokens.
+    """
+    is_chat = api_surface == "chat_completions"
+    in_key = "prompt_tokens" if is_chat else "input_tokens"
+    out_key = "completion_tokens" if is_chat else "output_tokens"
+
+    total_in = int(raw.get(in_key, 0) or 0)
+    total_out = int(raw.get(out_key, 0) or 0)
+    in_details = raw.get(in_key + "_details") or {}
+    cache_hits = int(in_details.get("cached_tokens", 0) or 0)
+    out_details = raw.get(out_key + "_details") or {}
+    reasoning = int(out_details.get("reasoning_tokens", 0) or 0)
+    # Cache reads are billed separately, so they come out of input_tokens.
+    return {
+        "input_tokens": max(total_in - cache_hits, 0),
+        "output_tokens": total_out,
+        "cache_read_input_tokens": cache_hits,
+        "cache_creation_input_tokens": 0,  # Not exposed by OpenAI-format APIs
+        "reasoning_tokens": reasoning,
+    }
+
+
+def try_extract_usage_from_sse(
+    raw_line: str, holder: Dict[str, Any], api_surface: str
+) -> None:
+    """Inspect an SSE line and, if it carries usage info, store it in holder.
+
+    Mutates `holder` in place. Every usage-bearing event replaces the previous
+    contents, so the last-seen usage wins. Anything else is ignored: non-`data:`
+    lines, `[DONE]`, malformed JSON, and JSON payloads that are not objects.
+
+    api_surface: "chat_completions" reads the chunk's top-level `usage`; any
+    other value reads `response.usage` from a `response.completed` event.
+    """
+    line = raw_line.strip()
+    if not line.startswith("data:"):
+        return
+
+    payload = line[len("data:"):].strip()
+    if not payload or payload == "[DONE]":
+        return
+
+    try:
+        obj = json.loads(payload)
+    except (ValueError, TypeError):
+        return
+    # Valid JSON is not necessarily an object (`data: []`); .get() would raise.
+    if not isinstance(obj, dict):
+        return
+
+    if api_surface == "chat_completions":
+        usage = obj.get("usage")
+    else:  # responses: only the terminal event's nested response is consulted
+        response_obj = obj.get("response") if obj.get("type") == "response.completed" else None
+        usage = response_obj.get("usage") if isinstance(response_obj, dict) else None
+
+    if isinstance(usage, dict):
+        holder.clear()
+        holder.update(usage)
diff --git a/tests/unit/test_openai_passthrough/test_usage_extractor.py b/tests/unit/test_openai_passthrough/test_usage_extractor.py
new file mode 100644
index 0000000..99230a9
--- /dev/null
+++ b/tests/unit/test_openai_passthrough/test_usage_extractor.py
@@ -0,0 +1,118 @@
+"""Tests for normalize_usage and try_extract_usage_from_sse."""
+import json
+
+from app.api.openai_passthrough.usage_extractor import (
+    normalize_usage,
+    try_extract_usage_from_sse,
+)
+
+
+def test_normalize_chat_completions_basic():
+    raw = {"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150}
+    result = normalize_usage(raw, "chat_completions")
+    assert result == {
+        "input_tokens": 100,
+        "output_tokens": 50,
+        "cache_read_input_tokens": 0,
+        "cache_creation_input_tokens": 0,
+        "reasoning_tokens": 0,
+    }
+
+
+def test_normalize_chat_completions_with_cache_and_reasoning():
+    raw = {
+        "prompt_tokens": 100,
+        "completion_tokens": 50,
+        "prompt_tokens_details": {"cached_tokens": 30},
+        "completion_tokens_details": {"reasoning_tokens": 20},
+    }
+    result = normalize_usage(raw, "chat_completions")
+    # cache hits subtracted from input
+    assert result["input_tokens"] == 70
+    assert result["output_tokens"] == 50
+    assert result["cache_read_input_tokens"] == 30
+    assert result["reasoning_tokens"] == 20
+
+
+def test_normalize_responses_basic():
+    raw = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}
+    result = normalize_usage(raw, "responses")
+    assert result["input_tokens"] == 100
+    assert result["output_tokens"] == 50
+    assert result["cache_read_input_tokens"] == 0
+    assert result["reasoning_tokens"] == 0
+
+
+def test_normalize_responses_with_cache_and_reasoning():
+    raw = {
+        "input_tokens": 100,
+        "output_tokens": 50,
+        "input_tokens_details": {"cached_tokens": 25},
+        "output_tokens_details": {"reasoning_tokens": 15},
+    }
+    result = normalize_usage(raw, "responses")
+    assert result["input_tokens"] == 75
+    assert result["output_tokens"] == 50
+    assert result["cache_read_input_tokens"] == 25
+    assert result["reasoning_tokens"] == 15
+
+
+def test_normalize_handles_missing_fields():
+    """An empty usage dict should normalize to all-zeros, not crash."""
+    result = normalize_usage({}, "chat_completions")
+    assert result == {
+        "input_tokens": 0, "output_tokens": 0,
+        "cache_read_input_tokens": 0, "cache_creation_input_tokens": 0,
+        "reasoning_tokens": 0,
+    }
+
+
+def test_extract_chat_completions_usage_from_sse_chunk():
+    """Final chat-completions chunk with usage should be picked up."""
+    line = "data: " + json.dumps({
+        "id": "chatcmpl-1", "choices": [],
+        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
+    })
+    holder: dict = {}
+    try_extract_usage_from_sse(line, holder, "chat_completions")
+    assert holder == {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
+
+
+def test_extract_responses_usage_from_response_completed_event():
+    line = "data: " + json.dumps({
+        "type": "response.completed",
+        "response": {
+            "id": "resp-1",
+            "usage": {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28},
+        },
+    })
+    holder: dict = {}
+    try_extract_usage_from_sse(line, holder, "responses")
+    assert holder == {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28}
+
+
+def test_extract_ignores_non_data_lines():
+    holder: dict = {}
+    try_extract_usage_from_sse("event: response.completed", holder, "responses")
+    try_extract_usage_from_sse("", holder, "responses")
+    try_extract_usage_from_sse(": keepalive", holder, "responses")
+    assert holder == {}
+
+
+def test_extract_ignores_data_done():
+    holder: dict = {}
+    try_extract_usage_from_sse("data: [DONE]", holder, "chat_completions")
+    assert holder == {}
+
+
+def test_extract_ignores_chunks_without_usage():
+    line = "data: " + json.dumps({"choices": [{"delta": {"content": "hi"}}]})
+    holder: dict = {}
+    try_extract_usage_from_sse(line, holder, "chat_completions")
+    assert holder == {}
+
+
+def test_extract_ignores_malformed_json():
+    holder: dict = {}
+    try_extract_usage_from_sse("data: not-json", holder, "chat_completions")
+    assert holder == {}

From e3b8dac163238accfa8cfc1b20beb0fa0979dd09 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 06:31:50 +0000
Subject: [PATCH 06/22] feat(openai-passthrough): add model mapping resolver
 with passthrough fallback

---
 app/api/openai_passthrough/model_mapping.py   | 33 +++++++++++++++++
 .../test_model_mapping.py                     | 37 +++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 app/api/openai_passthrough/model_mapping.py
 create mode 100644 tests/unit/test_openai_passthrough/test_model_mapping.py

diff --git a/app/api/openai_passthrough/model_mapping.py b/app/api/openai_passthrough/model_mapping.py
new file mode 100644
index 0000000..a4c7e63
--- /dev/null
+++ b/app/api/openai_passthrough/model_mapping.py
@@ -0,0 +1,33 @@
+"""Model ID resolution for the OpenAI passthrough endpoints.
+
+Looks up the client-supplied model in the existing model_mapping table; if a
+mapping exists, substitute it. Otherwise, pass through unchanged so callers
+can use Bedrock-native IDs (e.g. ``openai.gpt-oss-120b``) directly without
+needing to register them.
+"""
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_model_id(model: str, model_mapping_manager) -> str:
+    """Translate a client model ID through the mapping table, else echo it back.
+
+    Args:
+        model: The ``model`` field from the client request.
+        model_mapping_manager: Object whose ``get_mapping(model)`` is consulted.
+
+    Returns:
+        The mapped ID when the lookup yields a truthy value; otherwise
+        ``model`` itself (falsy input, no mapping found, or the lookup raised,
+        in which case a warning is logged).
+    """
+    resolved = model
+    if model:
+        try:
+            resolved = model_mapping_manager.get_mapping(model) or model
+        except Exception as exc:
+            logger.warning("[OPENAI-PASSTHROUGH] model mapping lookup failed for %r: %s", model, exc)
+    return resolved
diff --git a/tests/unit/test_openai_passthrough/test_model_mapping.py b/tests/unit/test_openai_passthrough/test_model_mapping.py
new file mode 100644
index 0000000..d148b64
--- /dev/null
+++ b/tests/unit/test_openai_passthrough/test_model_mapping.py
@@ -0,0 +1,37 @@
+"""Tests for resolve_model_id."""
+from unittest.mock import MagicMock
+
+from app.api.openai_passthrough.model_mapping import resolve_model_id
+
+
+def test_returns_mapped_id_when_mapping_exists():
+    manager = MagicMock()
+    manager.get_mapping.return_value = "openai.gpt-oss-120b"
+
+    out = resolve_model_id("gpt-4", manager)
+    assert out == "openai.gpt-oss-120b"
+    manager.get_mapping.assert_called_once_with("gpt-4")
+
+
+def test_passes_through_when_no_mapping_exists():
+    manager = MagicMock()
+    manager.get_mapping.return_value = None
+
+    out = resolve_model_id("openai.gpt-oss-120b", manager)
+    assert out == "openai.gpt-oss-120b"
+
+
+def test_passes_through_empty_string():
+    manager = MagicMock()
+    manager.get_mapping.return_value = None
+
+    assert resolve_model_id("", manager) == ""
+
+
+def test_handles_lookup_exception_by_passing_through():
+    """If DDB lookup raises, fall back to the original ID rather than crashing the request."""
+    manager = MagicMock()
+    manager.get_mapping.side_effect = RuntimeError("ddb down")
+
+    out = resolve_model_id("gpt-4", manager)
+    assert out == "gpt-4"

From 842268453e8b4afce9e605925f833340fc1a470e Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 06:34:25 +0000
Subject: [PATCH 07/22] feat(usage): record api_surface and reasoning_tokens on
 usage rows

---
 app/db/dynamodb.py                            |  9 ++++
 .../test_usage_tracker_extended.py            | 48 +++++++++++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 tests/unit/test_openai_passthrough/test_usage_tracker_extended.py

diff --git a/app/db/dynamodb.py b/app/db/dynamodb.py
index 8094b56..601a418 100644
--- a/app/db/dynamodb.py
+++ b/app/db/dynamodb.py
@@ -918,6 +918,8 @@ def record_usage(
         error_message: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         cache_ttl: Optional[str] = None,
+        api_surface: Optional[str] = None,
+        reasoning_tokens: int = 0,
     ):
         """
         Record API usage.
@@ -934,6 +936,8 @@ def record_usage(
             error_message: Error message if failed
             metadata: Optional metadata
             cache_ttl: Effective cache TTL used ("5m" or "1h"), for billing differentiation
+            api_surface: Source endpoint family ("messages", "chat_completions", or "responses")
+            reasoning_tokens: Reasoning tokens (already counted in output_tokens; stored separately for visibility)
         """
         # Use string timestamp to match CDK table schema (STRING type)
         current_time = int(time.time())
@@ -962,6 +966,11 @@ def record_usage(
         if cache_ttl:
             item["cache_ttl"] = cache_ttl
 
+        if api_surface:
+            item["api_surface"] = api_surface
+        if reasoning_tokens:
+            item["reasoning_tokens"] = reasoning_tokens
+
         # Add TTL if enabled (usage_ttl_days > 0)
         if settings.usage_ttl_days > 0:
             ttl_seconds = settings.usage_ttl_days * 24 * 60 * 60  # Convert days to seconds
diff --git a/tests/unit/test_openai_passthrough/test_usage_tracker_extended.py b/tests/unit/test_openai_passthrough/test_usage_tracker_extended.py
new file mode 100644
index 0000000..a3ac7d7
--- /dev/null
+++ b/tests/unit/test_openai_passthrough/test_usage_tracker_extended.py
@@ -0,0 +1,48 @@
+"""Tests for the api_surface and reasoning_tokens additions to UsageTracker."""
+from unittest.mock import MagicMock
+
+from app.db.dynamodb import UsageTracker
+
+
+def _make_tracker():
+    ddb_client = MagicMock()
+    ddb_client.usage_table_name = "anthropic-proxy-usage"
+    tracker = UsageTracker(ddb_client)
+    tracker.table = MagicMock()
+    return tracker
+
+
+def test_record_usage_writes_api_surface_when_provided():
+    tracker = _make_tracker()
+    tracker.record_usage(
+        api_key="sk-x",
+        request_id="req-1",
+        model="openai.gpt-oss-120b",
+        input_tokens=100,
+        output_tokens=50,
+        api_surface="chat_completions",
+    )
+    item = tracker.table.put_item.call_args.kwargs["Item"]
+    assert item["api_surface"] == "chat_completions"
+
+
+def test_record_usage_writes_reasoning_tokens_when_provided():
+    tracker = _make_tracker()
+    tracker.record_usage(
+        api_key="sk-x", request_id="req-1", model="m",
+        input_tokens=10, output_tokens=5, reasoning_tokens=3,
+    )
+    item = tracker.table.put_item.call_args.kwargs["Item"]
+    assert item["reasoning_tokens"] == 3
+
+
+def test_record_usage_omits_new_fields_when_default():
+    tracker = _make_tracker()
+    tracker.record_usage(
+        api_key="sk-x", request_id="req-1", model="m",
+        input_tokens=10, output_tokens=5,
+    )
+    item = tracker.table.put_item.call_args.kwargs["Item"]
+    # Sparse: not written when caller didn't specify them
+    assert "api_surface" not in item
+    assert "reasoning_tokens" not in item

From c3cf90d23cc572d7b8b816cd72d0ac564e2d2652 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 06:37:54 +0000
Subject: [PATCH 08/22] feat(openai-passthrough): add httpx singleton client
 and header helper

---
 app/api/openai_passthrough/client.py | 44 ++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 app/api/openai_passthrough/client.py

diff --git a/app/api/openai_passthrough/client.py b/app/api/openai_passthrough/client.py
new file mode 100644
index 0000000..93fcb80
--- /dev/null
+++ b/app/api/openai_passthrough/client.py
@@ -0,0 +1,44 @@
+"""Async httpx client to bedrock-mantle, lazily constructed and reused.
+
+Headers are NOT set on the client itself; they're added per-request in the
+router so we can include the proxy's Bedrock API key in Authorization.
+"""
+from __future__ import annotations
+
+import httpx
+
+from app.core.config import settings
+
+_client: httpx.AsyncClient | None = None
+
+
+def get_client() -> httpx.AsyncClient:
+    """Return the module-wide AsyncClient, constructing it on the first call."""
+    global _client
+    if _client is not None:
+        return _client
+    timeouts = httpx.Timeout(settings.bedrock_timeout, connect=10.0)
+    pool = httpx.Limits(max_connections=200, max_keepalive_connections=50)
+    _client = httpx.AsyncClient(base_url=settings.openai_base_url, timeout=timeouts, limits=pool)
+    return _client
+
+
+def reset_client_for_testing() -> None:
+    """Reset the singleton — only call this from test fixtures.
+
+    The old client is dropped without aclose() (async); cleanup is left to GC.
+    """
+    global _client
+    _client = None
+
+
+def upstream_headers(extra: dict[str, str] | None = None) -> dict[str, str]:
+    """Return bearer-auth and JSON headers for an upstream call, plus any ``extra``."""
+    return {
+        "Authorization": f"Bearer {settings.openai_api_key}",
+        "Content-Type": "application/json",
+        "User-Agent": "bedrock-api-proxy/openai-passthrough",
+        # Merged last: a key present in ``extra`` replaces the default of the
+        # same name; a missing or empty ``extra`` contributes nothing.
+        **(extra or {}),
+    }

From 5aac2335191a0b59107cb5946fcabdedd55d8646 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 06:42:16 +0000
Subject: [PATCH 09/22] feat(openai-passthrough): add SSE passthrough generator
 with usage tee

---
 app/api/openai_passthrough/streaming.py | 49 +++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 app/api/openai_passthrough/streaming.py

diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py
new file mode 100644
index 0000000..584c441
--- /dev/null
+++ b/app/api/openai_passthrough/streaming.py
@@ -0,0 +1,49 @@
+"""SSE passthrough with usage tee.
+
+The async generator yields raw response bytes line-by-line so the FastAPI
+StreamingResponse forwards them unchanged. After upstream stream ends, it
+calls the supplied on_complete callback with the captured usage dict so the
+caller can record usage to DynamoDB.
+"""
+from __future__ import annotations
+
+import logging
+from typing import Any, AsyncIterator, Awaitable, Callable, Dict
+
+from app.api.openai_passthrough.client import get_client, upstream_headers
+from app.api.openai_passthrough.usage_extractor import try_extract_usage_from_sse
+
+logger = logging.getLogger(__name__)
+
+
+async def stream_passthrough(
+    method: str,
+    path: str,
+    body: Dict[str, Any] | None,
+    api_surface: str,
+    on_complete: Callable[[Dict[str, Any]], Awaitable[None] | None],
+    extra_headers: Dict[str, str] | None = None,
+) -> AsyncIterator[bytes]:
+    """Forward upstream lines as bytes; call on_complete with usage if any was captured."""
+    usage: Dict[str, Any] = {}
+
+    client = get_client()
+    headers = upstream_headers(extra_headers)
+
+    try:
+        async with client.stream(method, path, json=body, headers=headers) as resp:
+            async for raw_line in resp.aiter_lines():
+                # Lines arrive without their trailing newline; append "\n" so the
+                # SSE framing (including blank separator lines) reaches the client.
+                yield (raw_line + "\n").encode("utf-8")
+                try_extract_usage_from_sse(raw_line, usage, api_surface)
+    except Exception as exc:
+        logger.error("[OPENAI-PASSTHROUGH] upstream stream error: %s", exc)
+        # Re-raise after logging. NOTE(review): likely too late for a 500 mid-stream — confirm.
+        raise
+
+    if usage:
+        result = on_complete(usage)
+        # on_complete may be sync or async; only await when the result is awaitable.
+        if hasattr(result, "__await__"):
+            await result  # type: ignore[func-returns-value]

From 93408d54640f047fce32639676f3c4bb712178bb Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 06:55:31 +0000
Subject: [PATCH 10/22] feat(openai-passthrough): non-streaming
 /chat/completions endpoint

Implements the FastAPI router for OpenAI passthrough, mounts it
conditionally under /openai/v1 when ENABLE_OPENAI_PASSTHROUGH=True,
and adds four integration tests (non-streaming forward, model mapping,
4xx passthrough, and 401 on missing auth).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 app/api/openai_passthrough/__init__.py        |   6 +
 app/api/openai_passthrough/router.py          | 103 ++++++++++++++++++
 app/main.py                                   |   8 ++
 .../test_openai_passthrough/__init__.py       |   0
 .../test_openai_passthrough/conftest.py       |  93 ++++++++++++++++
 .../test_chat_completions.py                  |  88 +++++++++++++++
 6 files changed, 298 insertions(+)
 create mode 100644 app/api/openai_passthrough/router.py
 create mode 100644 tests/integration/test_openai_passthrough/__init__.py
 create mode 100644 tests/integration/test_openai_passthrough/conftest.py
 create mode 100644 tests/integration/test_openai_passthrough/test_chat_completions.py

diff --git a/app/api/openai_passthrough/__init__.py b/app/api/openai_passthrough/__init__.py
index e69de29..21c98c6 100644
--- a/app/api/openai_passthrough/__init__.py
+++ b/app/api/openai_passthrough/__init__.py
@@ -0,0 +1,6 @@
+"""OpenAI Passthrough — accepts OpenAI Chat Completions and Responses API
+calls from clients and forwards them to AWS bedrock-mantle.
+"""
+from app.api.openai_passthrough.router import router
+
+__all__ = ["router"]
diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py
new file mode 100644
index 0000000..1ee9b58
--- /dev/null
+++ b/app/api/openai_passthrough/router.py
@@ -0,0 +1,103 @@
+"""FastAPI routes for the OpenAI passthrough endpoints.
+
+Mounted at /openai/v1 only when settings.enable_openai_passthrough is True.
+"""
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict
+from uuid import uuid4
+
+from fastapi import APIRouter, Depends, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from app.api.openai_passthrough.client import get_client, upstream_headers
+from app.api.openai_passthrough.model_mapping import resolve_model_id
+from app.api.openai_passthrough.streaming import stream_passthrough
+from app.api.openai_passthrough.usage_extractor import normalize_usage
+from app.db.dynamodb import DynamoDBClient, ModelMappingManager, UsageTracker
+from app.middleware.auth import get_api_key_info
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+_ddb: DynamoDBClient | None = None
+_mapping: ModelMappingManager | None = None
+_usage: UsageTracker | None = None
+
+
+def _managers() -> tuple[ModelMappingManager, UsageTracker]:
+    """Build the DDB managers on first call (not at import) and return (mapping, usage)."""
+    global _ddb, _mapping, _usage
+    if _ddb is None or _mapping is None or _usage is None:
+        _ddb = DynamoDBClient()
+        _mapping = ModelMappingManager(_ddb)
+        _usage = UsageTracker(_ddb)
+    return _mapping, _usage
+
+
+def _record_usage(api_key_info: Dict[str, Any], raw_usage: Dict[str, Any], model: str, api_surface: str) -> None:
+    """Record one usage row, best-effort: any failure (incl. normalization) is only logged."""
+    try:
+        _, tracker = _managers()
+        norm = normalize_usage(raw_usage, api_surface)
+        tracker.record_usage(
+            api_key=api_key_info.get("api_key", ""),
+            request_id=str(uuid4()),
+            model=model,
+            input_tokens=norm["input_tokens"], output_tokens=norm["output_tokens"],
+            cached_tokens=norm["cache_read_input_tokens"],
+            cache_write_input_tokens=norm["cache_creation_input_tokens"],
+            api_surface=api_surface,
+            reasoning_tokens=norm["reasoning_tokens"],
+        )
+    except Exception as exc:
+        logger.warning("[OPENAI-PASSTHROUGH] usage recording failed: %s", exc)
+
+
+def _passthrough_extra_headers(request: Request) -> Dict[str, str]:
+    """Collect the client's x-amzn-bedrock-* headers (name matched case-insensitively)."""
+    return {
+        header: content
+        for header, content in request.headers.items()
+        if header.lower().startswith("x-amzn-bedrock-")
+    }
+
+
+@router.post("/chat/completions")
+async def chat_completions(
+    request: Request,
+    api_key_info: Dict[str, Any] = Depends(get_api_key_info),
+):
+    """Forward a Chat Completions request upstream (streamed or JSON) and record usage."""
+    try:
+        body = await request.json()
+    except ValueError:  # malformed JSON from the client
+        body = None
+    if not isinstance(body, dict):  # answer 400 rather than crash on body.get() below
+        err = {"message": "Request body must be a JSON object", "type": "invalid_request_error"}
+        return JSONResponse({"error": err}, status_code=400)
+    mapping, _ = _managers()
+    body["model"] = resolve_model_id(body.get("model", ""), mapping)
+    extra = _passthrough_extra_headers(request)
+    if body.get("stream"):
+        async def on_complete(usage: Dict[str, Any]) -> None:
+            _record_usage(api_key_info, usage, body["model"], "chat_completions")
+        events = stream_passthrough(
+            "POST", "/chat/completions", body, "chat_completions", on_complete, extra)
+        return StreamingResponse(events, media_type="text/event-stream")
+
+    resp = await get_client().post("/chat/completions", json=body, headers=upstream_headers(extra))
+    if resp.status_code >= 400:
+        return JSONResponse(_safe_json(resp), status_code=resp.status_code)
+    data = resp.json()
+    if isinstance(data, dict) and isinstance(data.get("usage"), dict):
+        _record_usage(api_key_info, data["usage"], body["model"], "chat_completions")
+    return JSONResponse(data, status_code=resp.status_code)
+
+
+def _safe_json(resp) -> Dict[str, Any]:
+    try:  # prefer the upstream body parsed as JSON
+        return resp.json()
+    except ValueError:  # body was not JSON: wrap the raw text in an error envelope
+        return {"error": {"message": resp.text, "type": "upstream_error"}}
diff --git a/app/main.py b/app/main.py
index e85e051..c589001 100644
--- a/app/main.py
+++ b/app/main.py
@@ -313,6 +313,14 @@ async def lifespan(app: FastAPI):
     tags=["models"],
 )
 
+if settings.enable_openai_passthrough:
+    from app.api.openai_passthrough import router as openai_passthrough_router
+    app.include_router(
+        openai_passthrough_router,
+        prefix="/openai/v1",
+        tags=["OpenAI Passthrough"],
+    )
+
 
 # Custom HTTPException handler to return proper JSON format
 from fastapi import HTTPException
diff --git a/tests/integration/test_openai_passthrough/__init__.py b/tests/integration/test_openai_passthrough/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/integration/test_openai_passthrough/conftest.py b/tests/integration/test_openai_passthrough/conftest.py
new file mode 100644
index 0000000..0a484c8
--- /dev/null
+++ b/tests/integration/test_openai_passthrough/conftest.py
@@ -0,0 +1,93 @@
+"""Shared fixtures for openai-passthrough integration tests."""
+from unittest.mock import MagicMock, patch
+
+import pytest
+import respx
+from fastapi.testclient import TestClient
+
+
+@pytest.fixture
+def mock_settings(monkeypatch):
+    """Patch settings so the passthrough router mounts and points at a fake mantle."""
+    monkeypatch.setattr("app.core.config.settings.enable_openai_passthrough", True)
+    monkeypatch.setattr("app.core.config.settings.openai_api_key", "bedrock-key-test")
+    monkeypatch.setattr("app.core.config.settings.openai_base_url", "https://mantle.test/v1")
+    monkeypatch.setattr("app.core.config.settings.require_api_key", True)
+    monkeypatch.setattr("app.core.config.settings.master_api_key", "")
+
+
+@pytest.fixture
+def mock_api_key_manager():
+    """Patch APIKeyManager so any non-empty key validates as user 'u1'."""
+    manager = MagicMock()
+    manager.validate_api_key.return_value = {
+        "api_key": "sk-test", "user_id": "u1", "is_master": False,
+        "rate_limit": None, "cache_ttl": None,
+    }
+    with patch("app.middleware.auth.APIKeyManager", return_value=manager):
+        yield manager
+
+
+@pytest.fixture
+def mock_model_mapping_manager():
+    """Patch ModelMappingManager to return None (no mapping) by default."""
+    manager = MagicMock()
+    manager.get_mapping.return_value = None
+    with patch("app.api.openai_passthrough.router.ModelMappingManager", return_value=manager):
+        yield manager
+
+
+@pytest.fixture
+def mock_usage_tracker():
+    tracker = MagicMock()
+    with patch("app.api.openai_passthrough.router.UsageTracker", return_value=tracker):
+        yield tracker
+
+
+@pytest.fixture
+def respx_mock():
+    """respx mock router for httpx calls."""
+    with respx.mock(base_url="https://mantle.test/v1", assert_all_called=False) as router:
+        yield router
+
+
+@pytest.fixture
+def client(mock_settings, mock_api_key_manager, mock_model_mapping_manager, mock_usage_tracker):
+    """FastAPI TestClient with all mocks wired in.
+
+    Imports inside the fixture so module-level settings reads happen after
+    monkeypatching.
+    """
+    import importlib
+
+    # Reset httpx singleton so it picks up the patched base URL
+    from app.api.openai_passthrough.client import reset_client_for_testing
+    reset_client_for_testing()
+
+    # Access the actual router MODULE (not the APIRouter instance) via sys.modules.
+    # We must do this because app/api/openai_passthrough/__init__.py shadows the
+    # submodule name with `from .router import router`, so the attribute
+    # `app.api.openai_passthrough.router` resolves to the APIRouter instance.
+    import sys as _sys
+
+    # Ensure the router module is loaded
+    import app.api.openai_passthrough.router  # noqa: F401  (triggers module load)
+    _router_module = _sys.modules["app.api.openai_passthrough.router"]
+
+    # Reset DDB manager cache so each test gets fresh mock instances
+    _router_module._ddb = None
+    _router_module._mapping = None
+    _router_module._usage = None
+
+    with patch("app.api.openai_passthrough.router.DynamoDBClient", return_value=MagicMock()):
+        # Reload app.main so the conditional router mount re-evaluates with
+        # settings.enable_openai_passthrough=True (set by mock_settings above).
+        import app.main as _main_mod
+        importlib.reload(_main_mod)
+
+        # Reset again after reload (reload may reinitialise globals)
+        _router_module._ddb = None
+        _router_module._mapping = None
+        _router_module._usage = None
+
+        yield TestClient(_main_mod.app)
diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py
new file mode 100644
index 0000000..f3d1482
--- /dev/null
+++ b/tests/integration/test_openai_passthrough/test_chat_completions.py
@@ -0,0 +1,88 @@
+"""Integration tests for POST /openai/v1/chat/completions."""
+import json
+
+import httpx
+
+
+def test_non_streaming_chat_completions_forwards_and_logs_usage(
+    client, respx_mock, mock_usage_tracker, mock_model_mapping_manager
+):
+    upstream_resp = {
+        "id": "chatcmpl-1",
+        "object": "chat.completion",
+        "model": "openai.gpt-oss-120b",
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": "hi"}, "finish_reason": "stop"}],
+        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
+    }
+    route = respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(200, json=upstream_resp)
+    )
+
+    r = client.post(
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={
+            "model": "openai.gpt-oss-120b",
+            "messages": [{"role": "user", "content": "hi"}],
+        },
+    )
+
+    assert r.status_code == 200
+    assert r.json() == upstream_resp
+    assert route.called
+    # Upstream got proxy's Bedrock API key, not the client's proxy key
+    sent = route.calls[0].request
+    assert sent.headers["authorization"] == "Bearer bedrock-key-test"
+    sent_body = json.loads(sent.content)
+    assert sent_body["model"] == "openai.gpt-oss-120b"
+    # Usage was recorded
+    assert mock_usage_tracker.record_usage.called
+    kwargs = mock_usage_tracker.record_usage.call_args.kwargs
+    assert kwargs["input_tokens"] == 10
+    assert kwargs["output_tokens"] == 5
+    assert kwargs["api_surface"] == "chat_completions"
+    assert kwargs["model"] == "openai.gpt-oss-120b"
+
+
+def test_model_mapping_is_applied(
+    client, respx_mock, mock_model_mapping_manager
+):
+    mock_model_mapping_manager.get_mapping.return_value = "openai.gpt-oss-120b"
+    route = respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(200, json={
+            "id": "x", "choices": [], "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2}
+        })
+    )
+
+    client.post(
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "gpt-4", "messages": [{"role": "user", "content": "hi"}]},
+    )
+
+    sent = json.loads(route.calls[0].request.content)
+    assert sent["model"] == "openai.gpt-oss-120b"
+
+
+def test_upstream_4xx_returned_verbatim(client, respx_mock, mock_usage_tracker):
+    err_body = {"error": {"message": "model not found", "type": "invalid_request_error"}}
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(404, json=err_body)
+    )
+
+    r = client.post(
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "no-such-model", "messages": []},
+    )
+    assert r.status_code == 404
+    assert r.json() == err_body
+    assert not mock_usage_tracker.record_usage.called  # Don't log usage on errors
+
+
+def test_missing_auth_returns_401(client):
+    r = client.post(
+        "/openai/v1/chat/completions",
+        json={"model": "x", "messages": []},
+    )
+    assert r.status_code == 401

From 888eabdc827a13bca25fff6bf81b966226c1d7d6 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 07:02:58 +0000
Subject: [PATCH 11/22] test(openai-passthrough): streaming /chat/completions
 integration tests

---
 .../test_chat_completions.py                  | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py
index f3d1482..4298436 100644
--- a/tests/integration/test_openai_passthrough/test_chat_completions.py
+++ b/tests/integration/test_openai_passthrough/test_chat_completions.py
@@ -86,3 +86,70 @@ def test_missing_auth_returns_401(client):
         json={"model": "x", "messages": []},
     )
     assert r.status_code == 401
+
+
+def test_streaming_chat_completions_forwards_sse_and_records_usage(
+    client, respx_mock, mock_usage_tracker
+):
+    """Stream three data chunks plus [DONE]; the last data chunk carries usage."""
+    sse_lines = [
+        'data: {"id":"x","choices":[{"index":0,"delta":{"role":"assistant"}}]}',
+        'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}',
+        'data: {"id":"x","choices":[],"usage":{"prompt_tokens":7,"completion_tokens":2,"total_tokens":9}}',
+        'data: [DONE]',
+    ]
+    body = "\n".join(sse_lines).encode()
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(
+            200, headers={"content-type": "text/event-stream"}, content=body
+        )
+    )
+
+    with client.stream(
+        "POST",
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={
+            "model": "openai.gpt-oss-120b",
+            "messages": [{"role": "user", "content": "hi"}],
+            "stream": True,
+            "stream_options": {"include_usage": True},
+        },
+    ) as r:
+        assert r.status_code == 200
+        out = b"".join(r.iter_bytes())
+
+    # First chunk and the [DONE] terminator were forwarded
+    assert b'"delta":{"role":"assistant"}' in out
+    assert b'[DONE]' in out
+    # Usage recorded from the chunk that had it
+    assert mock_usage_tracker.record_usage.called
+    kw = mock_usage_tracker.record_usage.call_args.kwargs
+    assert kw["input_tokens"] == 7
+    assert kw["output_tokens"] == 2
+    assert kw["api_surface"] == "chat_completions"
+
+
+def test_streaming_chat_completions_without_include_usage_does_not_log(
+    client, respx_mock, mock_usage_tracker
+):
+    """If client doesn't request usage, no usage chunk arrives → no usage logged."""
+    sse_lines = [
+        'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}',
+        'data: [DONE]',
+    ]
+    body = "\n".join(sse_lines).encode()
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(
+            200, headers={"content-type": "text/event-stream"}, content=body
+        )
+    )
+
+    with client.stream(
+        "POST", "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "m", "messages": [], "stream": True},
+    ) as r:
+        list(r.iter_bytes())  # drain
+
+    assert not mock_usage_tracker.record_usage.called

From e091bd7ec82461611c24e825907801ae6e86647e Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 07:07:09 +0000
Subject: [PATCH 12/22] feat(openai-passthrough): /responses endpoint (POST,
 streaming + non-streaming)

---
 app/api/openai_passthrough/router.py          | 30 +++++++
 .../test_openai_passthrough/test_responses.py | 79 +++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 tests/integration/test_openai_passthrough/test_responses.py

diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py
index 1ee9b58..36ab285 100644
--- a/app/api/openai_passthrough/router.py
+++ b/app/api/openai_passthrough/router.py
@@ -96,6 +96,36 @@ async def on_complete(usage: Dict[str, Any]) -> None:
     return JSONResponse(data, status_code=resp.status_code)
 
 
+@router.post("/responses")
+async def responses_create(
+    request: Request,
+    api_key_info: Dict[str, Any] = Depends(get_api_key_info),
+):
+    body = await request.json()
+    mapping, _ = _managers()
+    body["model"] = resolve_model_id(body.get("model", ""), mapping)
+    extra = _passthrough_extra_headers(request)
+
+    if body.get("stream"):
+        async def on_complete(usage: Dict[str, Any]) -> None:
+            _record_usage(api_key_info, usage, body["model"], "responses")
+        return StreamingResponse(
+            stream_passthrough("POST", "/responses", body, "responses", on_complete, extra),
+            media_type="text/event-stream",
+        )
+
+    resp = await get_client().post(
+        "/responses", json=body, headers=upstream_headers(extra)
+    )
+    if resp.status_code >= 400:
+        return JSONResponse(_safe_json(resp), status_code=resp.status_code)
+
+    data = resp.json()
+    if isinstance(data, dict) and isinstance(data.get("usage"), dict):
+        _record_usage(api_key_info, data["usage"], body["model"], "responses")
+    return JSONResponse(data, status_code=resp.status_code)
+
+
 def _safe_json(resp) -> Dict[str, Any]:
     try:
         return resp.json()
diff --git a/tests/integration/test_openai_passthrough/test_responses.py b/tests/integration/test_openai_passthrough/test_responses.py
new file mode 100644
index 0000000..a596fed
--- /dev/null
+++ b/tests/integration/test_openai_passthrough/test_responses.py
@@ -0,0 +1,79 @@
+"""Integration tests for POST /openai/v1/responses (streaming + non-streaming)."""
+import json
+
+import httpx
+
+
+def test_non_streaming_responses_forwards_and_logs_usage(
+    client, respx_mock, mock_usage_tracker
+):
+    upstream = {
+        "id": "resp-1",
+        "object": "response",
+        "model": "openai.gpt-oss-120b",
+        "output": [{"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "hi"}]}],
+        "usage": {"input_tokens": 11, "output_tokens": 4, "total_tokens": 15},
+    }
+    route = respx_mock.post("/responses").mock(return_value=httpx.Response(200, json=upstream))
+
+    r = client.post(
+        "/openai/v1/responses",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "openai.gpt-oss-120b", "input": [{"role": "user", "content": "hi"}]},
+    )
+
+    assert r.status_code == 200
+    assert r.json() == upstream
+    assert route.called
+    kw = mock_usage_tracker.record_usage.call_args.kwargs
+    assert kw["input_tokens"] == 11
+    assert kw["output_tokens"] == 4
+    assert kw["api_surface"] == "responses"
+
+
+def test_streaming_responses_records_usage_from_response_completed(
+    client, respx_mock, mock_usage_tracker
+):
+    sse_lines = [
+        'event: response.created',
+        'data: {"type":"response.created","response":{"id":"r-1"}}',
+        'event: response.output_text.delta',
+        'data: {"type":"response.output_text.delta","delta":"hi"}',
+        'event: response.completed',
+        'data: ' + json.dumps({
+            "type": "response.completed",
+            "response": {"id": "r-1", "usage": {"input_tokens": 12, "output_tokens": 3, "total_tokens": 15}},
+        }),
+    ]
+    body = "\n".join(sse_lines).encode()
+    respx_mock.post("/responses").mock(
+        return_value=httpx.Response(200, headers={"content-type": "text/event-stream"}, content=body)
+    )
+
+    with client.stream(
+        "POST", "/openai/v1/responses",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "openai.gpt-oss-120b", "input": [{"role": "user", "content": "hi"}], "stream": True},
+    ) as r:
+        out = b"".join(r.iter_bytes())
+
+    assert b"response.completed" in out
+    assert b"hi" in out
+    kw = mock_usage_tracker.record_usage.call_args.kwargs
+    assert kw["input_tokens"] == 12
+    assert kw["output_tokens"] == 3
+    assert kw["api_surface"] == "responses"
+
+
+def test_responses_upstream_error_returned_verbatim(client, respx_mock, mock_usage_tracker):
+    respx_mock.post("/responses").mock(
+        return_value=httpx.Response(400, json={"error": {"message": "bad input", "type": "invalid_request_error"}})
+    )
+    r = client.post(
+        "/openai/v1/responses",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "m", "input": []},
+    )
+    assert r.status_code == 400
+    assert r.json()["error"]["message"] == "bad input"
+    assert not mock_usage_tracker.record_usage.called

From 8abab658fc5ffa26c674575c60ea259cbe048506 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 07:13:50 +0000
Subject: [PATCH 13/22] feat(openai-passthrough): /responses CRUD passthrough
 (GET, DELETE, cancel, input_items)

---
 app/api/openai_passthrough/router.py          | 48 ++++++++++++++++-
 .../test_responses_crud.py                    | 51 +++++++++++++++++++
 2 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 tests/integration/test_openai_passthrough/test_responses_crud.py

diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py
index 36ab285..d0135c5 100644
--- a/app/api/openai_passthrough/router.py
+++ b/app/api/openai_passthrough/router.py
@@ -8,7 +8,7 @@
 from typing import Any, Dict
 from uuid import uuid4
 
-from fastapi import APIRouter, Depends, Request
+from fastapi import APIRouter, Depends, Request, Response
 from fastapi.responses import JSONResponse, StreamingResponse
 
 from app.api.openai_passthrough.client import get_client, upstream_headers
@@ -126,6 +126,52 @@ async def on_complete(usage: Dict[str, Any]) -> None:
     return JSONResponse(data, status_code=resp.status_code)
 
 
+async def _passthrough_request(request: Request, path: str) -> Response:
+    """Forward request to upstream and mirror the upstream response."""
+    extra = _passthrough_extra_headers(request)
+    body = None
+    if request.method in ("POST", "PUT", "PATCH"):
+        try:
+            body = await request.json()
+        except Exception:
+            body = None
+    resp = await get_client().request(
+        request.method, path, json=body, headers=upstream_headers(extra)
+    )
+    return Response(
+        content=resp.content,
+        status_code=resp.status_code,
+        media_type=resp.headers.get("content-type"),
+    )
+
+
+@router.api_route("/responses/{response_id}", methods=["GET", "DELETE"])
+async def responses_get_or_delete(
+    response_id: str,
+    request: Request,
+    _: Dict[str, Any] = Depends(get_api_key_info),
+):
+    return await _passthrough_request(request, f"/responses/{response_id}")
+
+
+@router.post("/responses/{response_id}/cancel")
+async def responses_cancel(
+    response_id: str,
+    request: Request,
+    _: Dict[str, Any] = Depends(get_api_key_info),
+):
+    return await _passthrough_request(request, f"/responses/{response_id}/cancel")
+
+
+@router.get("/responses/{response_id}/input_items")
+async def responses_input_items(
+    response_id: str,
+    request: Request,
+    _: Dict[str, Any] = Depends(get_api_key_info),
+):
+    return await _passthrough_request(request, f"/responses/{response_id}/input_items")
+
+
 def _safe_json(resp) -> Dict[str, Any]:
     try:
         return resp.json()
diff --git a/tests/integration/test_openai_passthrough/test_responses_crud.py b/tests/integration/test_openai_passthrough/test_responses_crud.py
new file mode 100644
index 0000000..3692c3b
--- /dev/null
+++ b/tests/integration/test_openai_passthrough/test_responses_crud.py
@@ -0,0 +1,51 @@
+"""Integration tests for the Responses CRUD endpoints — pure passthrough."""
+import httpx
+
+
+def test_get_response_forwards_and_returns_body(client, respx_mock, mock_usage_tracker):
+    body = {"id": "r-1", "model": "x", "status": "completed"}
+    respx_mock.get("/responses/r-1").mock(return_value=httpx.Response(200, json=body))
+
+    r = client.get("/openai/v1/responses/r-1", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 200
+    assert r.json() == body
+    # No usage logged for retrieval
+    assert not mock_usage_tracker.record_usage.called
+
+
+def test_delete_response_forwards(client, respx_mock):
+    respx_mock.delete("/responses/r-1").mock(
+        return_value=httpx.Response(200, json={"id": "r-1", "deleted": True})
+    )
+    r = client.delete("/openai/v1/responses/r-1", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 200
+    assert r.json() == {"id": "r-1", "deleted": True}
+
+
+def test_cancel_response_forwards(client, respx_mock):
+    respx_mock.post("/responses/r-1/cancel").mock(
+        return_value=httpx.Response(200, json={"id": "r-1", "status": "cancelled"})
+    )
+    r = client.post("/openai/v1/responses/r-1/cancel", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 200
+    assert r.json()["status"] == "cancelled"
+
+
+def test_list_input_items_forwards(client, respx_mock):
+    body = {"data": [{"id": "msg-1", "role": "user"}], "object": "list"}
+    respx_mock.get("/responses/r-1/input_items").mock(return_value=httpx.Response(200, json=body))
+    r = client.get(
+        "/openai/v1/responses/r-1/input_items",
+        headers={"Authorization": "Bearer sk-test"},
+    )
+    assert r.status_code == 200
+    assert r.json() == body
+
+
+def test_get_response_404_returned_verbatim(client, respx_mock):
+    respx_mock.get("/responses/missing").mock(
+        return_value=httpx.Response(404, json={"error": {"message": "not found"}})
+    )
+    r = client.get("/openai/v1/responses/missing", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 404
+    assert r.json()["error"]["message"] == "not found"

From bb2b1b33243089dd219873b2a7ca638f8c7983d2 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 07:22:46 +0000
Subject: [PATCH 14/22] feat(openai-passthrough): /models endpoint passthrough

---
 app/api/openai_passthrough/router.py            |  8 ++++++++
 .../test_openai_passthrough/test_models.py      | 17 +++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 tests/integration/test_openai_passthrough/test_models.py

diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py
index d0135c5..1adb3ae 100644
--- a/app/api/openai_passthrough/router.py
+++ b/app/api/openai_passthrough/router.py
@@ -172,6 +172,14 @@ async def responses_input_items(
     return await _passthrough_request(request, f"/responses/{response_id}/input_items")
 
 
+@router.get("/models")
+async def list_models(
+    request: Request,
+    _: Dict[str, Any] = Depends(get_api_key_info),
+):
+    return await _passthrough_request(request, "/models")
+
+
 def _safe_json(resp) -> Dict[str, Any]:
     try:
         return resp.json()
diff --git a/tests/integration/test_openai_passthrough/test_models.py b/tests/integration/test_openai_passthrough/test_models.py
new file mode 100644
index 0000000..0a6d8cd
--- /dev/null
+++ b/tests/integration/test_openai_passthrough/test_models.py
@@ -0,0 +1,17 @@
+"""Integration test for GET /openai/v1/models — pure passthrough."""
+import httpx
+
+
+def test_list_models_forwards(client, respx_mock):
+    upstream = {
+        "object": "list",
+        "data": [
+            {"id": "openai.gpt-oss-120b", "object": "model"},
+            {"id": "us.anthropic.claude-sonnet-4-6", "object": "model"},
+        ],
+    }
+    respx_mock.get("/models").mock(return_value=httpx.Response(200, json=upstream))
+
+    r = client.get("/openai/v1/models", headers={"Authorization": "Bearer sk-test"})
+    assert r.status_code == 200
+    assert r.json() == upstream

From 8048cc3ddbe7549ff51fee5f95e771a84164e29d Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 08:15:02 +0000
Subject: [PATCH 15/22] test(openai-passthrough): pin guardrail header
 forwarding behavior

---
 .../test_chat_completions.py                  | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py
index 4298436..a2e9bdb 100644
--- a/tests/integration/test_openai_passthrough/test_chat_completions.py
+++ b/tests/integration/test_openai_passthrough/test_chat_completions.py
@@ -153,3 +153,25 @@ def test_streaming_chat_completions_without_include_usage_does_not_log(
         list(r.iter_bytes())  # drain
 
     assert not mock_usage_tracker.record_usage.called
+
+
+def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock):
+    """X-Amzn-Bedrock-* headers from the client should reach the upstream call."""
+    route = respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(200, json={
+            "id": "x", "choices": [],
+            "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
+        })
+    )
+    client.post(
+        "/openai/v1/chat/completions",
+        headers={
+            "Authorization": "Bearer sk-test",
+            "X-Amzn-Bedrock-GuardrailIdentifier": "GR12345",
+            "X-Amzn-Bedrock-GuardrailVersion": "DRAFT",
+        },
+        json={"model": "m", "messages": [{"role": "user", "content": "hi"}]},
+    )
+    sent = route.calls[0].request
+    assert sent.headers["x-amzn-bedrock-guardrailidentifier"] == "GR12345"
+    assert sent.headers["x-amzn-bedrock-guardrailversion"] == "DRAFT"

From 6fb932fbe99a026ff3ae9d2a7d49e09d687cca9a Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 08:17:13 +0000
Subject: [PATCH 16/22] chore(openai-passthrough): lint and type cleanup

---
 app/api/openai_passthrough/router.py          | 28 +++++++++----------
 app/api/openai_passthrough/streaming.py       | 13 +++++----
 app/api/openai_passthrough/usage_extractor.py |  6 ++--
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py
index 1adb3ae..159d28e 100644
--- a/app/api/openai_passthrough/router.py
+++ b/app/api/openai_passthrough/router.py
@@ -5,7 +5,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Dict
+from typing import Any, cast
 from uuid import uuid4
 
 from fastapi import APIRouter, Depends, Request, Response
@@ -36,7 +36,7 @@ def _managers() -> tuple[ModelMappingManager, UsageTracker]:
     return _mapping, _usage
 
 
-def _record_usage(api_key_info: Dict[str, Any], raw_usage: Dict[str, Any], model: str, api_surface: str) -> None:
+def _record_usage(api_key_info: dict[str, Any], raw_usage: dict[str, Any], model: str, api_surface: str) -> None:
     _, usage = _managers()
     norm = normalize_usage(raw_usage, api_surface)
     try:
@@ -55,9 +55,9 @@ def _record_usage(api_key_info: Dict[str, Any], raw_usage: Dict[str, Any], model
         logger.warning("[OPENAI-PASSTHROUGH] usage recording failed: %s", exc)
 
 
-def _passthrough_extra_headers(request: Request) -> Dict[str, str]:
+def _passthrough_extra_headers(request: Request) -> dict[str, str]:
     """Forward Bedrock-specific headers from the client to upstream (e.g. guardrails)."""
-    extra: Dict[str, str] = {}
+    extra: dict[str, str] = {}
     for name, value in request.headers.items():
         if name.lower().startswith("x-amzn-bedrock-"):
             extra[name] = value
@@ -67,7 +67,7 @@ def _passthrough_extra_headers(request: Request) -> Dict[str, str]:
 @router.post("/chat/completions")
 async def chat_completions(
     request: Request,
-    api_key_info: Dict[str, Any] = Depends(get_api_key_info),
+    api_key_info: dict[str, Any] = Depends(get_api_key_info),
 ):
     body = await request.json()
     mapping, _ = _managers()
@@ -75,7 +75,7 @@ async def chat_completions(
     extra = _passthrough_extra_headers(request)
 
     if body.get("stream"):
-        async def on_complete(usage: Dict[str, Any]) -> None:
+        async def on_complete(usage: dict[str, Any]) -> None:
             _record_usage(api_key_info, usage, body["model"], "chat_completions")
         return StreamingResponse(
             stream_passthrough(
@@ -99,7 +99,7 @@ async def on_complete(usage: Dict[str, Any]) -> None:
 @router.post("/responses")
 async def responses_create(
     request: Request,
-    api_key_info: Dict[str, Any] = Depends(get_api_key_info),
+    api_key_info: dict[str, Any] = Depends(get_api_key_info),
 ):
     body = await request.json()
     mapping, _ = _managers()
@@ -107,7 +107,7 @@ async def responses_create(
     extra = _passthrough_extra_headers(request)
 
     if body.get("stream"):
-        async def on_complete(usage: Dict[str, Any]) -> None:
+        async def on_complete(usage: dict[str, Any]) -> None:
             _record_usage(api_key_info, usage, body["model"], "responses")
         return StreamingResponse(
             stream_passthrough("POST", "/responses", body, "responses", on_complete, extra),
@@ -149,7 +149,7 @@ async def _passthrough_request(request: Request, path: str) -> Response:
 async def responses_get_or_delete(
     response_id: str,
     request: Request,
-    _: Dict[str, Any] = Depends(get_api_key_info),
+    _: dict[str, Any] = Depends(get_api_key_info),
 ):
     return await _passthrough_request(request, f"/responses/{response_id}")
 
@@ -158,7 +158,7 @@ async def responses_get_or_delete(
 async def responses_cancel(
     response_id: str,
     request: Request,
-    _: Dict[str, Any] = Depends(get_api_key_info),
+    _: dict[str, Any] = Depends(get_api_key_info),
 ):
     return await _passthrough_request(request, f"/responses/{response_id}/cancel")
 
@@ -167,7 +167,7 @@ async def responses_cancel(
 async def responses_input_items(
     response_id: str,
     request: Request,
-    _: Dict[str, Any] = Depends(get_api_key_info),
+    _: dict[str, Any] = Depends(get_api_key_info),
 ):
     return await _passthrough_request(request, f"/responses/{response_id}/input_items")
 
@@ -175,13 +175,13 @@ async def responses_input_items(
 @router.get("/models")
 async def list_models(
     request: Request,
-    _: Dict[str, Any] = Depends(get_api_key_info),
+    _: dict[str, Any] = Depends(get_api_key_info),
 ):
     return await _passthrough_request(request, "/models")
 
 
-def _safe_json(resp) -> Dict[str, Any]:
+def _safe_json(resp) -> dict[str, Any]:
     try:
-        return resp.json()
+        return cast(dict[str, Any], resp.json())
     except ValueError:
         return {"error": {"message": resp.text, "type": "upstream_error"}}
diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py
index 584c441..8030f00 100644
--- a/app/api/openai_passthrough/streaming.py
+++ b/app/api/openai_passthrough/streaming.py
@@ -8,7 +8,8 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, AsyncIterator, Awaitable, Callable, Dict
+from collections.abc import AsyncIterator, Awaitable, Callable
+from typing import Any
 
 from app.api.openai_passthrough.client import get_client, upstream_headers
 from app.api.openai_passthrough.usage_extractor import try_extract_usage_from_sse
@@ -19,13 +20,13 @@
 async def stream_passthrough(
     method: str,
     path: str,
-    body: Dict[str, Any] | None,
+    body: dict[str, Any] | None,
     api_surface: str,
-    on_complete: Callable[[Dict[str, Any]], Awaitable[None] | None],
-    extra_headers: Dict[str, str] | None = None,
+    on_complete: Callable[[dict[str, Any]], Awaitable[None] | None],
+    extra_headers: dict[str, str] | None = None,
 ) -> AsyncIterator[bytes]:
     """Stream upstream response bytes line-by-line; capture usage; trigger callback."""
-    usage: Dict[str, Any] = {}
+    usage: dict[str, Any] = {}
 
     client = get_client()
     headers = upstream_headers(extra_headers)
@@ -46,4 +47,4 @@ async def stream_passthrough(
         result = on_complete(usage)
         # Support both sync and async callbacks
         if hasattr(result, "__await__"):
-            await result  # type: ignore[func-returns-value]
+            await result  # type: ignore[misc]
diff --git a/app/api/openai_passthrough/usage_extractor.py b/app/api/openai_passthrough/usage_extractor.py
index 96a5f1e..0eb8a60 100644
--- a/app/api/openai_passthrough/usage_extractor.py
+++ b/app/api/openai_passthrough/usage_extractor.py
@@ -11,10 +11,10 @@
 from __future__ import annotations
 
 import json
-from typing import Any, Dict
+from typing import Any
 
 
-def normalize_usage(raw: Dict[str, Any], api_surface: str) -> Dict[str, int]:
+def normalize_usage(raw: dict[str, Any], api_surface: str) -> dict[str, int]:
     """Normalize OpenAI-shaped usage into Anthropic-shaped fields.
 
     api_surface: "chat_completions" or "responses"
@@ -46,7 +46,7 @@ def normalize_usage(raw: Dict[str, Any], api_surface: str) -> Dict[str, int]:
 
 
 def try_extract_usage_from_sse(
-    raw_line: str, holder: Dict[str, Any], api_surface: str
+    raw_line: str, holder: dict[str, Any], api_surface: str
 ) -> None:
     """Inspect an SSE line and, if it carries usage info, store it in holder.
 

From d7c7d37c5fd3765df95b26579e66e9f0f8e7f0ab Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 08:23:37 +0000
Subject: [PATCH 17/22] docs(openai-passthrough): document new feature in
 env.example, CLAUDE.md, and features.md

---
 CLAUDE.md                     |  4 ++-
 docs/architecture/features.md | 68 +++++++++++++++++++++++++++++++++++
 env.example                   |  8 +++++
 3 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index c7d00e7..461e8ea 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -40,8 +40,9 @@ black app tests && ruff check app tests && mypy app
 - **InvokeModel API** (Claude models): Native Anthropic format, minimal conversion, full beta feature support
 - **Converse API** (non-Claude models): Requires format conversion, unified API for all Bedrock models
 - **OpenAI Chat Completions API** (non-Claude models, optional): When `ENABLE_OPENAI_COMPAT=True`, non-Claude models use Bedrock's OpenAI-compatible endpoint via bedrock-mantle instead of Converse API
+- **OpenAI Passthrough** (any model bedrock-mantle accepts, optional): When `ENABLE_OPENAI_PASSTHROUGH=True`, mounts `/openai/v1/{chat/completions,responses,responses/{id},models}` for clients that send OpenAI-format requests directly.
 
-**Routing**: If model ID contains "anthropic" or "claude" → InvokeModel; else if `ENABLE_OPENAI_COMPAT` → OpenAI Chat Completions; else → Converse.
+**Routing**: If model ID contains "anthropic" or "claude" → InvokeModel; else if `ENABLE_OPENAI_COMPAT` → OpenAI Chat Completions; else → Converse. OpenAI Passthrough routes are independent and mount at `/openai/v1/*`.
 
 > **Detailed conversion flows, content block mapping, and streaming implementation**: see [docs/architecture/detailed-flows.md](docs/architecture/detailed-flows.md)
 
@@ -104,6 +105,7 @@ Each feature has detailed docs in [docs/architecture/features.md](docs/architect
 - **OpenTelemetry Tracing**: OTEL GenAI semantic conventions, session-based trace grouping. Zero overhead when disabled.
 - **Admin Portal**: Separate FastAPI app for API key/usage/pricing management with Cognito auth.
 - **OpenAI-Compatible API**: Non-Claude models can optionally use Bedrock's OpenAI Chat Completions API via bedrock-mantle endpoint instead of Converse API. Controlled by `ENABLE_OPENAI_COMPAT` flag. Maps `thinking` to OpenAI `reasoning` with configurable effort thresholds.
+- **OpenAI Passthrough**: New `/openai/v1/*` endpoints accept OpenAI-native Chat Completions and Responses API requests and forward them to bedrock-mantle. Distinct from `ENABLE_OPENAI_COMPAT` (which routes Anthropic-format requests on `/v1/messages`). Reuses proxy API key auth, rate limits, budgets, and usage tracking. Controlled by `ENABLE_OPENAI_PASSTHROUGH`.
 
 ## Common Development Tasks
 
diff --git a/docs/architecture/features.md b/docs/architecture/features.md
index af6d16b..e9f2b5d 100644
--- a/docs/architecture/features.md
+++ b/docs/architecture/features.md
@@ -419,3 +419,71 @@ The admin portal is a separate FastAPI application for managing API keys, usage,
 ### Production Deployment
 
 In production (ECS), the admin portal frontend is served as static files from the main proxy at `/admin/`, with API calls proxied to the backend.
+
+---
+
+## OpenAI Passthrough
+
+Adds new `/openai/v1/*` endpoints that accept OpenAI-native API formats and forward them to `bedrock-mantle`. Distinct from `ENABLE_OPENAI_COMPAT` (which converts Anthropic-format requests on `/v1/messages` into OpenAI calls).
+
+### When to use it
+
+- You have client code using the OpenAI Python/JS SDK and want to point it at Bedrock without rewriting.
+- You want stateful conversation chaining via the Responses API (`previous_response_id`, `store=true`).
+- You want the proxy's API key auth, rate limits, budgets, and usage analytics for OpenAI-format traffic too.
+
+### Configuration
+
+```bash
+ENABLE_OPENAI_PASSTHROUGH=True
+OPENAI_API_KEY=<your-bedrock-api-key>
+OPENAI_BASE_URL=https://bedrock-mantle.us-east-1.api.aws/v1
+```
+
+### Endpoints
+
+| Method | Path | Purpose |
+|---|---|---|
+| POST | `/openai/v1/chat/completions` | Chat Completions (streaming + non-streaming) |
+| POST | `/openai/v1/responses` | Responses API (streaming + non-streaming) |
+| GET | `/openai/v1/responses/{id}` | Retrieve stored response |
+| DELETE | `/openai/v1/responses/{id}` | Delete stored response |
+| GET | `/openai/v1/responses/{id}/input_items` | List input items |
+| POST | `/openai/v1/responses/{id}/cancel` | Cancel background response |
+| GET | `/openai/v1/models` | List available models |
+
+### OpenAI SDK example
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="<your-proxy-api-key>",
+    base_url="https://your-proxy.example.com/openai/v1",
+)
+resp = client.chat.completions.create(
+    model="openai.gpt-oss-120b",
+    messages=[{"role": "user", "content": "Hello!"}],
+)
+```
+
+### Auth
+
+Either `Authorization: Bearer <proxy-key>` (OpenAI SDK default) or `x-api-key: <proxy-key>` works. The proxy uses its configured `OPENAI_API_KEY` (Bedrock API key) for the upstream call.
+
+### Model mapping
+
+The existing `anthropic-proxy-model-mapping` table is consulted. If a mapping exists, the client-supplied `model` is replaced before forwarding. If no mapping exists, the model ID is passed through unchanged — so Bedrock-native IDs like `openai.gpt-oss-120b` work without registration.
+
+### Usage tracking
+
+Usage is normalized into the existing `anthropic-proxy-usage` schema. Two new sparse columns are written:
+
+- `api_surface` ∈ `{"messages", "chat_completions", "responses"}`
+- `reasoning_tokens` (already counted in `output_tokens`; stored separately for visibility)
+
+For streaming Chat Completions, clients must set `stream_options: {"include_usage": true}` for usage to be captured. Without it, the stream carries no usage event and no usage record is written for the request. The Responses API always emits `response.completed` with usage.
+
+### Guardrails
+
+`X-Amzn-Bedrock-*` headers from the client (e.g. `X-Amzn-Bedrock-GuardrailIdentifier`) are forwarded to bedrock-mantle.
diff --git a/env.example b/env.example
index 29029de..93733ea 100644
--- a/env.example
+++ b/env.example
@@ -104,6 +104,14 @@ DEFAULT_SERVICE_TIER=default
 # OPENAI_COMPAT_THINKING_HIGH_THRESHOLD=10000
 # OPENAI_COMPAT_THINKING_MEDIUM_THRESHOLD=4000
 
+# ===========================================
+# OpenAI Passthrough — mount /openai/v1/* endpoints accepting native OpenAI
+# Chat Completions and Responses API requests, forwarded to bedrock-mantle.
+# Independent of ENABLE_OPENAI_COMPAT (the two flags can be enabled together).
+# Reuses OPENAI_API_KEY and OPENAI_BASE_URL.
+# ===========================================
+# ENABLE_OPENAI_PASSTHROUGH=False
+
 # ===========================================
 # OpenTelemetry Tracing
 # ===========================================

From fc827c76cba807b612142fb6acafec7c27da7f78 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 08:44:25 +0000
Subject: [PATCH 18/22] fix(openai-passthrough): yield structured SSE error on
 upstream timeout; add flag-off and timeout tests

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/api/openai_passthrough/streaming.py       | 18 ++++++++-
 .../test_chat_completions.py                  | 22 +++++++++++
 .../test_openai_passthrough/test_flag_off.py  | 37 +++++++++++++++++++
 3 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 tests/integration/test_openai_passthrough/test_flag_off.py

diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py
index 8030f00..220903b 100644
--- a/app/api/openai_passthrough/streaming.py
+++ b/app/api/openai_passthrough/streaming.py
@@ -7,10 +7,13 @@
 """
 from __future__ import annotations
 
+import json
 import logging
 from collections.abc import AsyncIterator, Awaitable, Callable
 from typing import Any
 
+import httpx
+
 from app.api.openai_passthrough.client import get_client, upstream_headers
 from app.api.openai_passthrough.usage_extractor import try_extract_usage_from_sse
 
@@ -38,9 +41,22 @@ async def stream_passthrough(
                 # framing byte so the SSE body is well-formed for the downstream client.
                 yield (raw_line + "\n").encode("utf-8")
                 try_extract_usage_from_sse(raw_line, usage, api_surface)
+    except (httpx.RequestError, httpx.TimeoutException) as exc:
+        # Upstream connection/timeout failure during streaming. OpenAI SDK clients
+        # expect a clean SSE termination, not an abruptly closed stream.
+        logger.error("[OPENAI-PASSTHROUGH] upstream stream connection error: %s", exc)
+        err = {
+            "error": {
+                "message": f"upstream connection failed: {type(exc).__name__}",
+                "type": "upstream_error",
+            }
+        }
+        yield ("data: " + json.dumps(err) + "\n\n").encode("utf-8")
+        yield b"data: [DONE]\n\n"
+        return
     except Exception as exc:
+        # Unexpected error — re-raise so FastAPI can convert to 500.
         logger.error("[OPENAI-PASSTHROUGH] upstream stream error: %s", exc)
-        # Re-raise so FastAPI can return a 500; downstream client sees the stream end.
         raise
 
     if usage:
diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py
index a2e9bdb..b26965c 100644
--- a/tests/integration/test_openai_passthrough/test_chat_completions.py
+++ b/tests/integration/test_openai_passthrough/test_chat_completions.py
@@ -175,3 +175,25 @@ def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock):
     sent = route.calls[0].request
     assert sent.headers["x-amzn-bedrock-guardrailidentifier"] == "GR12345"
     assert sent.headers["x-amzn-bedrock-guardrailversion"] == "DRAFT"
+
+
+def test_streaming_upstream_timeout_yields_clean_sse_error(
+    client, respx_mock, mock_usage_tracker
+):
+    """Upstream timeout during streaming should produce a structured SSE error event, not crash the stream."""
+    respx_mock.post("/chat/completions").mock(
+        side_effect=httpx.ReadTimeout("upstream took too long")
+    )
+
+    with client.stream(
+        "POST",
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "m", "messages": [], "stream": True},
+    ) as r:
+        out = b"".join(r.iter_bytes())
+
+    assert b'"upstream_error"' in out, f"expected structured error, got: {out}"
+    assert b"[DONE]" in out
+    # No usage logged when the stream errored before any usage event arrived
+    assert not mock_usage_tracker.record_usage.called
diff --git a/tests/integration/test_openai_passthrough/test_flag_off.py b/tests/integration/test_openai_passthrough/test_flag_off.py
new file mode 100644
index 0000000..da8d816
--- /dev/null
+++ b/tests/integration/test_openai_passthrough/test_flag_off.py
@@ -0,0 +1,37 @@
+"""Verify that /openai/v1/* paths return 404 when ENABLE_OPENAI_PASSTHROUGH is off."""
+import importlib
+
+from fastapi.testclient import TestClient
+
+
+def test_flag_off_returns_404(monkeypatch):
+    """With ENABLE_OPENAI_PASSTHROUGH=False, /openai/v1/* paths must not exist."""
+    monkeypatch.setattr("app.core.config.settings.enable_openai_passthrough", False)
+    monkeypatch.setattr("app.core.config.settings.require_api_key", False)
+
+    # Reload main so the conditional mount re-evaluates with the flag off.
+    import app.main as _main
+    importlib.reload(_main)
+
+    client = TestClient(_main.app)
+    r = client.post(
+        "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "x", "messages": []},
+    )
+    assert r.status_code == 404, f"expected 404 with flag off, got {r.status_code}"
+
+    r = client.get("/openai/v1/models")
+    assert r.status_code == 404
+
+
+def test_flag_off_does_not_register_routes(monkeypatch):
+    """Programmatic verification: no route paths under /openai/v1 when flag is off."""
+    monkeypatch.setattr("app.core.config.settings.enable_openai_passthrough", False)
+
+    import app.main as _main
+    importlib.reload(_main)
+
+    extra = [getattr(r, "path", "") for r in _main.app.routes
+             if getattr(r, "path", "").startswith("/openai/v1")]
+    assert not extra, f"unexpected routes registered: {extra}"

From 20b58971faf88335c002d29328a73ebf08983b88 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 12:30:19 +0000
Subject: [PATCH 19/22] fix(openai-passthrough): build absolute upstream URLs
 to preserve /v1 base path

httpx follows RFC 3986 path-merging on AsyncClient.base_url: a request path
starting with `/` REPLACES the base_url's path entirely. With
OPENAI_BASE_URL=https://bedrock-mantle.us-west-2.api.aws/v1, calls like
`client.post("/chat/completions")` were being sent to
`bedrock-mantle.us-west-2.api.aws/chat/completions` (no `/v1`), causing
404s in production.

Fix:
- Drop base_url from the AsyncClient
- Add upstream_url(path) that explicitly joins OPENAI_BASE_URL + path
- Use upstream_url() everywhere we previously passed bare paths
- Add unit tests covering leading-slash, trailing-slash, and ID-in-path
  cases that would have caught this

Integration tests passed previously because respx joins base_url + path
intuitively; only real httpx exhibits the RFC 3986 replacement behaviour.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/api/openai_passthrough/client.py          | 27 +++++++-
 app/api/openai_passthrough/router.py          |  8 +--
 app/api/openai_passthrough/streaming.py       |  4 +-
 .../test_client_url.py                        | 63 +++++++++++++++++++
 4 files changed, 95 insertions(+), 7 deletions(-)
 create mode 100644 tests/unit/test_openai_passthrough/test_client_url.py

diff --git a/app/api/openai_passthrough/client.py b/app/api/openai_passthrough/client.py
index 93fcb80..04a9479 100644
--- a/app/api/openai_passthrough/client.py
+++ b/app/api/openai_passthrough/client.py
@@ -2,6 +2,15 @@
 
 Headers are NOT set on the client itself; they're added per-request in the
 router so we can include the proxy's Bedrock API key in Authorization.
+
+URL building note: we deliberately do NOT set ``base_url`` on the AsyncClient.
+httpx follows RFC 3986 path-merging, which means a request path starting with
+``/`` REPLACES the path component of the base_url. With
+``OPENAI_BASE_URL=https://bedrock-mantle.us-west-2.api.aws/v1``, calling
+``client.post("/chat/completions")`` would produce
+``https://bedrock-mantle.us-west-2.api.aws/chat/completions`` (the ``/v1`` is
+dropped). To avoid this footgun we build full URLs explicitly via
+``upstream_url(path)``.
 """
 from __future__ import annotations
 
@@ -16,7 +25,6 @@ def get_client() -> httpx.AsyncClient:
     global _client
     if _client is None:
         _client = httpx.AsyncClient(
-            base_url=settings.openai_base_url,
             timeout=httpx.Timeout(settings.bedrock_timeout, connect=10.0),
             limits=httpx.Limits(max_connections=200, max_keepalive_connections=50),
         )
@@ -32,6 +40,23 @@ def reset_client_for_testing() -> None:
         _client = None
 
 
+def upstream_url(path: str) -> str:
+    """Build a full upstream URL by appending ``path`` to ``OPENAI_BASE_URL``.
+
+    Avoids httpx's RFC 3986 path-replacement behaviour by always producing a
+    fully-qualified URL.
+
+    Examples:
+        OPENAI_BASE_URL=https://bedrock-mantle.us-west-2.api.aws/v1
+        upstream_url("/chat/completions")  -> https://bedrock-mantle.us-west-2.api.aws/v1/chat/completions
+        upstream_url("models")             -> https://bedrock-mantle.us-west-2.api.aws/v1/models
+    """
+    base = settings.openai_base_url.rstrip("/")
+    if not path.startswith("/"):
+        path = "/" + path
+    return base + path
+
+
 def upstream_headers(extra: dict[str, str] | None = None) -> dict[str, str]:
     """Build the Authorization + standard headers for an upstream call."""
     headers = {
diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py
index 159d28e..a505f49 100644
--- a/app/api/openai_passthrough/router.py
+++ b/app/api/openai_passthrough/router.py
@@ -11,7 +11,7 @@
 from fastapi import APIRouter, Depends, Request, Response
 from fastapi.responses import JSONResponse, StreamingResponse
 
-from app.api.openai_passthrough.client import get_client, upstream_headers
+from app.api.openai_passthrough.client import get_client, upstream_headers, upstream_url
 from app.api.openai_passthrough.model_mapping import resolve_model_id
 from app.api.openai_passthrough.streaming import stream_passthrough
 from app.api.openai_passthrough.usage_extractor import normalize_usage
@@ -85,7 +85,7 @@ async def on_complete(usage: dict[str, Any]) -> None:
         )
 
     resp = await get_client().post(
-        "/chat/completions", json=body, headers=upstream_headers(extra)
+        upstream_url("/chat/completions"), json=body, headers=upstream_headers(extra)
     )
     if resp.status_code >= 400:
         return JSONResponse(_safe_json(resp), status_code=resp.status_code)
@@ -115,7 +115,7 @@ async def on_complete(usage: dict[str, Any]) -> None:
         )
 
     resp = await get_client().post(
-        "/responses", json=body, headers=upstream_headers(extra)
+        upstream_url("/responses"), json=body, headers=upstream_headers(extra)
     )
     if resp.status_code >= 400:
         return JSONResponse(_safe_json(resp), status_code=resp.status_code)
@@ -136,7 +136,7 @@ async def _passthrough_request(request: Request, path: str) -> Response:
         except Exception:
             body = None
     resp = await get_client().request(
-        request.method, path, json=body, headers=upstream_headers(extra)
+        request.method, upstream_url(path), json=body, headers=upstream_headers(extra)
     )
     return Response(
         content=resp.content,
diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py
index 220903b..883cb49 100644
--- a/app/api/openai_passthrough/streaming.py
+++ b/app/api/openai_passthrough/streaming.py
@@ -14,7 +14,7 @@
 
 import httpx
 
-from app.api.openai_passthrough.client import get_client, upstream_headers
+from app.api.openai_passthrough.client import get_client, upstream_headers, upstream_url
 from app.api.openai_passthrough.usage_extractor import try_extract_usage_from_sse
 
 logger = logging.getLogger(__name__)
@@ -35,7 +35,7 @@ async def stream_passthrough(
     headers = upstream_headers(extra_headers)
 
     try:
-        async with client.stream(method, path, json=body, headers=headers) as resp:
+        async with client.stream(method, upstream_url(path), json=body, headers=headers) as resp:
             async for raw_line in resp.aiter_lines():
                 # Upstream gives us SSE lines without trailing newlines; restore the
                 # framing byte so the SSE body is well-formed for the downstream client.
diff --git a/tests/unit/test_openai_passthrough/test_client_url.py b/tests/unit/test_openai_passthrough/test_client_url.py
new file mode 100644
index 0000000..0d1f9b3
--- /dev/null
+++ b/tests/unit/test_openai_passthrough/test_client_url.py
@@ -0,0 +1,63 @@
+"""Tests for upstream_url — guards against the httpx RFC 3986 path-replacement footgun.
+
+If you ever set ``base_url`` on the AsyncClient and pass a leading-slash path,
+httpx will silently drop the ``/v1`` from the base. This test family asserts
+that ``upstream_url`` always produces a fully-qualified URL with both the
+configured ``OPENAI_BASE_URL`` path AND the request path joined intact.
+"""
+from app.api.openai_passthrough.client import upstream_url
+
+
+def test_includes_base_url_path_with_leading_slash(monkeypatch):
+    monkeypatch.setattr(
+        "app.core.config.settings.openai_base_url",
+        "https://bedrock-mantle.us-west-2.api.aws/v1",
+    )
+    # The bug: with httpx base_url=".../v1", "/chat/completions" would drop "/v1".
+    # upstream_url must keep both segments.
+    assert (
+        upstream_url("/chat/completions")
+        == "https://bedrock-mantle.us-west-2.api.aws/v1/chat/completions"
+    )
+
+
+def test_includes_base_url_path_without_leading_slash(monkeypatch):
+    monkeypatch.setattr(
+        "app.core.config.settings.openai_base_url",
+        "https://bedrock-mantle.us-west-2.api.aws/v1",
+    )
+    assert (
+        upstream_url("models")
+        == "https://bedrock-mantle.us-west-2.api.aws/v1/models"
+    )
+
+
+def test_strips_trailing_slash_from_base(monkeypatch):
+    monkeypatch.setattr(
+        "app.core.config.settings.openai_base_url",
+        "https://bedrock-mantle.us-west-2.api.aws/v1/",
+    )
+    assert (
+        upstream_url("/responses")
+        == "https://bedrock-mantle.us-west-2.api.aws/v1/responses"
+    )
+
+
+def test_works_with_response_id_in_path(monkeypatch):
+    monkeypatch.setattr(
+        "app.core.config.settings.openai_base_url",
+        "https://bedrock-mantle.us-west-2.api.aws/v1",
+    )
+    assert (
+        upstream_url("/responses/resp-123/cancel")
+        == "https://bedrock-mantle.us-west-2.api.aws/v1/responses/resp-123/cancel"
+    )
+
+
+def test_works_with_base_url_no_path_segment(monkeypatch):
+    """Some clients might point at a domain root; still produce a sensible URL."""
+    monkeypatch.setattr(
+        "app.core.config.settings.openai_base_url",
+        "https://example.com",
+    )
+    assert upstream_url("/models") == "https://example.com/models"

From c88eb51bd37ad602c703e0d0425d65ee536413b5 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 12:41:31 +0000
Subject: [PATCH 20/22] fix(cdk): propagate ENABLE_OPENAI_PASSTHROUGH env var
 to ECS task definition

The previous deploy mounted the new /openai/v1/* code but the CDK never
passed ENABLE_OPENAI_PASSTHROUGH through to the container, so the
conditional router mount at app/main.py evaluated False and the routes
weren't registered. Add support symmetrical to enableOpenaiCompat:

- AppConfig: new enableOpenaiPassthrough field
- config defaults: true in both environment blocks touched by this diff (ship
  the feature on by default)
- env-var override: ENABLE_OPENAI_PASSTHROUGH at deploy time
- ECS task env: emit ENABLE_OPENAI_PASSTHROUGH=<value>

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cdk/config/config.ts | 9 +++++++++
 cdk/lib/ecs-stack.ts | 1 +
 2 files changed, 10 insertions(+)

diff --git a/cdk/config/config.ts b/cdk/config/config.ts
index cc7b665..c064926 100644
--- a/cdk/config/config.ts
+++ b/cdk/config/config.ts
@@ -90,6 +90,7 @@ export interface EnvironmentConfig {
 
   // OpenAI-Compatible API (Bedrock Mantle) Configuration
   enableOpenaiCompat: boolean;
+  enableOpenaiPassthrough: boolean;          // Mount /openai/v1/* passthrough endpoints
   openaiBaseUrl?: string;                    // e.g., https://bedrock-mantle.us-east-1.api.aws/v1
 
   // Admin Portal Configuration
@@ -196,6 +197,7 @@ export const environments: { [key: string]: EnvironmentConfigWithoutRuntime } =
 
     // OpenAI-Compatible API (Bedrock Mantle)
     enableOpenaiCompat: false,
+    enableOpenaiPassthrough: true,
     // openaiBaseUrl: 'https://bedrock-mantle.us-east-1.api.aws/v1',
 
     // Admin Portal
@@ -300,6 +302,7 @@ export const environments: { [key: string]: EnvironmentConfigWithoutRuntime } =
 
     // OpenAI-Compatible API (Bedrock Mantle)
     enableOpenaiCompat: false,
+    enableOpenaiPassthrough: true,
     // openaiBaseUrl: 'https://bedrock-mantle.us-east-1.api.aws/v1',
 
     // Admin Portal
@@ -410,6 +413,11 @@ export function getConfig(environmentName: string = 'dev'): EnvironmentConfig {
     ? process.env.ENABLE_OPENAI_COMPAT.toLowerCase() === 'true'
     : config.enableOpenaiCompat;
 
+  // Override OpenAI-passthrough settings from environment variables
+  const enableOpenaiPassthrough = process.env.ENABLE_OPENAI_PASSTHROUGH
+    ? process.env.ENABLE_OPENAI_PASSTHROUGH.toLowerCase() === 'true'
+    : config.enableOpenaiPassthrough;
+
   // Override CloudFront settings from environment variables
   const enableCloudFront = process.env.ENABLE_CLOUDFRONT
     ? process.env.ENABLE_CLOUDFRONT.toLowerCase() === 'true'
@@ -425,6 +433,7 @@ export function getConfig(environmentName: string = 'dev'): EnvironmentConfig {
     enableWebSearch,
     enableWebFetch,
     enableOpenaiCompat,
+    enableOpenaiPassthrough,
     enableCloudFront,
     ...(process.env.OPENAI_BASE_URL && { openaiBaseUrl: process.env.OPENAI_BASE_URL }),
     ...(process.env.OTEL_EXPORTER_OTLP_ENDPOINT && { otelExporterEndpoint: process.env.OTEL_EXPORTER_OTLP_ENDPOINT }),
diff --git a/cdk/lib/ecs-stack.ts b/cdk/lib/ecs-stack.ts
index 744e31c..9da7901 100644
--- a/cdk/lib/ecs-stack.ts
+++ b/cdk/lib/ecs-stack.ts
@@ -283,6 +283,7 @@ export class ECSStack extends cdk.Stack {
 
       // OpenAI-Compatible API (Bedrock Mantle)
       ENABLE_OPENAI_COMPAT: config.enableOpenaiCompat.toString(),
+      ENABLE_OPENAI_PASSTHROUGH: config.enableOpenaiPassthrough.toString(),
       ...(config.openaiBaseUrl && { OPENAI_BASE_URL: config.openaiBaseUrl }),
       ...(process.env.OPENAI_API_KEY && { OPENAI_API_KEY: process.env.OPENAI_API_KEY }),
 

From b926b421cbd774d0a47453ead2ff0eea48e00a02 Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 13:00:46 +0000
Subject: [PATCH 21/22] fix(openai-passthrough): synthesize event: lines for
 Responses API SSE

Bedrock-mantle emits Responses API SSE as data-only frames (the event type
is embedded as a JSON field but no `event: <type>` line is present). This
matches the SSE spec but diverges from real OpenAI servers, which prepend
each frame with `event: <type>`. Strict clients like OpenAI Codex CLI key
off the `event:` field and report "stream closed before response.completed"
when they don't see it.

Synthesize `event: <type>` lines from each data frame's JSON `type` field
when api_surface == "responses". Chat Completions streams remain unchanged
(real OpenAI doesn't use event: lines for that endpoint).

Tests:
- test_streaming_responses_synthesizes_event_lines_for_data_only_upstream
  asserts every data: frame is preceded by the matching event: line
- test_streaming_chat_completions_does_not_inject_event_lines
  pins the no-injection contract for chat completions

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/api/openai_passthrough/streaming.py       | 35 ++++++++++++++++++
 .../test_chat_completions.py                  | 27 ++++++++++++++
 .../test_openai_passthrough/test_responses.py | 36 +++++++++++++++++++
 3 files changed, 98 insertions(+)

diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py
index 883cb49..7864c5a 100644
--- a/app/api/openai_passthrough/streaming.py
+++ b/app/api/openai_passthrough/streaming.py
@@ -4,6 +4,14 @@
 StreamingResponse forwards them unchanged. After upstream stream ends, it
 calls the supplied on_complete callback with the captured usage dict so the
 caller can record usage to DynamoDB.
+
+Responses API note: bedrock-mantle emits Responses SSE as data-only frames
+(``data: {"type": "response.completed", ...}``) without the corresponding
+``event: response.completed`` line that the real OpenAI Responses API
+includes. Strict SSE clients (e.g. OpenAI Codex CLI) key off the ``event:``
+field and reject streams that lack it. For api_surface="responses" we
+synthesize ``event: <type>`` lines from the JSON ``type`` field on each frame
+to maintain wire compatibility with the real OpenAI server.
 """
 from __future__ import annotations
 
@@ -20,6 +28,24 @@
 logger = logging.getLogger(__name__)
 
 
+def _extract_event_type(raw_line: str) -> str | None:
+    """Return the ``type`` field from a ``data:`` JSON frame, or None if not parseable."""
+    line = raw_line.strip()
+    if not line.startswith("data:"):
+        return None
+    payload = line[len("data:"):].strip()
+    if not payload or payload == "[DONE]":
+        return None
+    try:
+        obj = json.loads(payload)
+    except (ValueError, TypeError):
+        return None
+    if not isinstance(obj, dict):
+        return None
+    event_type = obj.get("type")
+    return event_type if isinstance(event_type, str) else None
+
+
 async def stream_passthrough(
     method: str,
     path: str,
@@ -33,10 +59,19 @@ async def stream_passthrough(
 
     client = get_client()
     headers = upstream_headers(extra_headers)
+    synthesize_event_lines = api_surface == "responses"
 
     try:
         async with client.stream(method, upstream_url(path), json=body, headers=headers) as resp:
             async for raw_line in resp.aiter_lines():
+                # For the Responses API, prepend an ``event: <type>`` line whenever
+                # we see a data frame whose JSON carries a ``type`` field. This
+                # restores the OpenAI-spec SSE format that strict clients expect.
+                if synthesize_event_lines:
+                    event_type = _extract_event_type(raw_line)
+                    if event_type is not None:
+                        yield f"event: {event_type}\n".encode("utf-8")
+
                 # Upstream gives us SSE lines without trailing newlines; restore the
                 # framing byte so the SSE body is well-formed for the downstream client.
                 yield (raw_line + "\n").encode("utf-8")
diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py
index b26965c..d726fdd 100644
--- a/tests/integration/test_openai_passthrough/test_chat_completions.py
+++ b/tests/integration/test_openai_passthrough/test_chat_completions.py
@@ -155,6 +155,33 @@ def test_streaming_chat_completions_without_include_usage_does_not_log(
     assert not mock_usage_tracker.record_usage.called
 
 
+def test_streaming_chat_completions_does_not_inject_event_lines(
+    client, respx_mock,
+):
+    """Chat Completions SSE per OpenAI spec is data-only (no `event:` lines).
+    The proxy must NOT synthesize event: lines for this api_surface.
+    """
+    sse_lines = [
+        'data: {"id":"x","choices":[{"index":0,"delta":{"content":"hi"}}]}',
+        'data: [DONE]',
+    ]
+    body = "\n".join(sse_lines).encode()
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(
+            200, headers={"content-type": "text/event-stream"}, content=body
+        )
+    )
+
+    with client.stream(
+        "POST", "/openai/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "m", "messages": [], "stream": True},
+    ) as r:
+        out = b"".join(r.iter_bytes()).decode()
+
+    assert "event: " not in out, f"chat completions stream should not contain event: lines, got:\n{out}"
+
+
 def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock):
     """X-Amzn-Bedrock-* headers from the client should reach the upstream call."""
     route = respx_mock.post("/chat/completions").mock(
diff --git a/tests/integration/test_openai_passthrough/test_responses.py b/tests/integration/test_openai_passthrough/test_responses.py
index a596fed..8190a98 100644
--- a/tests/integration/test_openai_passthrough/test_responses.py
+++ b/tests/integration/test_openai_passthrough/test_responses.py
@@ -65,6 +65,42 @@ def test_streaming_responses_records_usage_from_response_completed(
     assert kw["api_surface"] == "responses"
 
 
+def test_streaming_responses_synthesizes_event_lines_for_data_only_upstream(
+    client, respx_mock,
+):
+    """Bedrock-mantle's Responses API emits data-only SSE (no `event:` lines).
+    Strict clients (e.g. Codex CLI) require `event: <type>` per OpenAI spec, so
+    the proxy must synthesize them from each frame's JSON `type` field.
+    """
+    sse_lines = [
+        'data: {"type":"response.created","response":{"id":"r-1"}}',
+        '',
+        'data: {"type":"response.output_text.delta","delta":"hi"}',
+        '',
+        'data: ' + json.dumps({
+            "type": "response.completed",
+            "response": {"id": "r-1", "usage": {"input_tokens": 1, "output_tokens": 1, "total_tokens": 2}},
+        }),
+        '',
+    ]
+    body = "\n".join(sse_lines).encode()
+    respx_mock.post("/responses").mock(
+        return_value=httpx.Response(200, headers={"content-type": "text/event-stream"}, content=body)
+    )
+
+    with client.stream(
+        "POST", "/openai/v1/responses",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "m", "input": [], "stream": True},
+    ) as r:
+        out = b"".join(r.iter_bytes()).decode()
+
+    # Each data: frame with a `type` field should be preceded by an event: line
+    assert "event: response.created\ndata: " in out
+    assert "event: response.output_text.delta\ndata: " in out
+    assert "event: response.completed\ndata: " in out
+
+
 def test_responses_upstream_error_returned_verbatim(client, respx_mock, mock_usage_tracker):
     respx_mock.post("/responses").mock(
         return_value=httpx.Response(400, json={"error": {"message": "bad input", "type": "invalid_request_error"}})

From e950e8a36b7f05a82a874c472bd83aaeab74d8ad Mon Sep 17 00:00:00 2001
From: River Xie <xiehust@163.com>
Date: Mon, 25 May 2026 13:22:22 +0000
Subject: [PATCH 22/22] fix(openai-passthrough): surface upstream HTTP errors
 with real status codes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, when client requested stream=true and upstream returned a
non-2xx status (e.g. validation 400) or a connection error, the proxy
would still return 200 text/event-stream and dump the JSON error body
(or a synthetic SSE frame) into the stream. Strict SSE clients like
OpenAI Codex CLI then hang waiting for response.completed and report
"stream closed before response.completed" — masking the real error.

Refactor: split open_upstream_stream() (peeks at status) from
stream_passthrough_response() (streams an open 2xx body). The router
now:

- Returns the real upstream status as JSONResponse when the upstream
  responds with 4xx/5xx for a streaming request.
- Returns 502/504 JSON when the upstream is unreachable
  (TimeoutException / RequestError) before any bytes flow.
- Continues to emit an SSE error+[DONE] frame only for failures that
  occur AFTER the 2xx stream has begun (where we cannot retroactively
  change the HTTP status).

Tests:
- test_streaming_responses_upstream_4xx_returns_json_not_sse
- test_streaming_upstream_timeout_returns_json_504 (replaces the prior
  test that asserted the buggy SSE-error behavior)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/api/openai_passthrough/router.py          |  62 +++++++-
 app/api/openai_passthrough/streaming.py       | 143 +++++++++++++++---
 .../test_chat_completions.py                  |  22 +--
 .../test_openai_passthrough/test_responses.py |  23 +++
 4 files changed, 216 insertions(+), 34 deletions(-)

diff --git a/app/api/openai_passthrough/router.py b/app/api/openai_passthrough/router.py
index a505f49..70c0406 100644
--- a/app/api/openai_passthrough/router.py
+++ b/app/api/openai_passthrough/router.py
@@ -13,7 +13,11 @@
 
 from app.api.openai_passthrough.client import get_client, upstream_headers, upstream_url
 from app.api.openai_passthrough.model_mapping import resolve_model_id
-from app.api.openai_passthrough.streaming import stream_passthrough
+from app.api.openai_passthrough.streaming import (
+    UpstreamConnectionError,
+    open_upstream_stream,
+    stream_passthrough_response,
+)
 from app.api.openai_passthrough.usage_extractor import normalize_usage
 from app.db.dynamodb import DynamoDBClient, ModelMappingManager, UsageTracker
 from app.middleware.auth import get_api_key_info
@@ -75,12 +79,26 @@ async def chat_completions(
     extra = _passthrough_extra_headers(request)
 
     if body.get("stream"):
+        try:
+            upstream_resp, error_body = await open_upstream_stream(
+                "POST", "/chat/completions", body, extra
+            )
+        except UpstreamConnectionError as exc:
+            return JSONResponse(
+                {"error": {"message": exc.message, "type": "upstream_error"}},
+                status_code=exc.status_code,
+            )
+        if error_body is not None:
+            return JSONResponse(
+                _decode_error_body(error_body),
+                status_code=upstream_resp.status_code,
+            )
+
         async def on_complete(usage: dict[str, Any]) -> None:
             _record_usage(api_key_info, usage, body["model"], "chat_completions")
+
         return StreamingResponse(
-            stream_passthrough(
-                "POST", "/chat/completions", body, "chat_completions", on_complete, extra
-            ),
+            stream_passthrough_response(upstream_resp, "chat_completions", on_complete),
             media_type="text/event-stream",
         )
 
@@ -107,10 +125,26 @@ async def responses_create(
     extra = _passthrough_extra_headers(request)
 
     if body.get("stream"):
+        try:
+            upstream_resp, error_body = await open_upstream_stream(
+                "POST", "/responses", body, extra
+            )
+        except UpstreamConnectionError as exc:
+            return JSONResponse(
+                {"error": {"message": exc.message, "type": "upstream_error"}},
+                status_code=exc.status_code,
+            )
+        if error_body is not None:
+            return JSONResponse(
+                _decode_error_body(error_body),
+                status_code=upstream_resp.status_code,
+            )
+
         async def on_complete(usage: dict[str, Any]) -> None:
             _record_usage(api_key_info, usage, body["model"], "responses")
+
         return StreamingResponse(
-            stream_passthrough("POST", "/responses", body, "responses", on_complete, extra),
+            stream_passthrough_response(upstream_resp, "responses", on_complete),
             media_type="text/event-stream",
         )
 
@@ -185,3 +219,21 @@ def _safe_json(resp) -> dict[str, Any]:
         return cast(dict[str, Any], resp.json())
     except ValueError:
         return {"error": {"message": resp.text, "type": "upstream_error"}}
+
+
+def _decode_error_body(body: bytes) -> dict[str, Any]:
+    """Parse a non-2xx upstream body as JSON, falling back to a wrapped string."""
+    import json as _json
+
+    try:
+        decoded = _json.loads(body)
+    except (ValueError, TypeError):
+        return {
+            "error": {
+                "message": body.decode("utf-8", "replace"),
+                "type": "upstream_error",
+            }
+        }
+    if isinstance(decoded, dict):
+        return cast(dict[str, Any], decoded)
+    return {"error": {"message": str(decoded), "type": "upstream_error"}}
diff --git a/app/api/openai_passthrough/streaming.py b/app/api/openai_passthrough/streaming.py
index 7864c5a..8648f91 100644
--- a/app/api/openai_passthrough/streaming.py
+++ b/app/api/openai_passthrough/streaming.py
@@ -12,6 +12,11 @@
 field and reject streams that lack it. For api_surface="responses" we
 synthesize ``event: <type>`` lines from the JSON ``type`` field on each frame
 to maintain wire compatibility with the real OpenAI server.
+
+Upstream-error contract: ``open_upstream_stream`` returns the (resp, error_body)
+tuple BEFORE FastAPI has committed any response headers. If the upstream
+returns a non-2xx status, the caller can hand back a JSONResponse with the
+real upstream status code instead of a fake 200 streaming response.
 """
 from __future__ import annotations
 
@@ -46,36 +51,94 @@ def _extract_event_type(raw_line: str) -> str | None:
     return event_type if isinstance(event_type, str) else None
 
 
-async def stream_passthrough(
+class UpstreamConnectionError(Exception):
+    """Raised by open_upstream_stream when the upstream is unreachable.
+
+    Carries the HTTP status to return to the client (504 for timeouts, 502
+    for other request errors) and the name of the underlying httpx exception.
+    """
+
+    def __init__(self, status_code: int, message: str, exc_type: str) -> None:
+        super().__init__(message)
+        self.status_code = status_code
+        self.message = message
+        self.exc_type = exc_type
+
+
+async def open_upstream_stream(
     method: str,
     path: str,
     body: dict[str, Any] | None,
-    api_surface: str,
-    on_complete: Callable[[dict[str, Any]], Awaitable[None] | None],
     extra_headers: dict[str, str] | None = None,
-) -> AsyncIterator[bytes]:
-    """Stream upstream response bytes line-by-line; capture usage; trigger callback."""
-    usage: dict[str, Any] = {}
+) -> tuple[httpx.Response, bytes | None]:
+    """Open an upstream streaming request and peek at the status code.
+
+    Returns (resp, error_body):
+      - error_body is None if upstream status is < 400 — caller streams the
+        body and is responsible for closing the response.
+      - error_body is the full upstream body bytes if status >= 400 — caller
+        should return a JSONResponse with resp.status_code. The response is
+        already closed.
 
+    Raises UpstreamConnectionError if the upstream is unreachable
+    (timeout, DNS, TLS, connection reset). Caller should turn this into a
+    JSON 502/504 with the carried status code.
+    """
     client = get_client()
     headers = upstream_headers(extra_headers)
+    req = client.build_request(method, upstream_url(path), json=body, headers=headers)
+    try:
+        resp = await client.send(req, stream=True)
+    except httpx.TimeoutException as exc:
+        logger.error("[OPENAI-PASSTHROUGH] upstream timeout opening stream: %s", exc)
+        raise UpstreamConnectionError(
+            status_code=504,
+            message=f"upstream timeout: {exc}",
+            exc_type=type(exc).__name__,
+        ) from exc
+    except httpx.RequestError as exc:
+        logger.error("[OPENAI-PASSTHROUGH] upstream connection error opening stream: %s", exc)
+        raise UpstreamConnectionError(
+            status_code=502,
+            message=f"upstream connection failed: {exc}",
+            exc_type=type(exc).__name__,
+        ) from exc
+
+    if resp.status_code >= 400:
+        try:
+            error_body = await resp.aread()
+        finally:
+            await resp.aclose()
+        return resp, error_body
+    return resp, None
+
+
+async def stream_passthrough_response(
+    resp: httpx.Response,
+    api_surface: str,
+    on_complete: Callable[[dict[str, Any]], Awaitable[None] | None],
+) -> AsyncIterator[bytes]:
+    """Stream the body of an already-opened 2xx upstream response.
+
+    Closes the response when done.
+    """
+    usage: dict[str, Any] = {}
     synthesize_event_lines = api_surface == "responses"
 
     try:
-        async with client.stream(method, upstream_url(path), json=body, headers=headers) as resp:
-            async for raw_line in resp.aiter_lines():
-                # For the Responses API, prepend an ``event: <type>`` line whenever
-                # we see a data frame whose JSON carries a ``type`` field. This
-                # restores the OpenAI-spec SSE format that strict clients expect.
-                if synthesize_event_lines:
-                    event_type = _extract_event_type(raw_line)
-                    if event_type is not None:
-                        yield f"event: {event_type}\n".encode("utf-8")
-
-                # Upstream gives us SSE lines without trailing newlines; restore the
-                # framing byte so the SSE body is well-formed for the downstream client.
-                yield (raw_line + "\n").encode("utf-8")
-                try_extract_usage_from_sse(raw_line, usage, api_surface)
+        async for raw_line in resp.aiter_lines():
+            # For the Responses API, prepend an ``event: <type>`` line whenever
+            # we see a data frame whose JSON carries a ``type`` field. This
+            # restores the OpenAI-spec SSE format that strict clients expect.
+            if synthesize_event_lines:
+                event_type = _extract_event_type(raw_line)
+                if event_type is not None:
+                    yield f"event: {event_type}\n".encode("utf-8")
+
+            # Upstream gives us SSE lines without trailing newlines; restore the
+            # framing byte so the SSE body is well-formed for the downstream client.
+            yield (raw_line + "\n").encode("utf-8")
+            try_extract_usage_from_sse(raw_line, usage, api_surface)
     except (httpx.RequestError, httpx.TimeoutException) as exc:
         # Upstream connection/timeout failure during streaming. OpenAI SDK clients
         # expect a clean SSE termination, not an abruptly closed stream.
@@ -93,9 +156,49 @@ async def stream_passthrough(
         # Unexpected error — re-raise so FastAPI can convert to 500.
         logger.error("[OPENAI-PASSTHROUGH] upstream stream error: %s", exc)
         raise
+    finally:
+        await resp.aclose()
 
     if usage:
         result = on_complete(usage)
         # Support both sync and async callbacks
         if hasattr(result, "__await__"):
             await result  # type: ignore[misc]
+
+
+# ---------------------------------------------------------------------------
+# Backwards-compat helper: streams in one call. Useful where the caller doesn't
+# need to differentiate streaming-error vs streaming-success at the HTTP-status
+# level (legacy code path; new code should use open_upstream_stream +
+# stream_passthrough_response so non-2xx errors come back as a real JSONResponse).
+# ---------------------------------------------------------------------------
+
+async def stream_passthrough(
+    method: str,
+    path: str,
+    body: dict[str, Any] | None,
+    api_surface: str,
+    on_complete: Callable[[dict[str, Any]], Awaitable[None] | None],
+    extra_headers: dict[str, str] | None = None,
+) -> AsyncIterator[bytes]:
+    """Open + stream + close in one call. Status-checking variant lives in
+    open_upstream_stream/stream_passthrough_response."""
+    resp, error_body = await open_upstream_stream(method, path, body, extra_headers)
+    if error_body is not None:
+        # Legacy callers can't surface a real error status here; emit an SSE
+        # error frame and terminate cleanly so the client doesn't hang.
+        try:
+            err_payload = json.loads(error_body)
+        except (ValueError, TypeError):
+            err_payload = {
+                "error": {
+                    "message": error_body.decode("utf-8", "replace"),
+                    "type": "upstream_error",
+                }
+            }
+        yield ("data: " + json.dumps(err_payload) + "\n\n").encode("utf-8")
+        yield b"data: [DONE]\n\n"
+        return
+
+    async for chunk in stream_passthrough_response(resp, api_surface, on_complete):
+        yield chunk
diff --git a/tests/integration/test_openai_passthrough/test_chat_completions.py b/tests/integration/test_openai_passthrough/test_chat_completions.py
index d726fdd..0fc1120 100644
--- a/tests/integration/test_openai_passthrough/test_chat_completions.py
+++ b/tests/integration/test_openai_passthrough/test_chat_completions.py
@@ -204,23 +204,27 @@ def test_bedrock_guardrail_headers_are_forwarded(client, respx_mock):
     assert sent.headers["x-amzn-bedrock-guardrailversion"] == "DRAFT"
 
 
-def test_streaming_upstream_timeout_yields_clean_sse_error(
+def test_streaming_upstream_timeout_returns_json_504(
     client, respx_mock, mock_usage_tracker
 ):
-    """Upstream timeout during streaming should produce a structured SSE error event, not crash the stream."""
+    """When upstream times out before the stream begins, the proxy must
+    surface a real HTTP 504 with a JSON error body (NOT a fake 200
+    text/event-stream wrapping an SSE error frame). Strict clients can
+    then act on the status code instead of hanging on a malformed stream.
+    """
     respx_mock.post("/chat/completions").mock(
         side_effect=httpx.ReadTimeout("upstream took too long")
     )
 
-    with client.stream(
-        "POST",
+    r = client.post(
         "/openai/v1/chat/completions",
         headers={"Authorization": "Bearer sk-test"},
         json={"model": "m", "messages": [], "stream": True},
-    ) as r:
-        out = b"".join(r.iter_bytes())
+    )
 
-    assert b'"upstream_error"' in out, f"expected structured error, got: {out}"
-    assert b"[DONE]" in out
-    # No usage logged when the stream errored before any usage event arrived
+    assert r.status_code == 504
+    assert r.headers["content-type"].startswith("application/json")
+    body = r.json()
+    assert body["error"]["type"] == "upstream_error"
+    assert "timeout" in body["error"]["message"].lower()
     assert not mock_usage_tracker.record_usage.called
diff --git a/tests/integration/test_openai_passthrough/test_responses.py b/tests/integration/test_openai_passthrough/test_responses.py
index 8190a98..e791651 100644
--- a/tests/integration/test_openai_passthrough/test_responses.py
+++ b/tests/integration/test_openai_passthrough/test_responses.py
@@ -113,3 +113,26 @@ def test_responses_upstream_error_returned_verbatim(client, respx_mock, mock_usa
     assert r.status_code == 400
     assert r.json()["error"]["message"] == "bad input"
     assert not mock_usage_tracker.record_usage.called
+
+
+def test_streaming_responses_upstream_4xx_returns_json_not_sse(
+    client, respx_mock, mock_usage_tracker
+):
+    """When upstream rejects a streaming request with 4xx, the proxy must
+    surface a real JSON 4xx response — NOT a fake 200 text/event-stream
+    that wraps the error body. Strict SSE clients (codex) hang waiting
+    for response.completed if we send the error as event-stream.
+    """
+    err = {"error": {"message": "tools[13].type=namespace not allowed", "type": "validation_error"}}
+    respx_mock.post("/responses").mock(return_value=httpx.Response(400, json=err))
+
+    r = client.post(
+        "/openai/v1/responses",
+        headers={"Authorization": "Bearer sk-test"},
+        json={"model": "m", "input": [], "stream": True, "tools": [{"type": "namespace"}]},
+    )
+    assert r.status_code == 400
+    assert r.headers["content-type"].startswith("application/json"), \
+        f"expected JSON content-type, got {r.headers['content-type']}"
+    assert r.json() == err
+    assert not mock_usage_tracker.record_usage.called