From 0093f01d1449260349f813b783a177277e5f1fcb Mon Sep 17 00:00:00 2001
From: aymaneo
Date: Thu, 6 Nov 2025 11:00:27 +0100
Subject: [PATCH] Add Responses API support for reasoning models

---
 verifiers/envs/environment.py            | 60 ++++++++++++++++++-----
 verifiers/envs/multiturn_env.py          |  4 +-
 verifiers/utils/message_utils.py         | 38 +++++++++++++++
 verifiers/utils/responses_api_adapter.py | 61 ++++++++++++++++++++++++
 4 files changed, 151 insertions(+), 12 deletions(-)
 create mode 100644 verifiers/utils/responses_api_adapter.py

diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py
index fcc2eed3c..b9cbf8d3e 100644
--- a/verifiers/envs/environment.py
+++ b/verifiers/envs/environment.py
@@ -32,9 +32,12 @@
 from verifiers.utils.async_utils import maybe_semaphore
 from verifiers.utils.eval_utils import make_dataset, save_results
 from verifiers.utils.message_utils import (
+    adapt_tools_for_responses_api,
     cleanup_messages,
+    extract_system_message,
     get_overlong_prompt_dummy_response,
 )
+from verifiers.utils.responses_api_adapter import ResponsesAPIAdapter
 from verifiers.utils.path_utils import get_results_path
 from verifiers.utils.processing_utils import (
     process_chat_format_vllm,
@@ -67,6 +70,7 @@ def __init__(
         env_id: str | None = None,
         env_args: dict | None = None,
         map_kwargs: dict = {},
+        use_responses_api: bool = False,
         **kwargs,
     ):
         self.logger = logging.getLogger(f"verifiers.envs.{self.__class__.__name__}")
@@ -75,6 +79,7 @@ def __init__(
         self.system_prompt = system_prompt
         self.few_shot = few_shot
         self.parser = parser or Parser()
+        self.use_responses_api = use_responses_api
         self.rubric = rubric or Rubric()
         if self.parser.__class__ != self.rubric.parser.__class__:
             self.logger.warning(
@@ -281,20 +286,53 @@ async def get_model_response(
                 "modalities": ["text"],
             }
 
-            if oai_tools:
-                response = await client.chat.completions.create(
+            if self.use_responses_api:
+                instructions, input_messages = extract_system_message(prompt)
+                adapted_tools = adapt_tools_for_responses_api(oai_tools)
+                if len(input_messages) == 1:
+                    api_input = input_messages[0].get("content", "")
+                else:
+                    api_input = input_messages
+                unsupported_params = {  # Chat Completions params to drop for the Responses API
+                    "n",
+                    "presence_penalty",
+                    "frequency_penalty",
+                    "logprobs",
+                    "top_logprobs",
+                    "logit_bias",
+                    "stream",
+                    "stream_options",
+                    "user",
+                    "temperature",
+                }
+                responses_sampling_args = {
+                    k: v
+                    for k, v in clean_sampling_args.items()
+                    if k not in unsupported_params
+                }
+                response = await client.responses.create(
                     model=model,
-                    messages=prompt,  # type: ignore
-                    tools=oai_tools,
-                    **clean_sampling_args,
+                    instructions=instructions,
+                    input=api_input,
+                    tools=adapted_tools,
+                    **responses_sampling_args,
                 )
+                return ResponsesAPIAdapter(response)
             else:
-                response = await client.chat.completions.create(
-                    model=model,
-                    messages=prompt,  # type: ignore
-                    **clean_sampling_args,
-                )
-            return response
+                if oai_tools:
+                    response = await client.chat.completions.create(
+                        model=model,
+                        messages=prompt,  # type: ignore
+                        tools=oai_tools,
+                        **clean_sampling_args,
+                    )
+                else:
+                    response = await client.chat.completions.create(
+                        model=model,
+                        messages=prompt,  # type: ignore
+                        **clean_sampling_args,
+                    )
+                return response
         elif message_type == "completion":
             if oai_tools:
                 raise ValueError(
diff --git a/verifiers/envs/multiturn_env.py b/verifiers/envs/multiturn_env.py
index 3540dca40..a9e7e65a6 100644
--- a/verifiers/envs/multiturn_env.py
+++ b/verifiers/envs/multiturn_env.py
@@ -106,7 +106,9 @@ async def rollout(
         response_text: str = ""
         if self.message_type == "chat":
             assert isinstance(context_messages, list)
-            assert isinstance(response, ChatCompletion)
+            assert isinstance(response, ChatCompletion) or hasattr(
+                response, "choices"
+            )
             if response.choices and response.choices[0].message:
                 response_text = response.choices[0].message.content or ""
                 response_message: ChatMessage = {
diff --git a/verifiers/utils/message_utils.py b/verifiers/utils/message_utils.py
index 0a61abde0..cada6fdc4 100644
--- a/verifiers/utils/message_utils.py
+++ b/verifiers/utils/message_utils.py
@@ -155,3 +155,41 @@ def get_overlong_prompt_dummy_response(message_type: MessageType) -> ModelRespon
         )
     else:
         raise ValueError(f"Invalid message type: {message_type}")
+
+
+def extract_system_message(messages: list) -> tuple[str | None, list]:
+    """Extract the first system message as instructions; later system messages are dropped."""
+    instructions = None
+    remaining = []
+
+    for msg in messages:
+        if msg.get("role") == "system":
+            if instructions is None:
+                instructions = msg.get("content")
+        else:
+            remaining.append(msg)
+
+    return instructions, remaining
+
+
+def adapt_tools_for_responses_api(chat_tools: list | None) -> list | None:
+    """Convert Chat Completions tool schemas to the flat Responses API format."""
+    if not chat_tools:
+        return None
+
+    adapted = []
+    for tool in chat_tools:
+        if tool.get("type") == "function":
+            func = tool.get("function", {})
+            adapted.append(
+                {
+                    "type": "function",
+                    "name": func.get("name"),
+                    "description": func.get("description"),
+                    "parameters": func.get("parameters"),
+                }
+            )
+        else:
+            adapted.append(tool)
+
+    return adapted
diff --git a/verifiers/utils/responses_api_adapter.py b/verifiers/utils/responses_api_adapter.py
new file mode 100644
index 000000000..7f6606761
--- /dev/null
+++ b/verifiers/utils/responses_api_adapter.py
@@ -0,0 +1,61 @@
+from openai.types.chat import ChatCompletionMessage
+from openai.types.chat.chat_completion import Choice
+
+
+class ResponsesAPIAdapter:
+    """Adapter to normalize Responses API responses to ChatCompletion format."""
+
+    def __init__(self, responses_response):
+        self._response = responses_response
+        self._text = getattr(responses_response, "output_text", "")
+        self._tool_calls = self._extract_tool_calls()
+
+    def _extract_tool_calls(self):
+        tool_calls = []
+        output = getattr(self._response, "output", [])
+
+        for item in output:
+            item_type = getattr(item, "type", None)
+            if item_type == "function_call":
+                tool_calls.append(
+                    {
+                        "id": getattr(item, "call_id", ""),
+                        "type": "function",
+                        "function": {
+                            "name": getattr(item, "name", ""),
+                            "arguments": str(getattr(item, "arguments", {})),
+                        },
+                    }
+                )
+
+        return tool_calls if tool_calls else None
+
+    @property
+    def choices(self):
+        return [
+            Choice(
+                index=0,
+                message=ChatCompletionMessage(role="assistant", content=self._text, tool_calls=self._tool_calls),
+                finish_reason="tool_calls" if self._tool_calls else "stop",
+            )
+        ]
+
+    @property
+    def id(self):
+        return getattr(self._response, "id", "responses-api-adapter")
+
+    @property
+    def model(self):
+        return getattr(self._response, "model", "")
+
+    @property
+    def created(self):
+        return getattr(self._response, "created_at", 0)
+
+    @property
+    def object(self):
+        return "chat.completion"
+
+    @property
+    def usage(self):
+        return getattr(self._response, "usage", None)
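
Reviewer note (not part of the patch): a minimal sketch of how the pieces above
fit together, assuming the patch is applied and the openai package is installed.
The message list, the tool schema, and the SimpleNamespace stand-in for a
Responses API result are made-up examples for illustration, not objects from
the real SDK.

    from types import SimpleNamespace

    from verifiers.utils.message_utils import (
        adapt_tools_for_responses_api,
        extract_system_message,
    )
    from verifiers.utils.responses_api_adapter import ResponsesAPIAdapter

    # The system prompt is lifted out as `instructions`; other messages remain.
    instructions, remaining = extract_system_message(
        [
            {"role": "system", "content": "You are terse."},
            {"role": "user", "content": "What is 2 + 2?"},
        ]
    )
    assert instructions == "You are terse."
    assert remaining == [{"role": "user", "content": "What is 2 + 2?"}]

    # Chat Completions nests the schema under "function"; Responses flattens it.
    [tool] = adapt_tools_for_responses_api(
        [
            {
                "type": "function",
                "function": {
                    "name": "add",
                    "description": "Add two integers.",
                    "parameters": {"type": "object", "properties": {}},
                },
            }
        ]
    )
    assert tool["name"] == "add" and "function" not in tool

    # The adapter makes a Responses-style result quack like a ChatCompletion,
    # so callers such as MultiTurnEnv.rollout can keep reading .choices.
    fake = SimpleNamespace(output_text="4", output=[], id="resp_1", usage=None)
    adapted = ResponsesAPIAdapter(fake)
    assert adapted.choices[0].message.content == "4"
    assert adapted.object == "chat.completion"

Since use_responses_api defaults to False, existing environments are untouched;
opting in only changes which endpoint get_model_response calls and normalizes
the result through the adapter above.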