From 359cc693e07e4945820bf2930125d39301f2057e Mon Sep 17 00:00:00 2001 From: Cooper Miller Date: Sat, 4 Oct 2025 12:59:30 -0700 Subject: [PATCH 1/6] feat: http transport + browserbase mcp example --- environments/mcp_env/mcp_env.py | 11 ++++ .../mcp_env/src/mcp_server_connection.py | 59 ++++++++++++++----- environments/mcp_env/src/models.py | 13 ++-- 3 files changed, 64 insertions(+), 19 deletions(-) diff --git a/environments/mcp_env/mcp_env.py b/environments/mcp_env/mcp_env.py index a7d3f6244..2ba20203b 100644 --- a/environments/mcp_env/mcp_env.py +++ b/environments/mcp_env/mcp_env.py @@ -19,6 +19,7 @@ EXA_FETCH_TOOLS = [ { "name": "exa", + "transport": "stdio", "command": "npx", "args": [ "-y", @@ -31,12 +32,22 @@ }, { "name": "fetch", + "transport": "stdio", "command": "uvx", "args": ["mcp-server-fetch"], "description": "Fetch MCP server", }, ] +BROWSERBASE_TOOLS = [ + { + "name": "browserbase", + "transport": "http", + "url": os.getenv("BROWSERBASE_URL"), + "description": "Browserbase MCP", + }, +] + class MCPEnv(ToolEnv): """Environment for MCP-based tools using the official MCP SDK.""" diff --git a/environments/mcp_env/src/mcp_server_connection.py b/environments/mcp_env/src/mcp_server_connection.py index 675947651..899ce9382 100644 --- a/environments/mcp_env/src/mcp_server_connection.py +++ b/environments/mcp_env/src/mcp_server_connection.py @@ -4,6 +4,7 @@ from mcp import ClientSession, StdioServerParameters from mcp.client.stdio import stdio_client +from mcp.client.streamable_http import streamablehttp_client from mcp.types import TextContent, Tool from .models import MCPServerConfig @@ -35,27 +36,55 @@ async def connect(self): async def _get_connection(self): try: - server_params = StdioServerParameters( - command=self.config.command, - args=self.config.args or [], - env=self.config.env, - ) + if self.config.transport == "stdio": + if not self.config.command: + raise ValueError("stdio transport requires 'command'") + server_params = StdioServerParameters( + command=self.config.command, + args=self.config.args or [], + env=self.config.env, + ) - async with stdio_client(server_params) as (read, write): - async with ClientSession(read, write) as session: - self.session = session + async with stdio_client(server_params) as (read, write): + async with ClientSession(read, write) as session: + self.session = session - await session.initialize() + await session.initialize() - tools_response = await session.list_tools() + tools_response = await session.list_tools() - for tool in tools_response.tools: - self.tools[tool.name] = tool + for tool in tools_response.tools: + self.tools[tool.name] = tool - self._ready.set() + self._ready.set() - while True: - await asyncio.sleep(1) + while True: + await asyncio.sleep(1) + + elif self.config.transport == "http": + if not self.config.url: + raise ValueError("http transport requires 'url'") + + async with streamablehttp_client( + self.config.url, + headers=self.config.headers or {}, + ) as (read, write, _get_session_id): + async with ClientSession(read, write) as session: + self.session = session + + await session.initialize() + + tools_response = await session.list_tools() + + for tool in tools_response.tools: + self.tools[tool.name] = tool + + self._ready.set() + + while True: + await asyncio.sleep(1) + else: + raise ValueError(f"Unknown transport: {self.config.transport}") except asyncio.CancelledError: raise diff --git a/environments/mcp_env/src/models.py b/environments/mcp_env/src/models.py index 7a20dd38e..1b5ffc7c8 100644 --- a/environments/mcp_env/src/models.py +++ b/environments/mcp_env/src/models.py @@ -1,11 +1,16 @@ from dataclasses import dataclass -from typing import Dict, List +from typing import Dict, List, Literal, Optional @dataclass class MCPServerConfig: name: str - command: str - args: List[str] | None = None - env: Dict[str, str] | None = None + transport: Literal["stdio", "http"] = "stdio" description: str = "" + # stdio params + command: Optional[str] = None + args: Optional[List[str]] = None + env: Optional[Dict[str, str]] = None + # http params + url: Optional[str] = None + headers: Optional[Dict[str, str]] = None From b4ae862352e37fe2662c2b64791fccd0d7e4189a Mon Sep 17 00:00:00 2001 From: Filip Michalsky Date: Fri, 7 Nov 2025 13:58:44 -0500 Subject: [PATCH 2/6] make the browserbase mcp run and add logging --- environments/mcp_env/mcp_env.py | 87 ++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/environments/mcp_env/mcp_env.py b/environments/mcp_env/mcp_env.py index 2ba20203b..03bc04bb7 100644 --- a/environments/mcp_env/mcp_env.py +++ b/environments/mcp_env/mcp_env.py @@ -15,6 +15,7 @@ from verifiers.types import Message load_dotenv() +print("Please export OPENAI_API_KEY to the environment") EXA_FETCH_TOOLS = [ { @@ -42,13 +43,20 @@ BROWSERBASE_TOOLS = [ { "name": "browserbase", - "transport": "http", - "url": os.getenv("BROWSERBASE_URL"), - "description": "Browserbase MCP", + "transport": "stdio", + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase"], + "env": { + "BROWSERBASE_API_KEY": os.getenv("BROWSERBASE_API_KEY", ""), + "BROWSERBASE_PROJECT_ID": os.getenv("BROWSERBASE_PROJECT_ID", ""), + "GEMINI_API_KEY": os.getenv("GEMINI_API_KEY", ""), + }, + "description": "Browserbase MCP (via npx)", }, ] + class MCPEnv(ToolEnv): """Environment for MCP-based tools using the official MCP SDK.""" @@ -78,15 +86,21 @@ def __init__( super().__init__( tools=[], max_turns=max_turns, error_formatter=error_formatter, **kwargs ) + + self.logger.info(f"Initializing MCPEnv with {len(self.mcp_servers)} MCP server(s)") + # Start a persistent background event loop and connect synchronously self._bg_loop = asyncio.new_event_loop() self._bg_thread = threading.Thread( target=self._run_loop, args=(self._bg_loop,), daemon=True ) self._bg_thread.start() + self.logger.debug("Background event loop started") + fut = asyncio.run_coroutine_threadsafe(self._connect_servers(), self._bg_loop) fut.result() self._setup_complete = True + self.logger.info("MCPEnv initialization complete") # cleanup on exit atexit.register( @@ -104,44 +118,69 @@ def _run_loop(self, loop: asyncio.AbstractEventLoop): async def _connect_servers(self): wrapper_tools = [] + self.logger.info(f"Starting connection to {len(self.mcp_servers)} MCP server(s)") for server_config in self.mcp_servers: - connection = MCPServerConnection(server_config, self.logger) - tools = await connection.connect() + self.logger.info(f"Connecting to MCP server: '{server_config.name}'") + self.logger.debug(f" Transport: {server_config.transport}") + self.logger.debug(f" Command: {server_config.command}") + self.logger.debug(f" Args: {server_config.args}") + if server_config.env: + env_keys = list(server_config.env.keys()) + self.logger.debug(f" Environment variables: {env_keys}") - self.server_connections[server_config.name] = connection - - for tool in tools.values(): - wrapper = MCPToolWrapper(server_config.name, tool, connection) - wrapper_tools.append(wrapper) - self.mcp_tools[wrapper.__name__] = wrapper - self.logger.info( - f"Registered MCP tool: {wrapper.__name__} from server '{server_config.name}'" - ) + try: + connection = MCPServerConnection(server_config, self.logger) + tools = await connection.connect() + + self.server_connections[server_config.name] = connection + self.logger.info(f"✓ Successfully connected to '{server_config.name}', discovered {len(tools)} tool(s)") + + for tool in tools.values(): + wrapper = MCPToolWrapper(server_config.name, tool, connection) + wrapper_tools.append(wrapper) + self.mcp_tools[wrapper.__name__] = wrapper + self.logger.info( + f" ├─ Registered MCP tool: {wrapper.__name__}" + ) + except Exception as e: + self.logger.error(f"✗ Failed to connect to MCP server '{server_config.name}': {e}") + raise self.tools = wrapper_tools self.oai_tools = [tool.to_oai_tool() for tool in wrapper_tools] self.tool_map = {tool.__name__: tool for tool in wrapper_tools} + self.logger.info(f"✓ Total MCP tools registered: {len(self.tool_map)}") async def call_tool( self, tool_name: str, tool_args: dict, tool_call_id: str, **kwargs ) -> Message: if tool_name in self.tool_map: tool_wrapper = self.tool_map[tool_name] + self.logger.info(f"Calling tool: {tool_name}") + self.logger.debug(f" Arguments: {tool_args}") + try: result = await tool_wrapper(**tool_args) + result_str = str(result) + result_preview = result_str[:200] + "..." if len(result_str) > 200 else result_str + self.logger.info(f"✓ Tool '{tool_name}' completed successfully") + self.logger.debug(f" Result preview: {result_preview}") + return { "role": "tool", - "content": str(result), + "content": result_str, "tool_call_id": tool_call_id, } except Exception as e: + self.logger.error(f"✗ Tool '{tool_name}' failed: {e}") return { "role": "tool", "content": self.error_formatter(e), "tool_call_id": tool_call_id, } else: + self.logger.error(f"✗ Tool '{tool_name}' not found in tool_map") return { "role": "tool", "content": f"Error: Tool '{tool_name}' not found", @@ -149,25 +188,35 @@ async def call_tool( } async def cleanup(self): - for connection in self.server_connections.values(): - await connection.disconnect() + self.logger.info(f"Cleaning up {len(self.server_connections)} MCP server connection(s)") + + for name, connection in self.server_connections.items(): + try: + self.logger.debug(f"Disconnecting from MCP server: '{name}'") + await connection.disconnect() + self.logger.info(f"✓ Disconnected from MCP server: '{name}'") + except Exception as e: + self.logger.error(f"✗ Error disconnecting from MCP server '{name}': {e}") self.server_connections.clear() self.mcp_tools.clear() + self.logger.info("Cleanup complete") def _shutdown_loop(self): + self.logger.debug("Shutting down background event loop") self._bg_loop.call_soon_threadsafe(self._bg_loop.stop) self._bg_thread.join(timeout=5) + self.logger.debug("Background event loop stopped") def load_environment( - mcp_servers: list = EXA_FETCH_TOOLS, dataset=None, **kwargs + mcp_servers: list = EXA_FETCH_TOOLS + BROWSERBASE_TOOLS, dataset=None, **kwargs ) -> vf.Environment: """Load an MCPEnv environment with fetch server for testing.""" dataset = dataset or Dataset.from_dict( { "question": [ - "Find out what Prime Intellect's newest announcement was from their website, give me the headline in 2 words. Their url is primeintellect.ai", + "Find out what Prime Intellect's newest announcement was from their website, give me the headline in 2 words. Their url is primeintellect.ai. Use the browserbase tools to get the information.", ], "answer": ["ENVIRONMENTS HUB"], } From 2d1643091fdb566becdcc349228454c160e7ae1e Mon Sep 17 00:00:00 2001 From: Filip Michalsky Date: Fri, 7 Nov 2025 14:18:44 -0500 Subject: [PATCH 3/6] add env var prereq --- environments/mcp_env/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/environments/mcp_env/README.md b/environments/mcp_env/README.md index 700fee9b3..6fd22c17a 100644 --- a/environments/mcp_env/README.md +++ b/environments/mcp_env/README.md @@ -20,6 +20,14 @@ ### Quickstart +**Prerequisites:** + +Export your OpenAI API key for the judge LLM: + +```bash +export OPENAI_API_KEY=your_api_key_here +``` + Run an evaluation with default settings: ```bash From 86045100d6d4c166368d8555484e2ffc819c1904 Mon Sep 17 00:00:00 2001 From: Filip Michalsky Date: Fri, 7 Nov 2025 14:53:53 -0500 Subject: [PATCH 4/6] update readme --- environments/mcp_env/README.md | 60 +++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/environments/mcp_env/README.md b/environments/mcp_env/README.md index 6fd22c17a..23c8bfa91 100644 --- a/environments/mcp_env/README.md +++ b/environments/mcp_env/README.md @@ -14,20 +14,72 @@ ### Task -- **Type**: +- **Type**: multi-turn | tool use - **Parser**: N/A -- **Rubric overview**: N/A +- **Rubric overview**: Judge-based evaluation using gpt-4.1-mini + +### MCP Tools + +This environment integrates with MCP (Model Context Protocol) servers to provide tool-calling capabilities. By default, it includes: + +#### Exa & Fetch Tools + +- **Exa MCP Server**: Search and discovery tool for finding relevant web content + - Command: `npx -y exa-mcp-server` + - Required: `EXA_API_KEY` environment variable + +- **Fetch MCP Server**: Fetches and retrieves web content from URLs + - Command: `uvx mcp-server-fetch` + - No API key required + +#### Browserbase Tools + +- **Browserbase MCP Server**: Browser automation for interacting with web pages using AI-powered navigation + - Command: `npx @browserbasehq/mcp-server-browserbase` + - Required environment variables: + - `BROWSERBASE_API_KEY` + - `BROWSERBASE_PROJECT_ID` + - `GEMINI_API_KEY` + +**Customizing Tools:** + +You can pass custom MCP server configurations via the `mcp_servers` argument to `load_environment()`: + +```python +custom_servers = [ + { + "name": "my-server", + "transport": "stdio", + "command": "npx", + "args": ["my-mcp-server"], + "env": {"API_KEY": "your_key"}, + "description": "Custom MCP server" + } +] +env = load_environment(mcp_servers=custom_servers) +``` ### Quickstart **Prerequisites:** -Export your OpenAI API key for the judge LLM: +Export the required API keys for the judge LLM and MCP tools: ```bash -export OPENAI_API_KEY=your_api_key_here +# Required for judge-based evaluation +export OPENAI_API_KEY=your_openai_key + +# Required for Exa MCP server (search/discovery) +export EXA_API_KEY=your_exa_key + +# Required for Browserbase MCP server (browser automation) +export BROWSERBASE_API_KEY=your_browserbase_key +export BROWSERBASE_PROJECT_ID=your_project_id +export GEMINI_API_KEY=your_gemini_key ``` +**Note:** Not all API keys are required for every task. The Fetch MCP server works without any API key. Only export the keys for the tools you intend to use. + Run an evaluation with default settings: ```bash From 26c4766c8f324e2c2a9778e6d72d4ce667196b07 Mon Sep 17 00:00:00 2001 From: Filip Michalsky Date: Mon, 1 Dec 2025 09:48:20 +0000 Subject: [PATCH 5/6] update the mcp bb example --- environments/mcp_env/.env.example | 11 +++++++++++ environments/mcp_env/README.md | 11 ++++++----- environments/mcp_env/mcp_env.py | 12 +++++++----- environments/mcp_env/pyproject.toml | 2 +- 4 files changed, 25 insertions(+), 11 deletions(-) create mode 100644 environments/mcp_env/.env.example diff --git a/environments/mcp_env/.env.example b/environments/mcp_env/.env.example new file mode 100644 index 000000000..52b78ee64 --- /dev/null +++ b/environments/mcp_env/.env.example @@ -0,0 +1,11 @@ +# OpenAI API key (required for judge-based evaluation) +OPENAI_API_KEY=your_openai_api_key + +# Smithery credentials for Exa MCP server +SMITHERY_KEY=your_smithery_key +SMITHERY_PROFILE=your_smithery_profile + +# Browserbase MCP server credentials +BROWSERBASE_API_KEY=your_browserbase_api_key +BROWSERBASE_PROJECT_ID=your_browserbase_project_id +GEMINI_API_KEY=your_gemini_api_key diff --git a/environments/mcp_env/README.md b/environments/mcp_env/README.md index 23c8bfa91..55300b0b7 100644 --- a/environments/mcp_env/README.md +++ b/environments/mcp_env/README.md @@ -24,9 +24,9 @@ This environment integrates with MCP (Model Context Protocol) servers to provide #### Exa & Fetch Tools -- **Exa MCP Server**: Search and discovery tool for finding relevant web content - - Command: `npx -y exa-mcp-server` - - Required: `EXA_API_KEY` environment variable +- **Exa MCP Server**: Search and discovery tool for finding relevant web content (via Smithery) + - Command: `npx -y @smithery/cli@latest run exa --key --profile ` + - Note: Authentication is handled via Smithery CLI key/profile - **Fetch MCP Server**: Fetches and retrieves web content from URLs - Command: `uvx mcp-server-fetch` @@ -69,8 +69,9 @@ Export the required API keys for the judge LLM and MCP tools: # Required for judge-based evaluation export OPENAI_API_KEY=your_openai_key -# Required for Exa MCP server (search/discovery) -export EXA_API_KEY=your_exa_key +# Required for Exa MCP server (via Smithery) +export SMITHERY_KEY=your_smithery_key +export SMITHERY_PROFILE=your_smithery_profile # Required for Browserbase MCP server (browser automation) export BROWSERBASE_API_KEY=your_browserbase_key diff --git a/environments/mcp_env/mcp_env.py b/environments/mcp_env/mcp_env.py index 03bc04bb7..3ec0da39d 100644 --- a/environments/mcp_env/mcp_env.py +++ b/environments/mcp_env/mcp_env.py @@ -15,7 +15,6 @@ from verifiers.types import Message load_dotenv() -print("Please export OPENAI_API_KEY to the environment") EXA_FETCH_TOOLS = [ { @@ -24,11 +23,14 @@ "command": "npx", "args": [ "-y", - "exa-mcp-server", + "@smithery/cli@latest", + "run", + "exa", + "--key", + os.getenv("SMITHERY_KEY", ""), + "--profile", + os.getenv("SMITHERY_PROFILE", ""), ], - "env": { - "EXA_API_KEY": os.getenv("EXA_API_KEY"), - }, "description": "Exa MCP server", }, { diff --git a/environments/mcp_env/pyproject.toml b/environments/mcp_env/pyproject.toml index 43529eadf..b59cc1c21 100644 --- a/environments/mcp_env/pyproject.toml +++ b/environments/mcp_env/pyproject.toml @@ -5,7 +5,7 @@ tags = ["train", "eval"] version = "0.1.0" requires-python = ">=3.11" dependencies = [ - "mcp>=1.14.1", + "mcp[cli]>=1.14.1", "python-dotenv>=1.1.1", "verifiers>=0.1.4", ] From 6f259d2c9cf506fdd16890f784e5bc9c3e3f715e Mon Sep 17 00:00:00 2001 From: Filip Michalsky Date: Thu, 11 Dec 2025 06:27:25 -0500 Subject: [PATCH 6/6] mcp update --- .gitignore | 6 ++++++ environments/mcp_env/mcp_env.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/.gitignore b/.gitignore index f920a7c4e..7f95c3c6d 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,9 @@ scratch/ .vscode/ *.swp .DS_Store + +# Node modules +node_modules/ + +# Browser env generated files +environments/browser_env/cua-server/node_modules/ diff --git a/environments/mcp_env/mcp_env.py b/environments/mcp_env/mcp_env.py index 3ec0da39d..e31e9f561 100644 --- a/environments/mcp_env/mcp_env.py +++ b/environments/mcp_env/mcp_env.py @@ -31,6 +31,9 @@ "--profile", os.getenv("SMITHERY_PROFILE", ""), ], + "env": { + "NPM_CONFIG_LOGLEVEL": "silent", + }, "description": "Exa MCP server", }, { @@ -52,6 +55,7 @@ "BROWSERBASE_API_KEY": os.getenv("BROWSERBASE_API_KEY", ""), "BROWSERBASE_PROJECT_ID": os.getenv("BROWSERBASE_PROJECT_ID", ""), "GEMINI_API_KEY": os.getenv("GEMINI_API_KEY", ""), + "NPM_CONFIG_LOGLEVEL": "silent", }, "description": "Browserbase MCP (via npx)", },