Cdreetz/mcp env v2 #566

base: main
**`README.md`** — http-mcp-env (new file, 51 lines)

# http-mcp-env

> Replace the placeholders below, then remove this callout.

### Overview
- **Environment ID**: `http-mcp-env`
- **Short description**: <one-sentence description>
- **Tags**: <comma-separated tags>

### Datasets
- **Primary dataset(s)**: <name(s) and brief description>
- **Source links**: <links>
- **Split sizes**: <train/eval counts>

### Task
- **Type**: <single-turn | multi-turn | tool use>
- **Parser**: <e.g., ThinkParser, XMLParser, custom>
- **Rubric overview**: <briefly list reward functions and key metrics>

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval http-mcp-env
```

Configure model and sampling:

```bash
uv run vf-eval http-mcp-env -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"key": "value"}'  # env-specific args as JSON
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments
Document any supported environment arguments and their meaning. Example:

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `foo` | str | `"bar"` | What this controls |
| `max_examples` | int | `-1` | Limit on dataset size (use `-1` for all) |

### Metrics
Summarize key metrics your rubric emits and how they're interpreted.

| Metric | Meaning |
| ------ | ------- |
| `reward` | Main scalar reward (weighted sum of criteria) |
| `accuracy` | Exact match on target answer |
**`http_mcp_env.py`** (new file, 58 lines)

```python
import os
from urllib.parse import urlencode

import verifiers as vf
from datasets import Dataset
from dotenv import load_dotenv

load_dotenv()


def get_remote_url():
    API_KEY = os.getenv("SMITHERY_API_KEY")
    PROFILE = os.getenv("SMITHERY_PROFILE")
    base_url = "https://server.smithery.ai/@smithery-ai/fetch/mcp"
    params = {"api_key": API_KEY, "profile": PROFILE}
    smithery_url = f"{base_url}?{urlencode(params)}"
    return smithery_url


def load_environment(**kwargs):
    remote_url = get_remote_url()
    ds = Dataset.from_dict(
        {
            "question": [
                "Find out what Prime Intellect's newest announcement was from their website, give me the headline in 2 words. Their url is primeintellect.ai",
            ],
            "answer": ["ENVIRONMENTS HUB"],  # Or whatever the actual top result is
        }
    )

    rub = vf.JudgeRubric(judge_model="gpt-4.1-mini")

    async def judge_reward(judge, prompt, completion, answer, state):
        judge_response = await judge(prompt, completion, answer, state)
        return 1.0 if "yes" in judge_response.lower() else 0.0

    rub.add_reward_func(judge_reward, weight=1.0)

    env = vf.MCPEnv(
        mcp_servers=[
            {
                "name": "fetch-mcp-server-http",
                "command": "node",  # Not used for HTTP transport, but required by MCPServerConfig
                "args": [],
            }
        ],
        transport_type="http",
        http_urls={"fetch-mcp-server-http": remote_url},
        http_timeout=60.0,
        http_max_retries=3,
        dataset=ds,
        rubric=rub,
        max_turns=10,
        **kwargs,
    )

    return env
```
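Note that `get_remote_url()` interpolates whatever `os.getenv` returns, so a missing key silently produces a URL containing the string `None`. A minimal hardening sketch (the `require_env` helper is hypothetical, not part of this PR):

```python
# Hypothetical hardening sketch (not part of this PR): fail fast when
# Smithery credentials are missing instead of building a URL with "None".
import os
from urllib.parse import urlencode


def require_env(name: str) -> str:
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


def get_remote_url() -> str:
    params = {
        "api_key": require_env("SMITHERY_API_KEY"),
        "profile": require_env("SMITHERY_PROFILE"),
    }
    return f"https://server.smithery.ai/@smithery-ai/fetch/mcp?{urlencode(params)}"
```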
**`pyproject.toml`** — http-mcp-env (new file, 23 lines)

```toml
[project]
name = "http-mcp-env"
description = "Your environment description here"
tags = ["placeholder-tag", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.10"
dependencies = [
    "verifiers",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["http_mcp_env.py", "pyproject.toml"]

[tool.verifiers.eval]
num_examples = 5
rollouts_per_example = 3

[tool.uv.sources]
verifiers = { path = "../../../../" }
```
**`README.md`** — sandbox-mcp-env (new file, 57 lines)

# sandbox-mcp-env

1. Define the MCP server or servers you want to use (see the sketch after this list)
2. Setup state will:
   1. Start by creating a sandbox for the rollout and exposing a port
   2. Then create transport(s) for the MCP servers, which provide the interface for using each server
   3. Run any commands required to provision the MCP server
   4. Run the server in StreamableHTTP mode
   5. Finally, register the MCP server's available tools
3. The rollout proceeds, and the agent can make MCP tool calls that interact safely within the sandbox
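A rough sketch of step 1, mirroring the loader later in this diff (only the npm install step from its provisioning commands is shown here):

```python
# Sketch mirroring the loader in this diff: one MCPServerConfig per server,
# pairing the launch command with the setup commands that provision it.
from verifiers.utils.mcp_utils.models import MCPServerConfig

server = MCPServerConfig(
    name="everything-mcp",
    command="npx",
    args=["@modelcontextprotocol/server-everything", "streamableHttp"],
    env={"PORT": "8000"},  # port the StreamableHTTP server listens on
    setup_commands=["npm install -g @modelcontextprotocol/server-everything@latest"],
)
```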
### Overview
- **Environment ID**: `sandbox-mcp-env`
- **Short description**: MCPEnv via sandboxed StreamableHTTP MCP servers
- **Tags**: mcp, sandbox

### Datasets
- **Primary dataset(s)**: N/A
- **Source links**: N/A
- **Split sizes**: N/A

### Task
- **Type**: tool use
- **Parser**: N/A
- **Rubric overview**: N/A

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval sandbox-mcp-env -n 1 -r 1
```

Configure model and sampling:

```bash
uv run vf-eval sandbox-mcp-env -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"key": "value"}'  # env-specific args as JSON
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments
Demo:

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `max_examples` | int | `-1` | Limit on dataset size (use `-1` for all) |

### Metrics
Demo:

| Metric | Meaning |
| ------ | ------- |
| `reward` | Main scalar reward (weighted sum of criteria) |
| `accuracy` | Exact match on target answer |
**`pyproject.toml`** — sandbox-mcp-env (new file, 24 lines)

```toml
[project]
name = "sandbox-mcp-env"
description = "Your environment description here"
tags = ["placeholder-tag", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.10"
dependencies = [
    "verifiers>=0.1.7.post0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["sandbox_mcp_env.py", "pyproject.toml"]

[tool.verifiers.eval]
num_examples = 5
rollouts_per_example = 3

[tool.uv.sources]
verifiers = { path = "../../../.." }
```
**`sandbox_mcp_env.py`** (new file, 61 lines)

```python
import verifiers as vf
from datasets import Dataset
from verifiers.utils.mcp_utils.models import MCPServerConfig


def load_environment(**kwargs):
    ds = Dataset.from_dict(
        {
            "question": [
                "Check out what tools are available and try one that looks interesting to you",
            ],
            "answer": ["Hello World"],
        }
    )

    rubric = vf.JudgeRubric(judge_model="gpt-4.1-mini")

    async def judge_reward(judge, prompt, completion, answer, state):
        judge_response = await judge(prompt, completion, answer, state)
        return 1.0 if "yes" in judge_response.lower() else 0.0

    rubric.add_reward_func(judge_reward, weight=1.0)

    env = vf.MCPEnv(
        mcp_servers=[
            MCPServerConfig(
                name="everything-mcp",
                command="npx",
                args=[
                    "@modelcontextprotocol/server-everything",
                    "streamableHttp",
                ],
                env={
                    "PORT": "8000",
                },
                setup_commands=[
                    # Provision Node.js inside the python:3.11-slim sandbox
                    "apt update",
                    "apt upgrade -y",
                    "apt install -y git curl",
                    "curl -fsSL https://deb.nodesource.com/setup_lts.x | bash -",
                    "apt-get install -y nodejs",
                    "npm install -g @modelcontextprotocol/server-everything@latest",
                ],
            )
        ],
        transport_type="sandbox",
        sandbox_image="python:3.11-slim",
        sandbox_start_command="tail -f /dev/null",
        sandbox_cpu_cores=1,
        sandbox_memory_gb=2,
        sandbox_disk_size_gb=5,
        sandbox_timeout_minutes=15,
        sandbox_port_to_expose=8000,  # Port the MCP server listens on
        # Standard env options
        dataset=ds,
        rubric=rubric,
        max_turns=10,
        **kwargs,
    )

    return env
```
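For reference, a minimal usage sketch (the module name is taken from the `include` list in this environment's `pyproject.toml`). Constructing the environment is cheap: per the flow described above, the sandbox itself is only created during setup state at rollout time.

```python
# Usage sketch: building the environment does not start a sandbox yet;
# the sandbox is created per rollout during setup state (see flow above).
from sandbox_mcp_env import load_environment

env = load_environment()
```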
**`README.md`** — stdio-mcp-env (new file, 51 lines)

# stdio-mcp-env

> Replace the placeholders below, then remove this callout.

### Overview
- **Environment ID**: `stdio-mcp-env`
- **Short description**: <one-sentence description>
- **Tags**: <comma-separated tags>

### Datasets
- **Primary dataset(s)**: <name(s) and brief description>
- **Source links**: <links>
- **Split sizes**: <train/eval counts>

### Task
- **Type**: <single-turn | multi-turn | tool use>
- **Parser**: <e.g., ThinkParser, XMLParser, custom>
- **Rubric overview**: <briefly list reward functions and key metrics>

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval stdio-mcp-env
```

Configure model and sampling:

```bash
uv run vf-eval stdio-mcp-env -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"key": "value"}'  # env-specific args as JSON
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments
Document any supported environment arguments and their meaning. Example:

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `foo` | str | `"bar"` | What this controls |
| `max_examples` | int | `-1` | Limit on dataset size (use `-1` for all) |

### Metrics
Summarize key metrics your rubric emits and how they're interpreted.

| Metric | Meaning |
| ------ | ------- |
| `reward` | Main scalar reward (weighted sum of criteria) |
| `accuracy` | Exact match on target answer |
**`pyproject.toml`** — stdio-mcp-env (new file, 23 lines)

```toml
[project]
name = "stdio-mcp-env"
description = "Your environment description here"
tags = ["placeholder-tag", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.10"
dependencies = [
    "verifiers>=0.1.7.post0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["stdio_mcp_env.py", "pyproject.toml"]

[tool.verifiers.eval]
num_examples = 5
rollouts_per_example = 3

[tool.uv.sources]
verifiers = { path = "../../../../" }
```
**`stdio_mcp_env.py`** (new file, 32 lines)

```python
import verifiers as vf
from datasets import Dataset


def load_environment(**kwargs):
    ds = Dataset.from_dict(
        {
            "question": [
                "Find out what Prime Intellect's newest announcement was from their website, give me the headline in 2 words. Their url is primeintellect.ai",
            ],
            "answer": ["ENVIRONMENTS HUB"],
        }
    )

    rub = vf.JudgeRubric(judge_model="gpt-4.1-mini")

    async def judge_reward(judge, prompt, completion, answer, state):
        judge_response = await judge(prompt, completion, answer, state)
        return 1.0 if "yes" in judge_response.lower() else 0.0

    rub.add_reward_func(judge_reward, weight=1.0)

    env = vf.MCPEnv(
        mcp_servers=[
            {"name": "fetch", "command": "uvx", "args": ["mcp-server-fetch"]}
        ],
        transport_type="stdio",
        dataset=ds,
        rubric=rub,
        max_turns=10,
    )

    return env
```

**Review comment** (Medium Severity) — Missing kwargs forwarding in stdio environment loader: `load_environment` accepts `**kwargs` but never forwards them to `vf.MCPEnv`, unlike the http and sandbox loaders in this PR.
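A minimal sketch of the fix the review comment points at, inside `load_environment`, matching how the http and sandbox loaders forward their kwargs:

```python
# Sketch of the suggested fix: forward **kwargs so caller-supplied options
# reach vf.MCPEnv, as the http and sandbox loaders in this PR already do.
env = vf.MCPEnv(
    mcp_servers=[
        {"name": "fetch", "command": "uvx", "args": ["mcp-server-fetch"]}
    ],
    transport_type="stdio",
    dataset=ds,
    rubric=rub,
    max_turns=10,
    **kwargs,
)
```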