Changes from all commits
30 commits
cf27804
mcp env transports
cdreetz Nov 18, 2025
2ce55e3
mcp env kinda done
cdreetz Nov 18, 2025
895db50
buncha new mcp env examples
cdreetz Nov 18, 2025
cbb763b
make mcp env lazy imprtable
cdreetz Nov 18, 2025
b1453f8
Fix MCP environment imports and transport handling
cdreetz Nov 19, 2025
ffaf523
Merge pull request #564 from PrimeIntellect-ai/codex/review-and-fix-v…
cdreetz Nov 19, 2025
9c91156
new stdio mcp env working
cdreetz Nov 19, 2025
019ba78
readme
cdreetz Nov 19, 2025
06fb010
replace sse w streamablehttpclient
cdreetz Nov 25, 2025
4116365
synthetic transport
cdreetz Dec 1, 2025
66fd979
sandbox transport port exposure
cdreetz Dec 11, 2025
29a98cb
remove background arg from exc command
cdreetz Dec 11, 2025
820ae81
idk bro
cdreetz Dec 12, 2025
ad3d068
add sandbox debug logging
cdreetz Dec 12, 2025
21053b5
install node in sandbox start command
cdreetz Dec 12, 2025
5bf0bb1
god bless sandbox mcp env works
cdreetz Dec 15, 2025
0cd29c9
cleenuped
cdreetz Dec 15, 2025
e7bd0c6
better sandbox example
cdreetz Dec 15, 2025
5ae3494
sandboxmcp readme
cdreetz Dec 15, 2025
eb7601e
mcp env png
cdreetz Dec 15, 2025
da02c45
temp png path
cdreetz Dec 15, 2025
69e77b8
temp png path
cdreetz Dec 15, 2025
fcb351c
mcpenv png
cdreetz Dec 15, 2025
60d84a8
organize mcp env stuff
cdreetz Dec 15, 2025
77183a3
organize mcp env stuff
cdreetz Dec 15, 2025
9ccffc5
organize mcp env stuff
cdreetz Dec 15, 2025
6a3efaf
refactor import paths for MCP utilities to improve module organization
cdreetz Feb 6, 2026
2c98efd
Add reward function to MCP environment for enhanced feedback
cdreetz Feb 6, 2026
97dee82
Enhance validate_config function to accept optional http_urls paramet…
cdreetz Feb 6, 2026
0118c03
Refactor MCP environment to remove connection_scope parameter and rel…
cdreetz Feb 6, 2026
51 changes: 51 additions & 0 deletions environments/mcp_envs/environments/http_mcp_env/README.md
@@ -0,0 +1,51 @@
# http-mcp-env

> Replace the placeholders below, then remove this callout.

### Overview
- **Environment ID**: `http-mcp-env`
- **Short description**: <one-sentence description>
- **Tags**: <comma-separated tags>

### Datasets
- **Primary dataset(s)**: <name(s) and brief description>
- **Source links**: <links>
- **Split sizes**: <train/eval counts>

### Task
- **Type**: <single-turn | multi-turn | tool use>
- **Parser**: <e.g., ThinkParser, XMLParser, custom>
- **Rubric overview**: <briefly list reward functions and key metrics>

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval http-mcp-env
```

Configure model and sampling:

```bash
uv run vf-eval http-mcp-env -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"key": "value"}' # env-specific args as JSON
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments
Document any supported environment arguments and their meaning. Example:

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `foo` | str | `"bar"` | What this controls |
| `max_examples` | int | `-1` | Limit on dataset size (use -1 for all) |

### Metrics
Summarize key metrics your rubric emits and how they’re interpreted.

| Metric | Meaning |
| ------ | ------- |
| `reward` | Main scalar reward (weighted sum of criteria) |
| `accuracy` | Exact match on target answer |

58 changes: 58 additions & 0 deletions environments/mcp_envs/environments/http_mcp_env/http_mcp_env.py
@@ -0,0 +1,58 @@
import verifiers as vf
from datasets import Dataset
import os
from dotenv import load_dotenv
from urllib.parse import urlencode

load_dotenv()


def get_remote_url():
    API_KEY = os.getenv("SMITHERY_API_KEY")
    PROFILE = os.getenv("SMITHERY_PROFILE")
    base_url = "https://server.smithery.ai/@smithery-ai/fetch/mcp"
    params = {"api_key": API_KEY, "profile": PROFILE}
    smithery_url = f"{base_url}?{urlencode(params)}"
    return smithery_url


def load_environment(**kwargs):
    remote_url = get_remote_url()
    ds = Dataset.from_dict(
        {
            "question": [
                "Find out what Prime Intellect's newest announcement was from their website, give me the headline in 2 words. Their url is primeintellect.ai",
            ],
            "answer": ["ENVIRONMENTS HUB"],  # Or whatever the actual top result is
        }
    )

    rub = vf.JudgeRubric(judge_model="gpt-4.1-mini")

    async def judge_reward(judge, prompt, completion, answer, state):
        judge_response = await judge(prompt, completion, answer, state)
        return 1.0 if "yes" in judge_response.lower() else 0.0

    rub.add_reward_func(judge_reward, weight=1.0)

    env = vf.MCPEnv(
        mcp_servers=[
            {
                "name": "fetch-mcp-server-http",
                "command": "node",  # Not used for HTTP transport, but required by MCPServerConfig
                "args": [],
            }
        ],
        transport_type="http",
        http_urls={
            "fetch-mcp-server-http": remote_url
        },
        http_timeout=60.0,
        http_max_retries=3,
        dataset=ds,
        rubric=rub,
        max_turns=10,
        **kwargs
    )

    return env
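As a sanity check before running `vf-eval`, a hypothetical pre-flight snippet (not part of this diff) can confirm that the two Smithery variables `get_remote_url()` reads are actually set:

```python
import os
from dotenv import load_dotenv

load_dotenv()  # picks up a local .env, same as http_mcp_env.py does
missing = [k for k in ("SMITHERY_API_KEY", "SMITHERY_PROFILE") if not os.getenv(k)]
if missing:
    raise SystemExit(f"Set {', '.join(missing)} in your environment or .env before running vf-eval")
```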
23 changes: 23 additions & 0 deletions environments/mcp_envs/environments/http_mcp_env/pyproject.toml
@@ -0,0 +1,23 @@
[project]
name = "http-mcp-env"
description = "Your environment description here"
tags = ["placeholder-tag", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.10"
dependencies = [
"verifiers",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["http_mcp_env.py", "pyproject.toml"]

[tool.verifiers.eval]
num_examples = 5
rollouts_per_example = 3

[tool.uv.sources]
verifiers = { path = "../../../../" }
57 changes: 57 additions & 0 deletions environments/mcp_envs/environments/sandbox_mcp_env/README.md
@@ -0,0 +1,57 @@
# sandbox-mcp-env

1. Define the MCP server or servers you want to use.
2. During setup, the environment will:
   1. Create a sandbox for the rollout and expose a port.
   2. Create transport(s) for the MCP servers, which provide the interface for using each server.
   3. Run any commands required to set up the MCP server.
   4. Start the server in StreamableHTTP mode.
   5. Register the MCP server's available tools.
3. The rollout then proceeds, and the agent can make MCP tool calls that interact safely within the sandbox (see the sketch below).
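
A minimal sketch of that flow in a `load_environment` (mirroring `sandbox_mcp_env.py` further down in this PR; the server config is illustrative, and `my_dataset` / `my_rubric` are placeholders):

```python
import verifiers as vf
from verifiers.utils.mcp_utils.models import MCPServerConfig

env = vf.MCPEnv(
    mcp_servers=[
        MCPServerConfig(
            name="everything-mcp",
            command="npx",
            args=["@modelcontextprotocol/server-everything", "streamableHttp"],
            env={"PORT": "8000"},  # port the server listens on inside the sandbox
            setup_commands=["npm install -g @modelcontextprotocol/server-everything@latest"],
        )
    ],
    transport_type="sandbox",
    sandbox_image="python:3.11-slim",
    sandbox_port_to_expose=8000,  # must match the port the MCP server listens on
    dataset=my_dataset,
    rubric=my_rubric,
)
```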

### Overview
- **Environment ID**: `sandbox-mcp-env`
- **Short description**: MCPEnv backed by sandboxed Streamable HTTP MCP servers
- **Tags**: mcp, sandbox

### Datasets
- **Primary dataset(s)**: NA
- **Source links**: NA
- **Split sizes**: NA

### Task
- **Type**: tool use
- **Parser**: NA
- **Rubric overview**: NA

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval sandbox-mcp-env -n 1 -r 1
```

Configure model and sampling:

```bash
uv run vf-eval sandbox-mcp-env -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"key": "value"}' # env-specific args as JSON
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments
Demo

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `max_examples` | int | `-1` | Limit on dataset size (use -1 for all) |

### Metrics
Demo

| Metric | Meaning |
| ------ | ------- |
| `reward` | Main scalar reward (weighted sum of criteria) |
| `accuracy` | Exact match on target answer |

24 changes: 24 additions & 0 deletions environments/mcp_envs/environments/sandbox_mcp_env/pyproject.toml
@@ -0,0 +1,24 @@
[project]
name = "sandbox-mcp-env"
description = "Your environment description here"
tags = ["placeholder-tag", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.10"
dependencies = [
"verifiers>=0.1.7.post0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["sandbox_mcp_env.py", "pyproject.toml"]

[tool.verifiers.eval]
num_examples = 5
rollouts_per_example = 3

[tool.uv.sources]
verifiers = { path = "../../../.." }

61 changes: 61 additions & 0 deletions environments/mcp_envs/environments/sandbox_mcp_env/sandbox_mcp_env.py
@@ -0,0 +1,61 @@
import verifiers as vf
from verifiers.utils.mcp_utils.models import MCPServerConfig
from datasets import Dataset


def load_environment(**kwargs):
    ds = Dataset.from_dict(
        {
            "question": [
                "Check out what tools are available and try one that looks interesting to you",
            ],
            "answer": ["Hello World"],
        }
    )

    rubric = vf.JudgeRubric(judge_model="gpt-4.1-mini")

    async def judge_reward(judge, prompt, completion, answer, state):
        judge_response = await judge(prompt, completion, answer, state)
        return 1.0 if "yes" in judge_response.lower() else 0.0

    rubric.add_reward_func(judge_reward, weight=1.0)

    env = vf.MCPEnv(
        mcp_servers=[
            MCPServerConfig(
                name="everything-mcp",
                command="npx",
                args=[
                    "@modelcontextprotocol/server-everything",
                    "streamableHttp",
                ],
                env={
                    "PORT": "8000",
                },
                setup_commands=[
                    "apt update",
                    "apt upgrade -y",
                    "apt install -y git curl",
                    "curl -fsSL https://deb.nodesource.com/setup_lts.x | bash -",
                    "apt-get install -y nodejs",
                    "npm install -g @modelcontextprotocol/server-everything@latest",
                ],
            )
        ],
        transport_type="sandbox",
        sandbox_image="python:3.11-slim",
        sandbox_start_command="tail -f /dev/null",
        sandbox_cpu_cores=1,
        sandbox_memory_gb=2,
        sandbox_disk_size_gb=5,
        sandbox_timeout_minutes=15,
        sandbox_port_to_expose=8000,  # Port the MCP server listens on
        # Standard env options
        dataset=ds,
        rubric=rubric,
        max_turns=10,
        **kwargs
    )

    return env
51 changes: 51 additions & 0 deletions environments/mcp_envs/environments/stdio_mcp_env/README.md
@@ -0,0 +1,51 @@
# stdio-mcp-env

> Replace the placeholders below, then remove this callout.

### Overview
- **Environment ID**: `stdio-mcp-env`
- **Short description**: <one-sentence description>
- **Tags**: <comma-separated tags>

### Datasets
- **Primary dataset(s)**: <name(s) and brief description>
- **Source links**: <links>
- **Split sizes**: <train/eval counts>

### Task
- **Type**: <single-turn | multi-turn | tool use>
- **Parser**: <e.g., ThinkParser, XMLParser, custom>
- **Rubric overview**: <briefly list reward functions and key metrics>

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval stdio-mcp-env
```

Configure model and sampling:

```bash
uv run vf-eval stdio-mcp-env -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"key": "value"}' # env-specific args as JSON
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments
Document any supported environment arguments and their meaning. Example:

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `foo` | str | `"bar"` | What this controls |
| `max_examples` | int | `-1` | Limit on dataset size (use -1 for all) |

### Metrics
Summarize key metrics your rubric emits and how they’re interpreted.

| Metric | Meaning |
| ------ | ------- |
| `reward` | Main scalar reward (weighted sum of criteria) |
| `accuracy` | Exact match on target answer |

23 changes: 23 additions & 0 deletions environments/mcp_envs/environments/stdio_mcp_env/pyproject.toml
@@ -0,0 +1,23 @@
[project]
name = "stdio-mcp-env"
description = "Your environment description here"
tags = ["placeholder-tag", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.10"
dependencies = [
"verifiers>=0.1.7.post0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["stdio_mcp_env.py", "pyproject.toml"]

[tool.verifiers.eval]
num_examples = 5
rollouts_per_example = 3

[tool.uv.sources]
verifiers = { path = "../../../../" }
32 changes: 32 additions & 0 deletions environments/mcp_envs/environments/stdio_mcp_env/stdio_mcp_env.py
@@ -0,0 +1,32 @@
import verifiers as vf
from datasets import Dataset


def load_environment(**kwargs):
    ds = Dataset.from_dict(
        {
            "question": [
                "Find out what Prime Intellect's newest announcement was from their website, give me the headline in 2 words. Their url is primeintellect.ai",
            ],
            "answer": ["ENVIRONMENTS HUB"],
        }
    )

    rub = vf.JudgeRubric(judge_model="gpt-4.1-mini")

    async def judge_reward(judge, prompt, completion, answer, state):
        judge_response = await judge(prompt, completion, answer, state)
        return 1.0 if "yes" in judge_response.lower() else 0.0
cdreetz marked this conversation as resolved.

    rub.add_reward_func(judge_reward, weight=1.0)

    env = vf.MCPEnv(
        mcp_servers=[
            {"name": "fetch", "command": "uvx", "args": ["mcp-server-fetch"]}
        ],
        transport_type="stdio",
        dataset=ds,
        rubric=rub,
        max_turns=10,
    )
Review comment: Missing kwargs forwarding in stdio environment loader (Medium Severity)

The load_environment function accepts **kwargs but doesn't pass them to MCPEnv, unlike the http_mcp_env.py and sandbox_mcp_env.py counterparts, which correctly include **kwargs in the constructor call. This causes any additional arguments passed to load_environment to be silently ignored.

    return env
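
A minimal sketch of the forwarding the comment describes (illustrative, not part of this diff):

```python
env = vf.MCPEnv(
    mcp_servers=[
        {"name": "fetch", "command": "uvx", "args": ["mcp-server-fetch"]}
    ],
    transport_type="stdio",
    dataset=ds,
    rubric=rub,
    max_turns=10,
    **kwargs,  # forward extra arguments, matching http_mcp_env.py and sandbox_mcp_env.py
)
```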
Binary file added public/mcp-envs.png