6 changes: 1 addition & 5 deletions environments/math_python/math_python.py
@@ -24,7 +24,7 @@ def load_environment(

parser = vf.Parser(extract_fn=extract_boxed_answer)
math_rubric = vf.MathRubric(parser=parser)
vf_env = vf.PythonEnv(
return vf.PythonEnv(
dataset=dataset,
system_prompt=system_prompt,
parser=parser,
@@ -42,7 +42,3 @@ def load_environment(
timeout_per_command_seconds=sandbox_timeout_per_command_seconds,
**kwargs,
)
assert vf_env.tools is not None
tool_rubric = vf.ToolRubric(tools=vf_env.tools)
vf_env.rubric = vf.RubricGroup(rubrics=[tool_rubric, vf_env.rubric])
return vf_env
2 changes: 1 addition & 1 deletion tests/test_env_group.py
@@ -47,7 +47,7 @@ def func3(completion, **kwargs):

assert rubric.env_map == env_map
# Should have all unique reward function names
assert set(rubric.all_reward_names) == {"func1", "func2", "func3"}
assert set(rubric.all_reward_names) == {"num_turns", "func1", "func2", "func3"}

@pytest.mark.asyncio
async def test_env_group_rubric_score_rollout(self, mock_openai_client):
2 changes: 2 additions & 0 deletions verifiers/__init__.py
@@ -28,6 +28,7 @@
from .parsers.think_parser import ThinkParser
from .parsers.xml_parser import XMLParser
from .rubrics.judge_rubric import JudgeRubric
from .rubrics.monitor_rubric import MonitorRubric
from .rubrics.rubric_group import RubricGroup
from .rubrics.tool_rubric import ToolRubric
from .utils.data_utils import (
@@ -85,6 +86,7 @@ def setup_logging(
"JudgeRubric",
"RubricGroup",
"ToolRubric",
"MonitorRubric",
"MathRubric",
"TextArenaEnv",
"ReasoningGymEnv",
6 changes: 6 additions & 0 deletions verifiers/envs/environment.py
@@ -267,6 +267,12 @@ def add_task(example):
dataset = dataset.map(add_task, **map_kwargs)
return dataset

def add_rubric(self, rubric: Rubric) -> None:
if self.rubric is None:
self.rubric = rubric
else:
self.rubric = vf.RubricGroup(rubrics=[self.rubric, rubric])
Member

if self.rubric is already a RubricGroup, maybe we should do self.rubric.add_rubric?
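A minimal sketch of that suggestion (illustrative only; it assumes RubricGroup has, or would gain, an add_rubric method, which this diff does not add):

def add_rubric(self, rubric: Rubric) -> None:
    if self.rubric is None:
        self.rubric = rubric
    elif isinstance(self.rubric, vf.RubricGroup):
        # Extend the existing group in place instead of nesting groups.
        self.rubric.add_rubric(rubric)
    else:
        self.rubric = vf.RubricGroup(rubrics=[self.rubric, rubric])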


def format_dataset(
self,
dataset: Dataset,
7 changes: 7 additions & 0 deletions verifiers/envs/multiturn_env.py
@@ -4,6 +4,7 @@
from openai import AsyncOpenAI

import verifiers as vf
from verifiers.rubrics.monitor_rubric import MonitorRubric
from verifiers.types import (
Messages,
ModelResponse,
@@ -22,10 +23,16 @@
logger = logging.getLogger(__name__)


class MultiTurnMonitorRubric(MonitorRubric):
Member

small nit -- what's the need for a distinct MultiTurnMonitorRubric class? All env classes currently extend MultiTurnEnv already, and build up the trajectory (even if single-turn)

Member

Risks other extensions (e.g. SandboxMonitorRubric) not being multi-turn compatible by default, which seems undesirable.

def __init__(self):
super().__init__(state_keys=[("trajectory", "num_turns", len)])
Member

this feels a bit restrictive/unintuitive + not totally seeing the value of the state_keys approach.

we can already do:

async def num_turns(state) -> float:
    return len(state['trajectory'])

which is fairly concise + avoids the need for a new pattern

could also be worth adding add_metric to Rubric which behaves just like add_reward_func but with default weight 0?
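For reference, a minimal sketch of the suggested add_metric helper, assuming Rubric.add_reward_func accepts a weight keyword as the comment implies (names and placement are hypothetical):

def add_metric(self, func, weight: float = 0.0) -> None:
    # Same registration path as add_reward_func, but defaulting to weight 0
    # so the value is logged per rollout without affecting the reward.
    self.add_reward_func(func, weight=weight)

async def num_turns(state) -> float:
    return len(state["trajectory"])

rubric.add_metric(num_turns)  # logged only; contributes nothing to the scalar reward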



class MultiTurnEnv(vf.Environment):
def __init__(self, max_turns: int = -1, **kwargs):
super().__init__(**kwargs)
self.max_turns = max_turns
self.add_rubric(MultiTurnMonitorRubric())

async def setup_state(self, state: State) -> State:
return state
26 changes: 21 additions & 5 deletions verifiers/envs/python_env.py
@@ -18,6 +18,16 @@
class PythonWorkerState(TypedDict):
ready: bool
execution_count: int
ready_wait_time: float


class PythonMonitorRubric(vf.MonitorRubric):
def __init__(self):
super().__init__(
state_keys=[
("python_state.ready_wait_time", "python_ready_wait_time"),
]
)


class PythonWorkerNotReadyError(vf.SandboxError): ...
@@ -184,6 +194,7 @@ def __init__(
start_command=start_command,
**kwargs,
)
self.add_rubric(PythonMonitorRubric())
self.add_tool(
self.python, args_to_skip=["sandbox_id", "sandbox_state", "python_state"]
)
@@ -224,7 +235,7 @@ async def python(
) -> str:
"""Execute `code` inside persistent Python REPL."""
if not python_state["ready"]:
await self._wait_for_worker_ready(sandbox_state, sandbox_id)
await self._wait_for_worker_ready(sandbox_id, sandbox_state, python_state)
python_state["ready"] = True
sandbox_response = await self._send_worker_request(
sandbox_id, sandbox_state, {"code": code}
@@ -236,7 +247,10 @@ async def cleanup_python_state(self, state: vf.State):
state.pop("python_state", None)

async def _wait_for_worker_ready(
self, sandbox_state: SandboxState, sandbox_id: str
self,
sandbox_id: str,
sandbox_state: SandboxState,
python_state: PythonWorkerState,
) -> None:
s = time.time()
try:
@@ -248,11 +262,13 @@ async def _wait_for_worker_ready(
)
if result.exit_code != 0:
raise RuntimeError(result.stderr)
self.logger.debug(
f"Waited {time.time() - s:.1f}s for Python worker to be ready"
)
except Exception as e:
raise PythonWorkerNotReadyError from e
ready_wait_time = time.time() - s
python_state["ready_wait_time"] = ready_wait_time
self.logger.debug(
f"Waited {ready_wait_time:.1f}s for Python worker to be ready"
)

async def _send_worker_request(
self, sandbox_id: str, sandbox_state, payload: dict[str, Any]
38 changes: 33 additions & 5 deletions verifiers/envs/sandbox_env.py
@@ -89,6 +89,22 @@ def teardown(self, wait: bool = True) -> None:

class SandboxState(TypedDict):
ready: bool
ready_wait_time: float
command_execution_times: list[float]


class SandboxMonitorRubric(vf.MonitorRubric):
Member

Is this serving a different role than MultiTurnMonitorRubric? is the idea to have many monitor rubrics which track new things introduced at different hierarchies?

def __init__(self):
super().__init__(
state_keys=[
("sandbox_state.ready_wait_time", "sandbox_ready_wait_time"),
(
"sandbox_state.command_execution_times",
"sandbox_command_execution_time",
lambda x: sum(x) / len(x) if len(x) > 0 else 0.0,
),
]
)


class SandboxCreationError(vf.SandboxError): ...
@@ -127,6 +143,7 @@ def __init__(
stop_errors=stop_errors if stop_errors is not None else [vf.SandboxError],
**kwargs,
)
self.add_rubric(SandboxMonitorRubric())
self.timeout_per_command_seconds = timeout_per_command_seconds
self.sandbox_client = ThreadedAsyncSandboxClient(
max_workers=sandbox_client_max_workers,
@@ -173,7 +190,9 @@ async def _wait_for_sandbox_ready(
sandbox_state["ready"] = True
except Exception as e:
raise SandboxNotReadyError(e)
self.logger.debug(f"Waited {time.time() - s:.1f}s for sandbox to be ready")
ready_wait_time = time.time() - s
sandbox_state["ready_wait_time"] = ready_wait_time
self.logger.debug(f"Waited {ready_wait_time:.1f}s for sandbox to be ready")

async def bash(
self,
@@ -197,13 +216,16 @@ async def bash(
timeout=self.timeout_per_command_seconds,
)
except CommandTimeoutError:
e = time.time()
timeout_msg = f"Command timed out after {self.timeout_per_command_seconds}s"
self.logger.warning(f"{timeout_msg} in sandbox {sandbox_id}")
sandbox_state["command_execution_times"].append(
self.timeout_per_command_seconds
)
return f"Error: {timeout_msg}"
except Exception as e:
raise vf.SandboxError from e
e = time.time()
command_execution_time = time.time() - s
sandbox_state["command_execution_times"].append(command_execution_time)
stdout = results.stdout.strip()
stderr = (results.stderr or "").strip()
combined = stdout
@@ -213,7 +235,9 @@ async def bash(
else:
combined = f"stderr:\n{stderr}"
output = combined or "(no output)"
self.logger.debug(f"Executed command in {e - s:.1f}s. Got output: {output}")
self.logger.debug(
f"Executed command in {command_execution_time:.1f}s. Got output: {output}"
)
return output

async def post_rollout(self, state: vf.State):
@@ -252,7 +276,11 @@ async def setup_state(self, state: vf.State, **kwargs) -> vf.State:
self.active_sandboxes.add(sandbox.id)
self.logger.debug(f"Created sandbox {sandbox.id}")
state["sandbox_id"] = sandbox.id
state["sandbox_state"] = {"ready": False}
state["sandbox_state"] = {
"ready": False,
"ready_wait_time": None,
"command_execution_times": [],
}
return await super().setup_state(state, **kwargs)

def update_tool_args(
2 changes: 2 additions & 0 deletions verifiers/envs/tool_env.py
@@ -4,6 +4,7 @@
from openai.types.chat import ChatCompletionAssistantMessageParam

import verifiers as vf
from verifiers.rubrics.tool_rubric import ToolRubric
from verifiers.utils.async_utils import maybe_await
from verifiers.utils.tool_utils import convert_func_to_oai_tool

@@ -27,6 +28,7 @@ def __init__(
for tool in self.tools
}
super().__init__(oai_tools=self.oai_tools, max_turns=max_turns, **kwargs)
self.add_rubric(ToolRubric(tools=self.tools))

def _should_stop_for_error(self, err: Exception) -> bool:
"""Check if error is in stop_errors."""
52 changes: 52 additions & 0 deletions verifiers/rubrics/monitor_rubric.py
@@ -0,0 +1,52 @@
from typing import Callable

from verifiers.rubrics.rubric import Rubric
from verifiers.types import State

StateKey = str
RenamedStateKey = tuple[StateKey, str]
RenamedTransformedStateKey = tuple[StateKey, str, Callable[..., float]]


class MonitorRubric(Rubric):
"""Simple rubric that reads values from the state for logging."""

def __init__(
self,
state_keys: list[StateKey | RenamedStateKey | RenamedTransformedStateKey]
| None = None,
):
self.state_keys: list[
StateKey | RenamedStateKey | RenamedTransformedStateKey
] = state_keys or []

reward_funcs = []
for state_key in self.state_keys:
if isinstance(state_key, str):
reward_func = self.get_read_from_state(state_key)
else:
reward_func = self.get_read_from_state(*state_key) # type: ignore
reward_funcs.append(reward_func)
reward_weights = [0.0] * len(self.state_keys) # only for logging

# pass them to parent class
super().__init__(funcs=reward_funcs, weights=reward_weights)

def get_read_from_state(
self,
key: str,
name: str | None = None,
transform: Callable[..., float] = float,
) -> Callable:
"""Create a reward function that reads from the state."""

async def read_from_state(state: State) -> float:
key_parts = key.split(".")
for key_part in key_parts[:-1]:
state = state.get(key_part, {})
value = state.get(key_parts[-1], 0.0)
return transform(value)

read_from_state.__name__ = name if name is not None else key

return read_from_state
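For reference, a minimal usage sketch of the three accepted state_keys forms (plain key, renamed key, renamed key with a transform). The "retries" key is illustrative; the other two entries mirror keys introduced elsewhere in this diff:

import verifiers as vf

rubric = vf.MonitorRubric(
    state_keys=[
        "retries",  # logged under its own name, cast with the default float()
        ("sandbox_state.ready_wait_time", "sandbox_ready_wait_time"),  # dotted keys traverse nested dicts
        ("trajectory", "num_turns", len),  # custom transform instead of float()
    ]
)
# Every entry becomes a weight-0 reward function, so the values are logged per
# rollout without contributing to the scalar reward.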