From a8f6e495a2a2709d8c8b550be56c068889874bcb Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Tue, 4 Nov 2025 01:15:09 -0800 Subject: [PATCH 1/5] Add dynamic sparse rewards --- PR.md | 209 ++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + verifiers/envs/env_group.py | 102 ++++++++++++++++- verifiers/envs/environment.py | 30 +++++ verifiers/rubrics/rubric.py | 38 ++++++- verifiers/types.py | 15 +++ verifiers/utils/eval_utils.py | 48 +++++++- 7 files changed, 432 insertions(+), 11 deletions(-) create mode 100644 PR.md diff --git a/PR.md b/PR.md new file mode 100644 index 000000000..45088e9f9 --- /dev/null +++ b/PR.md @@ -0,0 +1,209 @@ +## Overview + +This PR implements sparse metrics / rubrics, which enables mathematically correct averaging in multi-domain environments. The key change heere is selective averaging that excludes irrelevant zero values, solving the domain dilution problem in composite evaluation environments. + +In environments like [`ProfBench`](https://arxiv.org/pdf/2510.18941), domain-specific scores get mixed with irrelevant zeros, making the averages misleading. + +**Example Issue:** +Evaluating GPT-4 on 12 tasks: 10 physics + 2 chemistry tasks + +``` +physics_reward: [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, 0, 0] # zeros for chemistry tasks +chemistry_reward: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 76] # zeros for physics tasks +``` + +- **Before**: `physics_reward: avg - 56.2` (diluted by irrelevant zeros) +- **Before**: `chemistry_reward: avg - 13.7` (misleading!) + +After, + +``` +physics_reward: [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, -, -] # zeros for chemistry tasks +chemistry_reward: [-, -, -, -, -, -, -, -, -, -, 88, 76] # zeros for physics tasks +``` + +- **After**: `chemistry_reward: avg - 82.0 (relevant: 2/12)` (actual chemistry skill) +- **After**: `physics_reward: avg - 66.2 (relevant: 10/12)` (pure physics performance) + +Which can all be done now within an `EnvGroup` with `enable_sparse_metrics=True`. + +we can now + +1. mark irrelevant values as sparse during scoring +2. exclude sparse values from averaging calculations +3. display sparsity clearly with `-` instead of `0.0` +4. maintain backwards compatibility with existing environments + +## Core + +### 1. type extensions @ `types.py` + +**New Fields Added:** + +```python +class RolloutScore(BaseModel): + sparse_metrics: set[str] | None = Field(default=None) + # set of metric names to exclude from averaging for this rollout + +class RolloutScores(BaseModel): + sparse_metrics: dict[str, list[bool]] | None = Field(default=None) + # per-rolout exclusion flags for batch scoring + +class GenerateOutputs(BaseModel): + sparse_metrics: dict[str, list[bool]] | None = Field(default=None) + # final sparse tracking for evaluation results +``` + +THis tracks which metric values should be excluded from averaging calculations. + +### 2. 
Environment Sparse Tracking @ `envs/environment.py` + +**Key Changes:** +- **Initialize sparse flags** for all metrics during interleaved scoring +- **Track sparse metrics** from rubric scoring results +- **Conditionally assign** sparse_metrics only if sparsity detected (backwards compatible) + +```python +# Initialize sparse tracking +sparse_flags: dict[str, list[bool]] = {name: [False] * n for name in reward_func_names} + +# Process sparse flags from scoring +if rs.sparse_metrics: + for sparse_key in rs.sparse_metrics: + sparse_flags[sparse_key][i] = True + +# Only add if sparsity detected (backwards compatible) +if any(any(flags) for flags in sparse_flags.values()): + results.sparse_metrics = sparse_flags +``` + +this collects and aggregates sparse metadata during evaluation execution. + +### 3. Batch Scoring with Sparse Handling @ `rubrics/rubric.py` + +**Key Changes:** +- **Collect all metric keys** across rollouts (handles mixed metrics) +- **Fill missing metrics** with 0.0 and mark as sparse +- **Track sparsity flags** from individual rollout scores +- **Return sparse metadata** only if sparsity detected + +```python +# Handle missing metrics as sparse +if k in reward.metrics: + metrics[k].append(reward.metrics[k]) + is_sparse = reward.sparse_metrics and k in reward.sparse_metrics + sparse_flags[k].append(is_sparse) +else: + # Missing metric -> sparse 0.0 + metrics[k].append(0.0) + sparse_flags[k].append(True) +``` + +ensure consistent metric structure while preserving sparsity information. + +### 4. EnvGroup Sparse Architecture @ `envs/env_group.py`) + +**New Class: `EnvGroupSparseRubric`** + +Extends standard `EnvGroupRubric` with domain-specific sparse marking: + +```python +class EnvGroupSparseRubric(EnvGroupRubric): + async def score_rollout(self, ...): + # Route to domain-specific environment + env_results = await env.rubric.score_rollout(...) + + # Mark uncomputed metrics as sparse + uncomputed_metrics = set(all_rewards) - set(env_results.metrics.keys()) + sparse_metrics = uncomputed_metrics if uncomputed_metrics else None + + return RolloutScore(sparse_metrics=sparse_metrics, ...) +``` + +**Activation Logic:** +```python +# Key decision point for sparse metrics +if enable_sparse_metrics: + rubric = EnvGroupSparseRubric(self.env_map) # Sparse-aware +else: + rubric = EnvGroupRubric(self.env_map) # Standard (backwards compatible) +``` + +automatically mark domain-specific metrics as sparse when irrelevant. + +### 5. Sparse-Aware Display @ `utils/eval_utils.py` + +**Selective Averaging:** +```python +# Filter out sparse values before averaging +sparse_flags = results.sparse_metrics[k] +relevant_values = [val for val, is_sparse in zip(v, sparse_flags) if not is_sparse] + +if relevant_values: + avg = sum(relevant_values) / len(relevant_values) + sparsity_info = f" (relevant: {len(relevant_values)}/{len(v)})" + print(f"{k}: avg - {avg:.3f}{sparsity_info}") +else: + print(f"{k}: no relevant data (all values sparse)") +``` + +**Enhanced Display:** +```python +# Show "-" for sparse values instead of misleading 0.0 +if sparse_flags[idx]: + trials.append("-") # Sparse (excluded from averaging) +else: + trials.append(round(v[idx], 3)) # Actual computed value +``` + +provide mathematically correct averages and clear visual distinction of sparsity. 
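
Taken together, the whole pipeline reduces to a simple idea: keep a boolean flag per metric per rollout, drop flagged values before averaging, and render them as `-`. A minimal standalone sketch (using the toy numbers from the Overview above, not the actual `eval_utils.py` code path; the variable names here are illustrative only) shows how the sparse flags change both the averages and the printed rows:

```python
import numpy as np

# Toy values from the Overview example: 10 physics tasks followed by 2 chemistry tasks.
metrics = {
    "physics_reward": [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, 0, 0],
    "chemistry_reward": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 76],
}
# True = sparse: the metric does not apply to that rollout and is excluded from averaging.
sparse_metrics = {
    "physics_reward": [False] * 10 + [True] * 2,
    "chemistry_reward": [True] * 10 + [False] * 2,
}

for name, values in metrics.items():
    flags = sparse_metrics[name]
    # selective averaging: keep only relevant values instead of averaging misleading zeros
    relevant = [v for v, is_sparse in zip(values, flags) if not is_sparse]
    # display: "-" marks sparse values so excluded entries are visually obvious
    row = ["-" if is_sparse else v for v, is_sparse in zip(values, flags)]
    if relevant:
        avg = sum(relevant) / len(relevant)
        std = float(np.std(relevant))
        print(f"{name}: avg - {avg:.1f}, std - {std:.1f} (relevant: {len(relevant)}/{len(values)})")
    else:
        print(f"{name}: no relevant data (all values sparse)")
    print(f"r1: {row}")
```

Running this prints `physics_reward: avg - 66.2 ... (relevant: 10/12)` and `chemistry_reward: avg - 82.0 ... (relevant: 2/12)`, with `-` in place of the irrelevant zeros, matching the numbers in the Overview.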
+ +## Usage + +```python +# Standard behavior (backwards compatible) +env = vf.EnvGroup(envs, names) # Standard averaging + +# Sparse metrics enabled +env = vf.EnvGroup(envs, names, enable_sparse_metrics=True) # Selective averaging +``` + +```python +def load_environment(enable_sparse_metrics: bool = True): + return vf.EnvGroup( + envs=domain_envs, + env_names=domain_names, + enable_sparse_metrics=enable_sparse_metrics + ) +``` + +## To Test: + +To test sparse metrics with ProfBench: + +1. **Pull the ProfBench environment changes:** + ```bash + git clone https://github.com/vxnuaj/prime-environments.git -b vxnuaj/profbench + cd prime-environments + ``` + +2. **Pull this verifiers PR with sparse metrics implementation** + +3. **Install verifiers in editable mode:** + ```bash + cd verifiers + uv pip install -e . + ``` + +4. **Run evaluation to see sparse metrics in action:** + ```bash + vf-eval -s profbench -m gpt-4.1-mini --env-args '{"judge_model": "openai/gpt-4.1-mini"}' -n 12 -r 1 + # -n must be >= 10 for sparsity to be detected, as if we do less, then profbench only loads from the first domain ( i believe physics or chemistry ) + # feel free to do -r x \in R^n + ``` + +**Expected output:** +- Domain-specific averages (e.g., `chemistry_phd_reward: avg - 72.9 (relevant: 2/12)`) +- Sparse values displayed as `-` instead of `0.0` +- Mathematically correct averages excluding irrelevant domain scores + diff --git a/pyproject.toml b/pyproject.toml index f0953afbc..81acfc07d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ "tomli; python_version < '3.11'", "prime-sandboxes>=0.1.0", "wget>=3.2", + "torch>=2.8.0", ] [dependency-groups] diff --git a/verifiers/envs/env_group.py b/verifiers/envs/env_group.py index da9b4d108..c07a9f2eb 100644 --- a/verifiers/envs/env_group.py +++ b/verifiers/envs/env_group.py @@ -83,6 +83,88 @@ async def score_rollout( return RolloutScore(reward=reward, metrics=metrics) +class EnvGroupSparseRubric(EnvGroupRubric): + """ + enhanced EnvGroup rubric with domain-specific sparse tracking. + + this rubric extends EnvGroupRubric to support sparse metrics for multi-domain environments. + when routing scoring to domain-specific environments, it automatically marks metrics + that weren't computed by the target environment as sparse (excluded from averaging). + + Key differences from standard EnvGroupRubric: + - marks uncomputed domain metrics as sparse (e.g., chemistry_reward=0.0 becomes sparse) + - enables mathematically correct domain averaging by excluding irrelevant zeros + - Only used when EnvGroup is initialized with enable_sparse_metrics=True + + Example: For a chemistry task in ProfBench, physics/finance/consulting rewards are marked + sparse, ensuring chemistry_reward averages only over actual chemistry evaluations. + """ + + async def score_rollout( + self, + prompt: str | list[ChatMessage], + completion: str | list[ChatMessage], + answer: str = "", + state: State | None = None, + task: str = "default", + info: dict | None = None, + example_id: int | None = None, + **kwargs, + ) -> RolloutScore: + """ + Route scoring with sparse metrics support for multi-domain environments. + + This method handles scoring by: + 1. Routing the task to the appropriate domain-specific environment + 2. Computing metrics using that environment's rubric + 3. Filling uncomputed metrics with 0.0 and marking them as sparse + 4. Returning results with sparse flags for proper averaging + + Only used when EnvGroup has enable_sparse_metrics=True. 
+ """ + state = state or {} + info = info or {} + + # pre-initialize all known metrics to 0.0 + # this ensures consistent metric structure across all rollouts + # uncomputed metrics will remain 0.0 and be marked sparse + metrics = {name: 0.0 for name in self.all_reward_names} + reward = 0.0 + + # Route to appropriate domain environment based on task + env = self.env_map.get(task) + if env is None: + self.logger.warning(f"No environment found for task '{task}'") + return RolloutScore(reward=reward, metrics=metrics) + + # Score using the domain-specific environment's rubric + # this computes only the metrics relevant to this domain + env_results = await env.rubric.score_rollout( + prompt, completion, answer, state, task, info, example_id, **kwargs + ) + + # update metrics with computed values from domain environment + # metrics not computed by this environment remain at 0.0 + for reward_name, score in env_results.metrics.items(): + if reward_name in metrics: + metrics[reward_name] = score + + # mark uncomputed metrics as sparse for exclusion from averaging + # example: for chemistry task, physics/finance/consulting rewards marked sparse + # this enables mathematically correct domain averaging + uncomputed_metrics = set(self.all_reward_names) - set(env_results.metrics.keys()) + sparse_metrics = uncomputed_metrics if uncomputed_metrics else None + + # Overall reward comes from the domain environment + reward = env_results.reward + + return RolloutScore( + reward=reward, + metrics=metrics, + sparse_metrics=sparse_metrics + ) + + class EnvGroup(Environment): """ Environment group that acts as a mixture of multiple environments. @@ -91,7 +173,8 @@ class EnvGroup(Environment): """ def __init__( - self, envs: list[Environment], env_names: list[str] | None = None, **kwargs + self, envs: list[Environment], env_names: list[str] | None = None, + enable_sparse_metrics: bool = False, **kwargs ): """ Initialize EnvGroup with a list of environments. @@ -100,6 +183,7 @@ def __init__( envs: list of Environment instances env_names: Optional list of names for each environment. If not provided, uses "env_0", "env_1", etc. + enable_sparse_metrics: Enable sparse metrics for mathematically correct domain averaging **kwargs: Additional arguments passed to parent Environment """ if not envs: @@ -133,10 +217,18 @@ def add_task(example): eval_datasets.append(env_eval_dataset) dataset = concatenate_datasets(datasets) if datasets else None eval_dataset = concatenate_datasets(eval_datasets) if eval_datasets else None - # wrap rubrics - rubric = EnvGroupRubric(self.env_map) - - # Don't set oai_tools at the group level since different sub-environments + # choose rubric type based on enable_sparse_metrics flag + # this is the key decision point for sparse metrics activation + if enable_sparse_metrics: + # use sparse-aware rubric that marks uncomputed domain metrics as sparse + # enables mathematically correct averaging by excluding irrelevant zeros + rubric = EnvGroupSparseRubric(self.env_map) + else: + # use standard rubric that includes all values in averaging (backwards compatible) + # this preserves existing behavior for environments without sparse metrics + rubric = EnvGroupRubric(self.env_map) + + # don't set oai_tools at the group level since different sub-environments # may have different tools. Instead, set them per-task in rollout(). 
# initialize parent Environment super().__init__( diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py index 9e9e39158..83e0c6af5 100644 --- a/verifiers/envs/environment.py +++ b/verifiers/envs/environment.py @@ -605,6 +605,12 @@ async def generate( if interleave_scoring and score_rollouts: # interleaved pipeline: separate semaphores for generation and scoring # pre-allocate metrics using known reward function names + reward_func_names = self.rubric.get_reward_func_names() + sparse_flags: dict[str, list[bool]] = {name: [False] * n for name in reward_func_names} + # ^^ initialize sparse tracking flags for each metric + # sparse_flags tracks which rollout values should be excluded from averaging + # Initially all values are marked as relevant (False = not sparse) + # Rubrics can mark specific rollouts as sparse during scoring maybe_gen_sem = generation_semaphore or ( semaphore or await maybe_semaphore(gen_limit) ) @@ -658,7 +664,19 @@ async def run_one(i: int) -> None: # ensure key exists in case of EnvGroup/RubricGroup if k not in results.metrics: results.metrics[k] = [0.0] * n + sparse_flags[k] = [False] * n + # ^^ initialize sparse flags for dynamically discovered metrics results.metrics[k][i] = v + + # process sparse metric flags from rubric scoring + # when a rubric marks certain metrics as sparse for this rollout, + # we set the corresponding sparse flags to True to exclude them from averaging + if rs.sparse_metrics: + for sparse_key in rs.sparse_metrics: + if sparse_key not in sparse_flags: + # handle metrics marked sparse that weren't pre-allocated + sparse_flags[sparse_key] = [False] * n + sparse_flags[sparse_key][i] = True num_completed += 1 if save_every > 0 and num_completed % save_every == 0: self.logger.debug(f"Saving results to {results_path}") @@ -718,6 +736,9 @@ async def run_one(i: int) -> None: ) results.reward = rollout_scores.reward results.metrics = rollout_scores.metrics + # pass through sparse_metrics if present + if hasattr(rollout_scores, 'sparse_metrics') and rollout_scores.sparse_metrics: + results.sparse_metrics = rollout_scores.sparse_metrics else: results.reward = [] results.metrics = {} @@ -739,6 +760,15 @@ async def run_one(i: int) -> None: results.metadata.avg_reward = avg_reward results.metadata.avg_metrics = avg_metrics + # conditionally add sparse tracking to results + # only include sparse_metrics if: + # 1. We're using interleaved scoring (where sparse tracking occurs) + # 2. Score rollouts is enabled (metrics are being computed) + # 3. 
At least one metric has sparse values (maintains backwards compatibility) + # this ensures existing environments without sparse metrics remain unchanged + if interleave_scoring and score_rollouts and any(any(flags) for flags in sparse_flags.values()): + results.sparse_metrics = sparse_flags + return results # alias for backward compatibility diff --git a/verifiers/rubrics/rubric.py b/verifiers/rubrics/rubric.py index 1fd0220b7..f9b1ced53 100644 --- a/verifiers/rubrics/rubric.py +++ b/verifiers/rubrics/rubric.py @@ -262,11 +262,43 @@ async def score_rollouts( return RolloutScores( reward=[], metrics={name: [] for name in reward_func_names}, + # return sparse tracking only if needed (this is backwrds compatible) + sparse_metrics={name: [] for name in reward_func_names} if any(r.sparse_metrics for r in rewards if r.sparse_metrics) else None ) + # collect all possible metric keys across all rollouts + # this handles cases where different rollouts may have different metrics + # (e.g., multi-domain environments where some metrics don't apply to all tasks) + all_metric_keys = set() + for reward in rewards: + all_metric_keys.update(reward.metrics.keys()) + + # build unified metrics dict with sparse tracking + # ensures all metric keys are present in all rollout results, filling missing + # values with 0.0 and marking them as sparse (excluded from averaging) + metrics = {} + sparse_flags = {} + for k in all_metric_keys: + metrics[k] = [] + sparse_flags[k] = [] + + for reward in rewards: + if k in reward.metrics: + # metric computed for this rollout - include the actual value + metrics[k].append(reward.metrics[k]) + # check if rubric marked this metric as sparse for this rollout + is_sparse = reward.sparse_metrics and k in reward.sparse_metrics + sparse_flags[k].append(is_sparse) + else: + # metric not computed for this rollout - fill with sparse 0.0 + # this handles domain-specific metrics that don't apply to all tasks + metrics[k].append(0.0) + sparse_flags[k].append(True) + return RolloutScores( reward=[reward.reward for reward in rewards], - metrics={ - k: [item.metrics[k] for item in rewards] for k in rewards[0].metrics - }, + metrics=metrics, + # only include sparse_metrics if at least one metric has sparse values + # this maintains backwards compatibility - environments without sparse metrics get None + sparse_metrics=sparse_flags if any(any(flags) for flags in sparse_flags.values()) else None ) diff --git a/verifiers/types.py b/verifiers/types.py index daa3eb780..eeef7a091 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -86,6 +86,13 @@ class GenerateOutputs(BaseModel): reward: list[float] metrics: dict[str, list[float]] = Field(default_factory=dict) metadata: GenerateMetadata + sparse_metrics: dict[str, list[bool]] | None = Field(default=None) + # ^^ pptional sparse tracking for multi-domain environments + # When present, sparse_metrics[metric_name] indicates which rollout values should be + # excluded from averaging (e.g., domain-specific metrics evaluated on irrelevant tasks). + # True = sparse (exclude from average), False = relevant (include in average) + # Example: chemistry_reward=[50.0, 0.0, 75.0] with sparse_metrics={"chemistry_reward": [False, True, False]} + # would average to 62.5 instead of 41.7, excluding the irrelevant 0.0 score. 
class RolloutScore(BaseModel): @@ -93,6 +100,10 @@ class RolloutScore(BaseModel): reward: float metrics: dict[str, float] = Field(default_factory=dict) + sparse_metrics: set[str] | None = Field(default=None) + # ^^ set of metric names that should be excluded from averaging for this rollout + # Used by rubrics to mark domain-specific metrics as irrelevant for certain tasks + # Example: {"chemistry_reward", "physics_reward"} when evaluating a finance task class RolloutScores(BaseModel): @@ -100,6 +111,10 @@ class RolloutScores(BaseModel): reward: list[float] metrics: dict[str, list[float]] = Field(default_factory=dict) + sparse_metrics: dict[str, list[bool]] | None = Field(default=None) + # ^^ per-rollout exclusion flags for batch scoring + # Maps metric names to lists of boolean flags (True = sparse, False = relevant) + # Length matches the rollout lists in reward/metrics. Aggregated from individual RolloutScore.sparse_metrics class ProcessedOutputs(BaseModel): diff --git a/verifiers/utils/eval_utils.py b/verifiers/utils/eval_utils.py index 3c2c18d38..2a9187f56 100644 --- a/verifiers/utils/eval_utils.py +++ b/verifiers/utils/eval_utils.py @@ -89,10 +89,52 @@ def print_results(results: GenerateOutputs, num_samples: int = 1): print(out) for k in results.metrics: v = results.metrics[k] - print(f"{k}: avg - {sum(v) / len(v):.3f}, std - {np.std(v):.3f}") + + # selective averaging that excludes sparse values + # only average over relevant (non-sparse) values + # instead of including misleading zeros in the calculation + if hasattr(results, 'sparse_metrics') and results.sparse_metrics and k in results.sparse_metrics: + # filter out sparse values from averaging calculation + # sparse_flags[i] = True means exclude rollout i from averaging + sparse_flags = results.sparse_metrics[k] + relevant_values = [val for val, is_sparse in zip(v, sparse_flags) if not is_sparse] + + if relevant_values: + # calculate statistics over only the relevant (non-sparse) values + # this gives mathematically correct domain-specific averages + avg = sum(relevant_values) / len(relevant_values) + std = np.std(relevant_values) + sparsity_info = f" (relevant: {len(relevant_values)}/{len(v)})" + print(f"{k}: avg - {avg:.3f}, std - {std:.3f}{sparsity_info}") + else: + # all values marked sparse - no relevant data to average + print(f"{k}: no relevant data (all values sparse)") + else: + # standard averaging for non-sparse metrics (backwards compatible) + # this preserves existing behavior for environments without sparse metrics + print(f"{k}: avg - {sum(v) / len(v):.3f}, std - {np.std(v):.3f}") + + # enhanced rollout display that shows sparsity clearly + # Instead of showing misleading 0.0 values, display "-" for sparse metrics + # This makes it immediately obvious which rollouts are relevant vs excluded for i in range(r): - # rounded to 3 decimal places - trials = [round(v[(i * n) + j], 3) for j in range(n)] + if hasattr(results, 'sparse_metrics') and results.sparse_metrics and k in results.sparse_metrics: + # For sparse metrics: "-" indicates sparse (irrelevant), numbers show actual values + # This visual distinction prevents confusion about which values contribute to averages + sparse_flags = results.sparse_metrics[k] + trials = [] + for j in range(n): + idx = (i * n) + j + if sparse_flags[idx]: + # sparse value - show "-" instead of 0.0 to indicate exclusion from averaging + trials.append("-") + else: + # non-sparse value - show actual computed score + trials.append(round(v[idx], 3)) + else: + # standard rollout printing for 
non-sparse metrics (backwards compatible) + # all values shown as numbers since none are excluded from averaging + trials = [round(v[(i * n) + j], 3) for j in range(n)] out = f"r{i + 1}: {trials}" print(out) From fe41304772b4d68d408b24f4d5a3f436c15c483c Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Tue, 4 Nov 2025 01:21:31 -0800 Subject: [PATCH 2/5] edit PR.md --- PR.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PR.md b/PR.md index 45088e9f9..174629152 100644 --- a/PR.md +++ b/PR.md @@ -187,7 +187,10 @@ To test sparse metrics with ProfBench: cd prime-environments ``` -2. **Pull this verifiers PR with sparse metrics implementation** +2. **Pull this verifiers fork / pr with sparse metrics implementation:** + ```bash + git clone https://github.com/vxnuaj/verifiers.git -b vxnuaj/dynamic-sparse-rewards + ``` 3. **Install verifiers in editable mode:** ```bash From 849b1c18c09c73fe73e7bd880c36be56b4be4c31 Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Tue, 4 Nov 2025 01:26:34 -0800 Subject: [PATCH 3/5] ruff, pre-commit, pytest --- GITHUB_PR_TEMPLATE.md | 66 +++++++++++++++++++++++++++++++++++ environments/math/math.py | 4 +-- verifiers/envs/env_group.py | 31 ++++++++-------- verifiers/envs/environment.py | 21 +++++++---- verifiers/rubrics/rubric.py | 10 ++++-- verifiers/types.py | 2 +- verifiers/utils/eval_utils.py | 28 ++++++++++----- 7 files changed, 127 insertions(+), 35 deletions(-) create mode 100644 GITHUB_PR_TEMPLATE.md diff --git a/GITHUB_PR_TEMPLATE.md b/GITHUB_PR_TEMPLATE.md new file mode 100644 index 000000000..0966ff899 --- /dev/null +++ b/GITHUB_PR_TEMPLATE.md @@ -0,0 +1,66 @@ +## Description +Add sparse metrics support for mathematically correct domain averaging in multi-domain environments. This feature enables selective averaging that excludes irrelevant zero values, solving the domain dilution problem in composite evaluation environments like ProfBench. + +**Key improvements:** +- Chemistry domain: `avg - 72.9 (relevant: 2/12)` instead of diluted `avg - 12.3` +- Physics domain: `avg - 66.2 (relevant: 10/12)` instead of diluted `avg - 56.2` +- Visual distinction: Shows `-` for sparse values instead of misleading `0.0` + +## Type of Change +- [ ] Bug fix (non-breaking change which fixes an issue) +- [x] New feature (non-breaking change which adds functionality) +- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) +- [ ] Documentation update +- [ ] Test improvement + +## Testing +- [ ] All existing tests pass when running `uv run pytest` locally. 
+- [ ] New tests have been added to cover the changes + +**Manual Testing:** +Tested with ProfBench environment showing correct sparse metrics behavior: +- Domain-specific averages exclude irrelevant metrics +- Sparse values display as `-` in output +- `(relevant: X/Y)` info shows sparsity clearly + +## Checklist +- [ ] My code follows the style guidelines of this project as outlined in [AGENTS.md](https://github.com/PrimeIntellect-ai/verifiers/blob/main/AGENTS.md) +- [x] I have performed a self-review of my own code +- [x] I have commented my code, particularly in hard-to-understand areas +- [ ] I have made corresponding changes to the documentation +- [ ] My changes generate no new warnings +- [ ] Any dependent changes have been merged and published + +## Additional Notes + +### Implementation Overview +- **Types**: Added `sparse_metrics` fields to `RolloutScore`, `RolloutScores`, and `GenerateOutputs` +- **Environment**: Sparse tracking during interleaved scoring in `generate()` method +- **Rubrics**: Batch scoring with missing metrics marked as sparse in `score_rollouts()` +- **EnvGroup**: New `EnvGroupSparseRubric` class with `enable_sparse_metrics=True` opt-in +- **Display**: Sparse-aware averaging and `-` display in `eval_utils.py` + +### Backwards Compatibility +✅ Zero breaking changes - all existing environments work unchanged +✅ Opt-in only - sparse metrics activate only with `enable_sparse_metrics=True` +✅ Default behavior preserved - standard averaging remains identical + +### Testing Instructions +To test with ProfBench: +```bash +# 1. Clone ProfBench with sparse support +# ( this is a env. bounty that is in progrss of being ipmlemented, which needed this PR ) +git clone https://github.com/vxnuaj/prime-environments.git -b vxnuaj/profbench +cd prime-environments + +# 2. Clone this verifiers fork / pr branch +git clone https://github.com/vxnuaj/verifiers.git -b vxnuaj/dynamic-sparse-rewards +cd verifiers + +# 3. Install and test +cd .. +uv pip install -e . +vf-eval -s profbench -m gpt-4.1-mini --env-args '{"judge_model": "openai/gpt-4.1-mini"}' -n 12 -r 1 +``` + +**Expected output:** Domain averages like `chemistry_phd_reward: avg - 72.9 (relevant: 2/12)` with `-` showing sparse values. diff --git a/environments/math/math.py b/environments/math/math.py index f63e6cc02..cff3579d0 100644 --- a/environments/math/math.py +++ b/environments/math/math.py @@ -2,7 +2,7 @@ def load_environment(**kwargs) -> vf.Environment: - ''' + """ Loads a custom environment. - ''' + """ raise NotImplementedError("Implement your custom environment here.") diff --git a/verifiers/envs/env_group.py b/verifiers/envs/env_group.py index 0d56ca25b..db60b2a48 100644 --- a/verifiers/envs/env_group.py +++ b/verifiers/envs/env_group.py @@ -87,20 +87,20 @@ async def score_rollout( class EnvGroupSparseRubric(EnvGroupRubric): """ enhanced EnvGroup rubric with domain-specific sparse tracking. - + this rubric extends EnvGroupRubric to support sparse metrics for multi-domain environments. - when routing scoring to domain-specific environments, it automatically marks metrics + when routing scoring to domain-specific environments, it automatically marks metrics that weren't computed by the target environment as sparse (excluded from averaging). 
- + Key differences from standard EnvGroupRubric: - marks uncomputed domain metrics as sparse (e.g., chemistry_reward=0.0 becomes sparse) - enables mathematically correct domain averaging by excluding irrelevant zeros - Only used when EnvGroup is initialized with enable_sparse_metrics=True - + Example: For a chemistry task in ProfBench, physics/finance/consulting rewards are marked sparse, ensuring chemistry_reward averages only over actual chemistry evaluations. """ - + async def score_rollout( self, prompt: str | list[ChatMessage], @@ -114,13 +114,13 @@ async def score_rollout( ) -> RolloutScore: """ Route scoring with sparse metrics support for multi-domain environments. - + This method handles scoring by: 1. Routing the task to the appropriate domain-specific environment - 2. Computing metrics using that environment's rubric + 2. Computing metrics using that environment's rubric 3. Filling uncomputed metrics with 0.0 and marking them as sparse 4. Returning results with sparse flags for proper averaging - + Only used when EnvGroup has enable_sparse_metrics=True. """ state = state or {} @@ -153,16 +153,16 @@ async def score_rollout( # mark uncomputed metrics as sparse for exclusion from averaging # example: for chemistry task, physics/finance/consulting rewards marked sparse # this enables mathematically correct domain averaging - uncomputed_metrics = set(self.all_reward_names) - set(env_results.metrics.keys()) + uncomputed_metrics = set(self.all_reward_names) - set( + env_results.metrics.keys() + ) sparse_metrics = uncomputed_metrics if uncomputed_metrics else None # Overall reward comes from the domain environment reward = env_results.reward return RolloutScore( - reward=reward, - metrics=metrics, - sparse_metrics=sparse_metrics + reward=reward, metrics=metrics, sparse_metrics=sparse_metrics ) @@ -174,8 +174,11 @@ class EnvGroup(Environment): """ def __init__( - self, envs: list[Environment], env_names: list[str] | None = None, - enable_sparse_metrics: bool = False, **kwargs + self, + envs: list[Environment], + env_names: list[str] | None = None, + enable_sparse_metrics: bool = False, + **kwargs, ): """ Initialize EnvGroup with a list of environments. 
diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py index 83e0c6af5..c68b7ad5b 100644 --- a/verifiers/envs/environment.py +++ b/verifiers/envs/environment.py @@ -606,7 +606,9 @@ async def generate( # interleaved pipeline: separate semaphores for generation and scoring # pre-allocate metrics using known reward function names reward_func_names = self.rubric.get_reward_func_names() - sparse_flags: dict[str, list[bool]] = {name: [False] * n for name in reward_func_names} + sparse_flags: dict[str, list[bool]] = { + name: [False] * n for name in reward_func_names + } # ^^ initialize sparse tracking flags for each metric # sparse_flags tracks which rollout values should be excluded from averaging # Initially all values are marked as relevant (False = not sparse) @@ -664,10 +666,10 @@ async def run_one(i: int) -> None: # ensure key exists in case of EnvGroup/RubricGroup if k not in results.metrics: results.metrics[k] = [0.0] * n - sparse_flags[k] = [False] * n + sparse_flags[k] = [False] * n # ^^ initialize sparse flags for dynamically discovered metrics results.metrics[k][i] = v - + # process sparse metric flags from rubric scoring # when a rubric marks certain metrics as sparse for this rollout, # we set the corresponding sparse flags to True to exclude them from averaging @@ -737,7 +739,10 @@ async def run_one(i: int) -> None: results.reward = rollout_scores.reward results.metrics = rollout_scores.metrics # pass through sparse_metrics if present - if hasattr(rollout_scores, 'sparse_metrics') and rollout_scores.sparse_metrics: + if ( + hasattr(rollout_scores, "sparse_metrics") + and rollout_scores.sparse_metrics + ): results.sparse_metrics = rollout_scores.sparse_metrics else: results.reward = [] @@ -763,10 +768,14 @@ async def run_one(i: int) -> None: # conditionally add sparse tracking to results # only include sparse_metrics if: # 1. We're using interleaved scoring (where sparse tracking occurs) - # 2. Score rollouts is enabled (metrics are being computed) + # 2. Score rollouts is enabled (metrics are being computed) # 3. 
At least one metric has sparse values (maintains backwards compatibility) # this ensures existing environments without sparse metrics remain unchanged - if interleave_scoring and score_rollouts and any(any(flags) for flags in sparse_flags.values()): + if ( + interleave_scoring + and score_rollouts + and any(any(flags) for flags in sparse_flags.values()) + ): results.sparse_metrics = sparse_flags return results diff --git a/verifiers/rubrics/rubric.py b/verifiers/rubrics/rubric.py index f9b1ced53..c0d08f789 100644 --- a/verifiers/rubrics/rubric.py +++ b/verifiers/rubrics/rubric.py @@ -263,7 +263,9 @@ async def score_rollouts( reward=[], metrics={name: [] for name in reward_func_names}, # return sparse tracking only if needed (this is backwrds compatible) - sparse_metrics={name: [] for name in reward_func_names} if any(r.sparse_metrics for r in rewards if r.sparse_metrics) else None + sparse_metrics={name: [] for name in reward_func_names} + if any(r.sparse_metrics for r in rewards if r.sparse_metrics) + else None, ) # collect all possible metric keys across all rollouts @@ -281,7 +283,7 @@ async def score_rollouts( for k in all_metric_keys: metrics[k] = [] sparse_flags[k] = [] - + for reward in rewards: if k in reward.metrics: # metric computed for this rollout - include the actual value @@ -300,5 +302,7 @@ async def score_rollouts( metrics=metrics, # only include sparse_metrics if at least one metric has sparse values # this maintains backwards compatibility - environments without sparse metrics get None - sparse_metrics=sparse_flags if any(any(flags) for flags in sparse_flags.values()) else None + sparse_metrics=sparse_flags + if any(any(flags) for flags in sparse_flags.values()) + else None, ) diff --git a/verifiers/types.py b/verifiers/types.py index eeef7a091..22b9bb7b2 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -88,7 +88,7 @@ class GenerateOutputs(BaseModel): metadata: GenerateMetadata sparse_metrics: dict[str, list[bool]] | None = Field(default=None) # ^^ pptional sparse tracking for multi-domain environments - # When present, sparse_metrics[metric_name] indicates which rollout values should be + # When present, sparse_metrics[metric_name] indicates which rollout values should be # excluded from averaging (e.g., domain-specific metrics evaluated on irrelevant tasks). 
# True = sparse (exclude from average), False = relevant (include in average) # Example: chemistry_reward=[50.0, 0.0, 75.0] with sparse_metrics={"chemistry_reward": [False, True, False]} diff --git a/verifiers/utils/eval_utils.py b/verifiers/utils/eval_utils.py index 2a9187f56..7211e6ec8 100644 --- a/verifiers/utils/eval_utils.py +++ b/verifiers/utils/eval_utils.py @@ -89,16 +89,22 @@ def print_results(results: GenerateOutputs, num_samples: int = 1): print(out) for k in results.metrics: v = results.metrics[k] - - # selective averaging that excludes sparse values + + # selective averaging that excludes sparse values # only average over relevant (non-sparse) values # instead of including misleading zeros in the calculation - if hasattr(results, 'sparse_metrics') and results.sparse_metrics and k in results.sparse_metrics: + if ( + hasattr(results, "sparse_metrics") + and results.sparse_metrics + and k in results.sparse_metrics + ): # filter out sparse values from averaging calculation # sparse_flags[i] = True means exclude rollout i from averaging sparse_flags = results.sparse_metrics[k] - relevant_values = [val for val, is_sparse in zip(v, sparse_flags) if not is_sparse] - + relevant_values = [ + val for val, is_sparse in zip(v, sparse_flags) if not is_sparse + ] + if relevant_values: # calculate statistics over only the relevant (non-sparse) values # this gives mathematically correct domain-specific averages @@ -113,12 +119,16 @@ def print_results(results: GenerateOutputs, num_samples: int = 1): # standard averaging for non-sparse metrics (backwards compatible) # this preserves existing behavior for environments without sparse metrics print(f"{k}: avg - {sum(v) / len(v):.3f}, std - {np.std(v):.3f}") - + # enhanced rollout display that shows sparsity clearly # Instead of showing misleading 0.0 values, display "-" for sparse metrics # This makes it immediately obvious which rollouts are relevant vs excluded for i in range(r): - if hasattr(results, 'sparse_metrics') and results.sparse_metrics and k in results.sparse_metrics: + if ( + hasattr(results, "sparse_metrics") + and results.sparse_metrics + and k in results.sparse_metrics + ): # For sparse metrics: "-" indicates sparse (irrelevant), numbers show actual values # This visual distinction prevents confusion about which values contribute to averages sparse_flags = results.sparse_metrics[k] @@ -127,10 +137,10 @@ def print_results(results: GenerateOutputs, num_samples: int = 1): idx = (i * n) + j if sparse_flags[idx]: # sparse value - show "-" instead of 0.0 to indicate exclusion from averaging - trials.append("-") + trials.append("-") else: # non-sparse value - show actual computed score - trials.append(round(v[idx], 3)) + trials.append(round(v[idx], 3)) else: # standard rollout printing for non-sparse metrics (backwards compatible) # all values shown as numbers since none are excluded from averaging From 95a954690d80aa056e6ba789cf532a43b20c1dfb Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Tue, 4 Nov 2025 01:40:55 -0800 Subject: [PATCH 4/5] remove uneeded files --- GITHUB_PR_TEMPLATE.md | 66 ------------- PR.md | 212 ------------------------------------------ 2 files changed, 278 deletions(-) delete mode 100644 GITHUB_PR_TEMPLATE.md delete mode 100644 PR.md diff --git a/GITHUB_PR_TEMPLATE.md b/GITHUB_PR_TEMPLATE.md deleted file mode 100644 index 0966ff899..000000000 --- a/GITHUB_PR_TEMPLATE.md +++ /dev/null @@ -1,66 +0,0 @@ -## Description -Add sparse metrics support for mathematically correct domain averaging in multi-domain environments. 
This feature enables selective averaging that excludes irrelevant zero values, solving the domain dilution problem in composite evaluation environments like ProfBench. - -**Key improvements:** -- Chemistry domain: `avg - 72.9 (relevant: 2/12)` instead of diluted `avg - 12.3` -- Physics domain: `avg - 66.2 (relevant: 10/12)` instead of diluted `avg - 56.2` -- Visual distinction: Shows `-` for sparse values instead of misleading `0.0` - -## Type of Change -- [ ] Bug fix (non-breaking change which fixes an issue) -- [x] New feature (non-breaking change which adds functionality) -- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) -- [ ] Documentation update -- [ ] Test improvement - -## Testing -- [ ] All existing tests pass when running `uv run pytest` locally. -- [ ] New tests have been added to cover the changes - -**Manual Testing:** -Tested with ProfBench environment showing correct sparse metrics behavior: -- Domain-specific averages exclude irrelevant metrics -- Sparse values display as `-` in output -- `(relevant: X/Y)` info shows sparsity clearly - -## Checklist -- [ ] My code follows the style guidelines of this project as outlined in [AGENTS.md](https://github.com/PrimeIntellect-ai/verifiers/blob/main/AGENTS.md) -- [x] I have performed a self-review of my own code -- [x] I have commented my code, particularly in hard-to-understand areas -- [ ] I have made corresponding changes to the documentation -- [ ] My changes generate no new warnings -- [ ] Any dependent changes have been merged and published - -## Additional Notes - -### Implementation Overview -- **Types**: Added `sparse_metrics` fields to `RolloutScore`, `RolloutScores`, and `GenerateOutputs` -- **Environment**: Sparse tracking during interleaved scoring in `generate()` method -- **Rubrics**: Batch scoring with missing metrics marked as sparse in `score_rollouts()` -- **EnvGroup**: New `EnvGroupSparseRubric` class with `enable_sparse_metrics=True` opt-in -- **Display**: Sparse-aware averaging and `-` display in `eval_utils.py` - -### Backwards Compatibility -✅ Zero breaking changes - all existing environments work unchanged -✅ Opt-in only - sparse metrics activate only with `enable_sparse_metrics=True` -✅ Default behavior preserved - standard averaging remains identical - -### Testing Instructions -To test with ProfBench: -```bash -# 1. Clone ProfBench with sparse support -# ( this is a env. bounty that is in progrss of being ipmlemented, which needed this PR ) -git clone https://github.com/vxnuaj/prime-environments.git -b vxnuaj/profbench -cd prime-environments - -# 2. Clone this verifiers fork / pr branch -git clone https://github.com/vxnuaj/verifiers.git -b vxnuaj/dynamic-sparse-rewards -cd verifiers - -# 3. Install and test -cd .. -uv pip install -e . -vf-eval -s profbench -m gpt-4.1-mini --env-args '{"judge_model": "openai/gpt-4.1-mini"}' -n 12 -r 1 -``` - -**Expected output:** Domain averages like `chemistry_phd_reward: avg - 72.9 (relevant: 2/12)` with `-` showing sparse values. diff --git a/PR.md b/PR.md deleted file mode 100644 index 174629152..000000000 --- a/PR.md +++ /dev/null @@ -1,212 +0,0 @@ -## Overview - -This PR implements sparse metrics / rubrics, which enables mathematically correct averaging in multi-domain environments. The key change heere is selective averaging that excludes irrelevant zero values, solving the domain dilution problem in composite evaluation environments. 
- -In environments like [`ProfBench`](https://arxiv.org/pdf/2510.18941), domain-specific scores get mixed with irrelevant zeros, making the averages misleading. - -**Example Issue:** -Evaluating GPT-4 on 12 tasks: 10 physics + 2 chemistry tasks - -``` -physics_reward: [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, 0, 0] # zeros for chemistry tasks -chemistry_reward: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 76] # zeros for physics tasks -``` - -- **Before**: `physics_reward: avg - 56.2` (diluted by irrelevant zeros) -- **Before**: `chemistry_reward: avg - 13.7` (misleading!) - -After, - -``` -physics_reward: [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, -, -] # zeros for chemistry tasks -chemistry_reward: [-, -, -, -, -, -, -, -, -, -, 88, 76] # zeros for physics tasks -``` - -- **After**: `chemistry_reward: avg - 82.0 (relevant: 2/12)` (actual chemistry skill) -- **After**: `physics_reward: avg - 66.2 (relevant: 10/12)` (pure physics performance) - -Which can all be done now within an `EnvGroup` with `enable_sparse_metrics=True`. - -we can now - -1. mark irrelevant values as sparse during scoring -2. exclude sparse values from averaging calculations -3. display sparsity clearly with `-` instead of `0.0` -4. maintain backwards compatibility with existing environments - -## Core - -### 1. type extensions @ `types.py` - -**New Fields Added:** - -```python -class RolloutScore(BaseModel): - sparse_metrics: set[str] | None = Field(default=None) - # set of metric names to exclude from averaging for this rollout - -class RolloutScores(BaseModel): - sparse_metrics: dict[str, list[bool]] | None = Field(default=None) - # per-rolout exclusion flags for batch scoring - -class GenerateOutputs(BaseModel): - sparse_metrics: dict[str, list[bool]] | None = Field(default=None) - # final sparse tracking for evaluation results -``` - -THis tracks which metric values should be excluded from averaging calculations. - -### 2. Environment Sparse Tracking @ `envs/environment.py` - -**Key Changes:** -- **Initialize sparse flags** for all metrics during interleaved scoring -- **Track sparse metrics** from rubric scoring results -- **Conditionally assign** sparse_metrics only if sparsity detected (backwards compatible) - -```python -# Initialize sparse tracking -sparse_flags: dict[str, list[bool]] = {name: [False] * n for name in reward_func_names} - -# Process sparse flags from scoring -if rs.sparse_metrics: - for sparse_key in rs.sparse_metrics: - sparse_flags[sparse_key][i] = True - -# Only add if sparsity detected (backwards compatible) -if any(any(flags) for flags in sparse_flags.values()): - results.sparse_metrics = sparse_flags -``` - -this collects and aggregates sparse metadata during evaluation execution. - -### 3. Batch Scoring with Sparse Handling @ `rubrics/rubric.py` - -**Key Changes:** -- **Collect all metric keys** across rollouts (handles mixed metrics) -- **Fill missing metrics** with 0.0 and mark as sparse -- **Track sparsity flags** from individual rollout scores -- **Return sparse metadata** only if sparsity detected - -```python -# Handle missing metrics as sparse -if k in reward.metrics: - metrics[k].append(reward.metrics[k]) - is_sparse = reward.sparse_metrics and k in reward.sparse_metrics - sparse_flags[k].append(is_sparse) -else: - # Missing metric -> sparse 0.0 - metrics[k].append(0.0) - sparse_flags[k].append(True) -``` - -ensure consistent metric structure while preserving sparsity information. - -### 4. 
EnvGroup Sparse Architecture @ `envs/env_group.py`) - -**New Class: `EnvGroupSparseRubric`** - -Extends standard `EnvGroupRubric` with domain-specific sparse marking: - -```python -class EnvGroupSparseRubric(EnvGroupRubric): - async def score_rollout(self, ...): - # Route to domain-specific environment - env_results = await env.rubric.score_rollout(...) - - # Mark uncomputed metrics as sparse - uncomputed_metrics = set(all_rewards) - set(env_results.metrics.keys()) - sparse_metrics = uncomputed_metrics if uncomputed_metrics else None - - return RolloutScore(sparse_metrics=sparse_metrics, ...) -``` - -**Activation Logic:** -```python -# Key decision point for sparse metrics -if enable_sparse_metrics: - rubric = EnvGroupSparseRubric(self.env_map) # Sparse-aware -else: - rubric = EnvGroupRubric(self.env_map) # Standard (backwards compatible) -``` - -automatically mark domain-specific metrics as sparse when irrelevant. - -### 5. Sparse-Aware Display @ `utils/eval_utils.py` - -**Selective Averaging:** -```python -# Filter out sparse values before averaging -sparse_flags = results.sparse_metrics[k] -relevant_values = [val for val, is_sparse in zip(v, sparse_flags) if not is_sparse] - -if relevant_values: - avg = sum(relevant_values) / len(relevant_values) - sparsity_info = f" (relevant: {len(relevant_values)}/{len(v)})" - print(f"{k}: avg - {avg:.3f}{sparsity_info}") -else: - print(f"{k}: no relevant data (all values sparse)") -``` - -**Enhanced Display:** -```python -# Show "-" for sparse values instead of misleading 0.0 -if sparse_flags[idx]: - trials.append("-") # Sparse (excluded from averaging) -else: - trials.append(round(v[idx], 3)) # Actual computed value -``` - -provide mathematically correct averages and clear visual distinction of sparsity. - -## Usage - -```python -# Standard behavior (backwards compatible) -env = vf.EnvGroup(envs, names) # Standard averaging - -# Sparse metrics enabled -env = vf.EnvGroup(envs, names, enable_sparse_metrics=True) # Selective averaging -``` - -```python -def load_environment(enable_sparse_metrics: bool = True): - return vf.EnvGroup( - envs=domain_envs, - env_names=domain_names, - enable_sparse_metrics=enable_sparse_metrics - ) -``` - -## To Test: - -To test sparse metrics with ProfBench: - -1. **Pull the ProfBench environment changes:** - ```bash - git clone https://github.com/vxnuaj/prime-environments.git -b vxnuaj/profbench - cd prime-environments - ``` - -2. **Pull this verifiers fork / pr with sparse metrics implementation:** - ```bash - git clone https://github.com/vxnuaj/verifiers.git -b vxnuaj/dynamic-sparse-rewards - ``` - -3. **Install verifiers in editable mode:** - ```bash - cd verifiers - uv pip install -e . - ``` - -4. 
**Run evaluation to see sparse metrics in action:** - ```bash - vf-eval -s profbench -m gpt-4.1-mini --env-args '{"judge_model": "openai/gpt-4.1-mini"}' -n 12 -r 1 - # -n must be >= 10 for sparsity to be detected, as if we do less, then profbench only loads from the first domain ( i believe physics or chemistry ) - # feel free to do -r x \in R^n - ``` - -**Expected output:** -- Domain-specific averages (e.g., `chemistry_phd_reward: avg - 72.9 (relevant: 2/12)`) -- Sparse values displayed as `-` instead of `0.0` -- Mathematically correct averages excluding irrelevant domain scores - From 12a0aadecb5e5ec03176ddeaa0fcf685c6b2b0cc Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Wed, 5 Nov 2025 11:19:58 -0800 Subject: [PATCH 5/5] remove uneeded pr bloat --- environments/math/math.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/math/math.py b/environments/math/math.py index cff3579d0..f63e6cc02 100644 --- a/environments/math/math.py +++ b/environments/math/math.py @@ -2,7 +2,7 @@ def load_environment(**kwargs) -> vf.Environment: - """ + ''' Loads a custom environment. - """ + ''' raise NotImplementedError("Implement your custom environment here.")