From a8f6e495a2a2709d8c8b550be56c068889874bcb Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Tue, 4 Nov 2025 01:15:09 -0800 Subject: [PATCH 1/5] Add dynamic sparse rewards --- PR.md | 209 ++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + verifiers/envs/env_group.py | 102 ++++++++++++++++- verifiers/envs/environment.py | 30 +++++ verifiers/rubrics/rubric.py | 38 ++++++- verifiers/types.py | 15 +++ verifiers/utils/eval_utils.py | 48 +++++++- 7 files changed, 432 insertions(+), 11 deletions(-) create mode 100644 PR.md diff --git a/PR.md b/PR.md new file mode 100644 index 000000000..45088e9f9 --- /dev/null +++ b/PR.md @@ -0,0 +1,209 @@ +## Overview + +This PR implements sparse metrics / rubrics, which enables mathematically correct averaging in multi-domain environments. The key change heere is selective averaging that excludes irrelevant zero values, solving the domain dilution problem in composite evaluation environments. + +In environments like [`ProfBench`](https://arxiv.org/pdf/2510.18941), domain-specific scores get mixed with irrelevant zeros, making the averages misleading. + +**Example Issue:** +Evaluating GPT-4 on 12 tasks: 10 physics + 2 chemistry tasks + +``` +physics_reward: [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, 0, 0] # zeros for chemistry tasks +chemistry_reward: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 76] # zeros for physics tasks +``` + +- **Before**: `physics_reward: avg - 56.2` (diluted by irrelevant zeros) +- **Before**: `chemistry_reward: avg - 13.7` (misleading!) + +After, + +``` +physics_reward: [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, -, -] # zeros for chemistry tasks +chemistry_reward: [-, -, -, -, -, -, -, -, -, -, 88, 76] # zeros for physics tasks +``` + +- **After**: `chemistry_reward: avg - 82.0 (relevant: 2/12)` (actual chemistry skill) +- **After**: `physics_reward: avg - 66.2 (relevant: 10/12)` (pure physics performance) + +Which can all be done now within an `EnvGroup` with `enable_sparse_metrics=True`. + +we can now + +1. mark irrelevant values as sparse during scoring +2. exclude sparse values from averaging calculations +3. display sparsity clearly with `-` instead of `0.0` +4. maintain backwards compatibility with existing environments + +## Core + +### 1. type extensions @ `types.py` + +**New Fields Added:** + +```python +class RolloutScore(BaseModel): + sparse_metrics: set[str] | None = Field(default=None) + # set of metric names to exclude from averaging for this rollout + +class RolloutScores(BaseModel): + sparse_metrics: dict[str, list[bool]] | None = Field(default=None) + # per-rolout exclusion flags for batch scoring + +class GenerateOutputs(BaseModel): + sparse_metrics: dict[str, list[bool]] | None = Field(default=None) + # final sparse tracking for evaluation results +``` + +THis tracks which metric values should be excluded from averaging calculations. + +### 2. 
Environment Sparse Tracking @ `envs/environment.py` + +**Key Changes:** +- **Initialize sparse flags** for all metrics during interleaved scoring +- **Track sparse metrics** from rubric scoring results +- **Conditionally assign** sparse_metrics only if sparsity detected (backwards compatible) + +```python +# Initialize sparse tracking +sparse_flags: dict[str, list[bool]] = {name: [False] * n for name in reward_func_names} + +# Process sparse flags from scoring +if rs.sparse_metrics: + for sparse_key in rs.sparse_metrics: + sparse_flags[sparse_key][i] = True + +# Only add if sparsity detected (backwards compatible) +if any(any(flags) for flags in sparse_flags.values()): + results.sparse_metrics = sparse_flags +``` + +this collects and aggregates sparse metadata during evaluation execution. + +### 3. Batch Scoring with Sparse Handling @ `rubrics/rubric.py` + +**Key Changes:** +- **Collect all metric keys** across rollouts (handles mixed metrics) +- **Fill missing metrics** with 0.0 and mark as sparse +- **Track sparsity flags** from individual rollout scores +- **Return sparse metadata** only if sparsity detected + +```python +# Handle missing metrics as sparse +if k in reward.metrics: + metrics[k].append(reward.metrics[k]) + is_sparse = reward.sparse_metrics and k in reward.sparse_metrics + sparse_flags[k].append(is_sparse) +else: + # Missing metric -> sparse 0.0 + metrics[k].append(0.0) + sparse_flags[k].append(True) +``` + +ensure consistent metric structure while preserving sparsity information. + +### 4. EnvGroup Sparse Architecture @ `envs/env_group.py`) + +**New Class: `EnvGroupSparseRubric`** + +Extends standard `EnvGroupRubric` with domain-specific sparse marking: + +```python +class EnvGroupSparseRubric(EnvGroupRubric): + async def score_rollout(self, ...): + # Route to domain-specific environment + env_results = await env.rubric.score_rollout(...) + + # Mark uncomputed metrics as sparse + uncomputed_metrics = set(all_rewards) - set(env_results.metrics.keys()) + sparse_metrics = uncomputed_metrics if uncomputed_metrics else None + + return RolloutScore(sparse_metrics=sparse_metrics, ...) +``` + +**Activation Logic:** +```python +# Key decision point for sparse metrics +if enable_sparse_metrics: + rubric = EnvGroupSparseRubric(self.env_map) # Sparse-aware +else: + rubric = EnvGroupRubric(self.env_map) # Standard (backwards compatible) +``` + +automatically mark domain-specific metrics as sparse when irrelevant. + +### 5. Sparse-Aware Display @ `utils/eval_utils.py` + +**Selective Averaging:** +```python +# Filter out sparse values before averaging +sparse_flags = results.sparse_metrics[k] +relevant_values = [val for val, is_sparse in zip(v, sparse_flags) if not is_sparse] + +if relevant_values: + avg = sum(relevant_values) / len(relevant_values) + sparsity_info = f" (relevant: {len(relevant_values)}/{len(v)})" + print(f"{k}: avg - {avg:.3f}{sparsity_info}") +else: + print(f"{k}: no relevant data (all values sparse)") +``` + +**Enhanced Display:** +```python +# Show "-" for sparse values instead of misleading 0.0 +if sparse_flags[idx]: + trials.append("-") # Sparse (excluded from averaging) +else: + trials.append(round(v[idx], 3)) # Actual computed value +``` + +provide mathematically correct averages and clear visual distinction of sparsity. 
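
Taken together, the whole pipeline reduces to a simple idea: keep a boolean flag per metric per rollout, drop flagged values before averaging, and render them as `-`. A minimal standalone sketch (using the toy numbers from the Overview above, not the actual `eval_utils.py` code path; the variable names here are illustrative only) shows how the sparse flags change both the averages and the printed rows:

```python
import numpy as np

# Toy values from the Overview example: 10 physics tasks followed by 2 chemistry tasks.
metrics = {
    "physics_reward": [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, 0, 0],
    "chemistry_reward": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 76],
}
# True = sparse: the metric does not apply to that rollout and is excluded from averaging.
sparse_metrics = {
    "physics_reward": [False] * 10 + [True] * 2,
    "chemistry_reward": [True] * 10 + [False] * 2,
}

for name, values in metrics.items():
    flags = sparse_metrics[name]
    # selective averaging: keep only relevant values instead of averaging misleading zeros
    relevant = [v for v, is_sparse in zip(values, flags) if not is_sparse]
    # display: "-" marks sparse values so excluded entries are visually obvious
    row = ["-" if is_sparse else v for v, is_sparse in zip(values, flags)]
    if relevant:
        avg = sum(relevant) / len(relevant)
        std = float(np.std(relevant))
        print(f"{name}: avg - {avg:.1f}, std - {std:.1f} (relevant: {len(relevant)}/{len(values)})")
    else:
        print(f"{name}: no relevant data (all values sparse)")
    print(f"r1: {row}")
```

Running this prints `physics_reward: avg - 66.2 ... (relevant: 10/12)` and `chemistry_reward: avg - 82.0 ... (relevant: 2/12)`, with `-` in place of the irrelevant zeros, matching the numbers in the Overview.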
+ +## Usage + +```python +# Standard behavior (backwards compatible) +env = vf.EnvGroup(envs, names) # Standard averaging + +# Sparse metrics enabled +env = vf.EnvGroup(envs, names, enable_sparse_metrics=True) # Selective averaging +``` + +```python +def load_environment(enable_sparse_metrics: bool = True): + return vf.EnvGroup( + envs=domain_envs, + env_names=domain_names, + enable_sparse_metrics=enable_sparse_metrics + ) +``` + +## To Test: + +To test sparse metrics with ProfBench: + +1. **Pull the ProfBench environment changes:** + ```bash + git clone https://github.com/vxnuaj/prime-environments.git -b vxnuaj/profbench + cd prime-environments + ``` + +2. **Pull this verifiers PR with sparse metrics implementation** + +3. **Install verifiers in editable mode:** + ```bash + cd verifiers + uv pip install -e . + ``` + +4. **Run evaluation to see sparse metrics in action:** + ```bash + vf-eval -s profbench -m gpt-4.1-mini --env-args '{"judge_model": "openai/gpt-4.1-mini"}' -n 12 -r 1 + # -n must be >= 10 for sparsity to be detected, as if we do less, then profbench only loads from the first domain ( i believe physics or chemistry ) + # feel free to do -r x \in R^n + ``` + +**Expected output:** +- Domain-specific averages (e.g., `chemistry_phd_reward: avg - 72.9 (relevant: 2/12)`) +- Sparse values displayed as `-` instead of `0.0` +- Mathematically correct averages excluding irrelevant domain scores + diff --git a/pyproject.toml b/pyproject.toml index f0953afbc..81acfc07d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ "tomli; python_version < '3.11'", "prime-sandboxes>=0.1.0", "wget>=3.2", + "torch>=2.8.0", ] [dependency-groups] diff --git a/verifiers/envs/env_group.py b/verifiers/envs/env_group.py index da9b4d108..c07a9f2eb 100644 --- a/verifiers/envs/env_group.py +++ b/verifiers/envs/env_group.py @@ -83,6 +83,88 @@ async def score_rollout( return RolloutScore(reward=reward, metrics=metrics) +class EnvGroupSparseRubric(EnvGroupRubric): + """ + enhanced EnvGroup rubric with domain-specific sparse tracking. + + this rubric extends EnvGroupRubric to support sparse metrics for multi-domain environments. + when routing scoring to domain-specific environments, it automatically marks metrics + that weren't computed by the target environment as sparse (excluded from averaging). + + Key differences from standard EnvGroupRubric: + - marks uncomputed domain metrics as sparse (e.g., chemistry_reward=0.0 becomes sparse) + - enables mathematically correct domain averaging by excluding irrelevant zeros + - Only used when EnvGroup is initialized with enable_sparse_metrics=True + + Example: For a chemistry task in ProfBench, physics/finance/consulting rewards are marked + sparse, ensuring chemistry_reward averages only over actual chemistry evaluations. + """ + + async def score_rollout( + self, + prompt: str | list[ChatMessage], + completion: str | list[ChatMessage], + answer: str = "", + state: State | None = None, + task: str = "default", + info: dict | None = None, + example_id: int | None = None, + **kwargs, + ) -> RolloutScore: + """ + Route scoring with sparse metrics support for multi-domain environments. + + This method handles scoring by: + 1. Routing the task to the appropriate domain-specific environment + 2. Computing metrics using that environment's rubric + 3. Filling uncomputed metrics with 0.0 and marking them as sparse + 4. Returning results with sparse flags for proper averaging + + Only used when EnvGroup has enable_sparse_metrics=True. 
+ """ + state = state or {} + info = info or {} + + # pre-initialize all known metrics to 0.0 + # this ensures consistent metric structure across all rollouts + # uncomputed metrics will remain 0.0 and be marked sparse + metrics = {name: 0.0 for name in self.all_reward_names} + reward = 0.0 + + # Route to appropriate domain environment based on task + env = self.env_map.get(task) + if env is None: + self.logger.warning(f"No environment found for task '{task}'") + return RolloutScore(reward=reward, metrics=metrics) + + # Score using the domain-specific environment's rubric + # this computes only the metrics relevant to this domain + env_results = await env.rubric.score_rollout( + prompt, completion, answer, state, task, info, example_id, **kwargs + ) + + # update metrics with computed values from domain environment + # metrics not computed by this environment remain at 0.0 + for reward_name, score in env_results.metrics.items(): + if reward_name in metrics: + metrics[reward_name] = score + + # mark uncomputed metrics as sparse for exclusion from averaging + # example: for chemistry task, physics/finance/consulting rewards marked sparse + # this enables mathematically correct domain averaging + uncomputed_metrics = set(self.all_reward_names) - set(env_results.metrics.keys()) + sparse_metrics = uncomputed_metrics if uncomputed_metrics else None + + # Overall reward comes from the domain environment + reward = env_results.reward + + return RolloutScore( + reward=reward, + metrics=metrics, + sparse_metrics=sparse_metrics + ) + + class EnvGroup(Environment): """ Environment group that acts as a mixture of multiple environments. @@ -91,7 +173,8 @@ class EnvGroup(Environment): """ def __init__( - self, envs: list[Environment], env_names: list[str] | None = None, **kwargs + self, envs: list[Environment], env_names: list[str] | None = None, + enable_sparse_metrics: bool = False, **kwargs ): """ Initialize EnvGroup with a list of environments. @@ -100,6 +183,7 @@ def __init__( envs: list of Environment instances env_names: Optional list of names for each environment. If not provided, uses "env_0", "env_1", etc. + enable_sparse_metrics: Enable sparse metrics for mathematically correct domain averaging **kwargs: Additional arguments passed to parent Environment """ if not envs: @@ -133,10 +217,18 @@ def add_task(example): eval_datasets.append(env_eval_dataset) dataset = concatenate_datasets(datasets) if datasets else None eval_dataset = concatenate_datasets(eval_datasets) if eval_datasets else None - # wrap rubrics - rubric = EnvGroupRubric(self.env_map) - - # Don't set oai_tools at the group level since different sub-environments + # choose rubric type based on enable_sparse_metrics flag + # this is the key decision point for sparse metrics activation + if enable_sparse_metrics: + # use sparse-aware rubric that marks uncomputed domain metrics as sparse + # enables mathematically correct averaging by excluding irrelevant zeros + rubric = EnvGroupSparseRubric(self.env_map) + else: + # use standard rubric that includes all values in averaging (backwards compatible) + # this preserves existing behavior for environments without sparse metrics + rubric = EnvGroupRubric(self.env_map) + + # don't set oai_tools at the group level since different sub-environments # may have different tools. Instead, set them per-task in rollout(). 
# initialize parent Environment super().__init__( diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py index 9e9e39158..83e0c6af5 100644 --- a/verifiers/envs/environment.py +++ b/verifiers/envs/environment.py @@ -605,6 +605,12 @@ async def generate( if interleave_scoring and score_rollouts: # interleaved pipeline: separate semaphores for generation and scoring # pre-allocate metrics using known reward function names + reward_func_names = self.rubric.get_reward_func_names() + sparse_flags: dict[str, list[bool]] = {name: [False] * n for name in reward_func_names} + # ^^ initialize sparse tracking flags for each metric + # sparse_flags tracks which rollout values should be excluded from averaging + # Initially all values are marked as relevant (False = not sparse) + # Rubrics can mark specific rollouts as sparse during scoring maybe_gen_sem = generation_semaphore or ( semaphore or await maybe_semaphore(gen_limit) ) @@ -658,7 +664,19 @@ async def run_one(i: int) -> None: # ensure key exists in case of EnvGroup/RubricGroup if k not in results.metrics: results.metrics[k] = [0.0] * n + sparse_flags[k] = [False] * n + # ^^ initialize sparse flags for dynamically discovered metrics results.metrics[k][i] = v + + # process sparse metric flags from rubric scoring + # when a rubric marks certain metrics as sparse for this rollout, + # we set the corresponding sparse flags to True to exclude them from averaging + if rs.sparse_metrics: + for sparse_key in rs.sparse_metrics: + if sparse_key not in sparse_flags: + # handle metrics marked sparse that weren't pre-allocated + sparse_flags[sparse_key] = [False] * n + sparse_flags[sparse_key][i] = True num_completed += 1 if save_every > 0 and num_completed % save_every == 0: self.logger.debug(f"Saving results to {results_path}") @@ -718,6 +736,9 @@ async def run_one(i: int) -> None: ) results.reward = rollout_scores.reward results.metrics = rollout_scores.metrics + # pass through sparse_metrics if present + if hasattr(rollout_scores, 'sparse_metrics') and rollout_scores.sparse_metrics: + results.sparse_metrics = rollout_scores.sparse_metrics else: results.reward = [] results.metrics = {} @@ -739,6 +760,15 @@ async def run_one(i: int) -> None: results.metadata.avg_reward = avg_reward results.metadata.avg_metrics = avg_metrics + # conditionally add sparse tracking to results + # only include sparse_metrics if: + # 1. We're using interleaved scoring (where sparse tracking occurs) + # 2. Score rollouts is enabled (metrics are being computed) + # 3. 
At least one metric has sparse values (maintains backwards compatibility) + # this ensures existing environments without sparse metrics remain unchanged + if interleave_scoring and score_rollouts and any(any(flags) for flags in sparse_flags.values()): + results.sparse_metrics = sparse_flags + return results # alias for backward compatibility diff --git a/verifiers/rubrics/rubric.py b/verifiers/rubrics/rubric.py index 1fd0220b7..f9b1ced53 100644 --- a/verifiers/rubrics/rubric.py +++ b/verifiers/rubrics/rubric.py @@ -262,11 +262,43 @@ async def score_rollouts( return RolloutScores( reward=[], metrics={name: [] for name in reward_func_names}, + # return sparse tracking only if needed (this is backwrds compatible) + sparse_metrics={name: [] for name in reward_func_names} if any(r.sparse_metrics for r in rewards if r.sparse_metrics) else None ) + # collect all possible metric keys across all rollouts + # this handles cases where different rollouts may have different metrics + # (e.g., multi-domain environments where some metrics don't apply to all tasks) + all_metric_keys = set() + for reward in rewards: + all_metric_keys.update(reward.metrics.keys()) + + # build unified metrics dict with sparse tracking + # ensures all metric keys are present in all rollout results, filling missing + # values with 0.0 and marking them as sparse (excluded from averaging) + metrics = {} + sparse_flags = {} + for k in all_metric_keys: + metrics[k] = [] + sparse_flags[k] = [] + + for reward in rewards: + if k in reward.metrics: + # metric computed for this rollout - include the actual value + metrics[k].append(reward.metrics[k]) + # check if rubric marked this metric as sparse for this rollout + is_sparse = reward.sparse_metrics and k in reward.sparse_metrics + sparse_flags[k].append(is_sparse) + else: + # metric not computed for this rollout - fill with sparse 0.0 + # this handles domain-specific metrics that don't apply to all tasks + metrics[k].append(0.0) + sparse_flags[k].append(True) + return RolloutScores( reward=[reward.reward for reward in rewards], - metrics={ - k: [item.metrics[k] for item in rewards] for k in rewards[0].metrics - }, + metrics=metrics, + # only include sparse_metrics if at least one metric has sparse values + # this maintains backwards compatibility - environments without sparse metrics get None + sparse_metrics=sparse_flags if any(any(flags) for flags in sparse_flags.values()) else None ) diff --git a/verifiers/types.py b/verifiers/types.py index daa3eb780..eeef7a091 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -86,6 +86,13 @@ class GenerateOutputs(BaseModel): reward: list[float] metrics: dict[str, list[float]] = Field(default_factory=dict) metadata: GenerateMetadata + sparse_metrics: dict[str, list[bool]] | None = Field(default=None) + # ^^ pptional sparse tracking for multi-domain environments + # When present, sparse_metrics[metric_name] indicates which rollout values should be + # excluded from averaging (e.g., domain-specific metrics evaluated on irrelevant tasks). + # True = sparse (exclude from average), False = relevant (include in average) + # Example: chemistry_reward=[50.0, 0.0, 75.0] with sparse_metrics={"chemistry_reward": [False, True, False]} + # would average to 62.5 instead of 41.7, excluding the irrelevant 0.0 score. 
class RolloutScore(BaseModel): @@ -93,6 +100,10 @@ class RolloutScore(BaseModel): reward: float metrics: dict[str, float] = Field(default_factory=dict) + sparse_metrics: set[str] | None = Field(default=None) + # ^^ set of metric names that should be excluded from averaging for this rollout + # Used by rubrics to mark domain-specific metrics as irrelevant for certain tasks + # Example: {"chemistry_reward", "physics_reward"} when evaluating a finance task class RolloutScores(BaseModel): @@ -100,6 +111,10 @@ class RolloutScores(BaseModel): reward: list[float] metrics: dict[str, list[float]] = Field(default_factory=dict) + sparse_metrics: dict[str, list[bool]] | None = Field(default=None) + # ^^ per-rollout exclusion flags for batch scoring + # Maps metric names to lists of boolean flags (True = sparse, False = relevant) + # Length matches the rollout lists in reward/metrics. Aggregated from individual RolloutScore.sparse_metrics class ProcessedOutputs(BaseModel): diff --git a/verifiers/utils/eval_utils.py b/verifiers/utils/eval_utils.py index 3c2c18d38..2a9187f56 100644 --- a/verifiers/utils/eval_utils.py +++ b/verifiers/utils/eval_utils.py @@ -89,10 +89,52 @@ def print_results(results: GenerateOutputs, num_samples: int = 1): print(out) for k in results.metrics: v = results.metrics[k] - print(f"{k}: avg - {sum(v) / len(v):.3f}, std - {np.std(v):.3f}") + + # selective averaging that excludes sparse values + # only average over relevant (non-sparse) values + # instead of including misleading zeros in the calculation + if hasattr(results, 'sparse_metrics') and results.sparse_metrics and k in results.sparse_metrics: + # filter out sparse values from averaging calculation + # sparse_flags[i] = True means exclude rollout i from averaging + sparse_flags = results.sparse_metrics[k] + relevant_values = [val for val, is_sparse in zip(v, sparse_flags) if not is_sparse] + + if relevant_values: + # calculate statistics over only the relevant (non-sparse) values + # this gives mathematically correct domain-specific averages + avg = sum(relevant_values) / len(relevant_values) + std = np.std(relevant_values) + sparsity_info = f" (relevant: {len(relevant_values)}/{len(v)})" + print(f"{k}: avg - {avg:.3f}, std - {std:.3f}{sparsity_info}") + else: + # all values marked sparse - no relevant data to average + print(f"{k}: no relevant data (all values sparse)") + else: + # standard averaging for non-sparse metrics (backwards compatible) + # this preserves existing behavior for environments without sparse metrics + print(f"{k}: avg - {sum(v) / len(v):.3f}, std - {np.std(v):.3f}") + + # enhanced rollout display that shows sparsity clearly + # Instead of showing misleading 0.0 values, display "-" for sparse metrics + # This makes it immediately obvious which rollouts are relevant vs excluded for i in range(r): - # rounded to 3 decimal places - trials = [round(v[(i * n) + j], 3) for j in range(n)] + if hasattr(results, 'sparse_metrics') and results.sparse_metrics and k in results.sparse_metrics: + # For sparse metrics: "-" indicates sparse (irrelevant), numbers show actual values + # This visual distinction prevents confusion about which values contribute to averages + sparse_flags = results.sparse_metrics[k] + trials = [] + for j in range(n): + idx = (i * n) + j + if sparse_flags[idx]: + # sparse value - show "-" instead of 0.0 to indicate exclusion from averaging + trials.append("-") + else: + # non-sparse value - show actual computed score + trials.append(round(v[idx], 3)) + else: + # standard rollout printing for 
non-sparse metrics (backwards compatible) + # all values shown as numbers since none are excluded from averaging + trials = [round(v[(i * n) + j], 3) for j in range(n)] out = f"r{i + 1}: {trials}" print(out) From fe41304772b4d68d408b24f4d5a3f436c15c483c Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Tue, 4 Nov 2025 01:21:31 -0800 Subject: [PATCH 2/5] edit PR.md --- PR.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PR.md b/PR.md index 45088e9f9..174629152 100644 --- a/PR.md +++ b/PR.md @@ -187,7 +187,10 @@ To test sparse metrics with ProfBench: cd prime-environments ``` -2. **Pull this verifiers PR with sparse metrics implementation** +2. **Pull this verifiers fork / pr with sparse metrics implementation:** + ```bash + git clone https://github.com/vxnuaj/verifiers.git -b vxnuaj/dynamic-sparse-rewards + ``` 3. **Install verifiers in editable mode:** ```bash From 849b1c18c09c73fe73e7bd880c36be56b4be4c31 Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Tue, 4 Nov 2025 01:26:34 -0800 Subject: [PATCH 3/5] ruff, pre-commit, pytest --- GITHUB_PR_TEMPLATE.md | 66 +++++++++++++++++++++++++++++++++++ environments/math/math.py | 4 +-- verifiers/envs/env_group.py | 31 ++++++++-------- verifiers/envs/environment.py | 21 +++++++---- verifiers/rubrics/rubric.py | 10 ++++-- verifiers/types.py | 2 +- verifiers/utils/eval_utils.py | 28 ++++++++++----- 7 files changed, 127 insertions(+), 35 deletions(-) create mode 100644 GITHUB_PR_TEMPLATE.md diff --git a/GITHUB_PR_TEMPLATE.md b/GITHUB_PR_TEMPLATE.md new file mode 100644 index 000000000..0966ff899 --- /dev/null +++ b/GITHUB_PR_TEMPLATE.md @@ -0,0 +1,66 @@ +## Description +Add sparse metrics support for mathematically correct domain averaging in multi-domain environments. This feature enables selective averaging that excludes irrelevant zero values, solving the domain dilution problem in composite evaluation environments like ProfBench. + +**Key improvements:** +- Chemistry domain: `avg - 72.9 (relevant: 2/12)` instead of diluted `avg - 12.3` +- Physics domain: `avg - 66.2 (relevant: 10/12)` instead of diluted `avg - 56.2` +- Visual distinction: Shows `-` for sparse values instead of misleading `0.0` + +## Type of Change +- [ ] Bug fix (non-breaking change which fixes an issue) +- [x] New feature (non-breaking change which adds functionality) +- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) +- [ ] Documentation update +- [ ] Test improvement + +## Testing +- [ ] All existing tests pass when running `uv run pytest` locally. 
+- [ ] New tests have been added to cover the changes + +**Manual Testing:** +Tested with ProfBench environment showing correct sparse metrics behavior: +- Domain-specific averages exclude irrelevant metrics +- Sparse values display as `-` in output +- `(relevant: X/Y)` info shows sparsity clearly + +## Checklist +- [ ] My code follows the style guidelines of this project as outlined in [AGENTS.md](https://github.com/PrimeIntellect-ai/verifiers/blob/main/AGENTS.md) +- [x] I have performed a self-review of my own code +- [x] I have commented my code, particularly in hard-to-understand areas +- [ ] I have made corresponding changes to the documentation +- [ ] My changes generate no new warnings +- [ ] Any dependent changes have been merged and published + +## Additional Notes + +### Implementation Overview +- **Types**: Added `sparse_metrics` fields to `RolloutScore`, `RolloutScores`, and `GenerateOutputs` +- **Environment**: Sparse tracking during interleaved scoring in `generate()` method +- **Rubrics**: Batch scoring with missing metrics marked as sparse in `score_rollouts()` +- **EnvGroup**: New `EnvGroupSparseRubric` class with `enable_sparse_metrics=True` opt-in +- **Display**: Sparse-aware averaging and `-` display in `eval_utils.py` + +### Backwards Compatibility +✅ Zero breaking changes - all existing environments work unchanged +✅ Opt-in only - sparse metrics activate only with `enable_sparse_metrics=True` +✅ Default behavior preserved - standard averaging remains identical + +### Testing Instructions +To test with ProfBench: +```bash +# 1. Clone ProfBench with sparse support +# ( this is a env. bounty that is in progrss of being ipmlemented, which needed this PR ) +git clone https://github.com/vxnuaj/prime-environments.git -b vxnuaj/profbench +cd prime-environments + +# 2. Clone this verifiers fork / pr branch +git clone https://github.com/vxnuaj/verifiers.git -b vxnuaj/dynamic-sparse-rewards +cd verifiers + +# 3. Install and test +cd .. +uv pip install -e . +vf-eval -s profbench -m gpt-4.1-mini --env-args '{"judge_model": "openai/gpt-4.1-mini"}' -n 12 -r 1 +``` + +**Expected output:** Domain averages like `chemistry_phd_reward: avg - 72.9 (relevant: 2/12)` with `-` showing sparse values. diff --git a/environments/math/math.py b/environments/math/math.py index f63e6cc02..cff3579d0 100644 --- a/environments/math/math.py +++ b/environments/math/math.py @@ -2,7 +2,7 @@ def load_environment(**kwargs) -> vf.Environment: - ''' + """ Loads a custom environment. - ''' + """ raise NotImplementedError("Implement your custom environment here.") diff --git a/verifiers/envs/env_group.py b/verifiers/envs/env_group.py index 0d56ca25b..db60b2a48 100644 --- a/verifiers/envs/env_group.py +++ b/verifiers/envs/env_group.py @@ -87,20 +87,20 @@ async def score_rollout( class EnvGroupSparseRubric(EnvGroupRubric): """ enhanced EnvGroup rubric with domain-specific sparse tracking. - + this rubric extends EnvGroupRubric to support sparse metrics for multi-domain environments. - when routing scoring to domain-specific environments, it automatically marks metrics + when routing scoring to domain-specific environments, it automatically marks metrics that weren't computed by the target environment as sparse (excluded from averaging). 
- + Key differences from standard EnvGroupRubric: - marks uncomputed domain metrics as sparse (e.g., chemistry_reward=0.0 becomes sparse) - enables mathematically correct domain averaging by excluding irrelevant zeros - Only used when EnvGroup is initialized with enable_sparse_metrics=True - + Example: For a chemistry task in ProfBench, physics/finance/consulting rewards are marked sparse, ensuring chemistry_reward averages only over actual chemistry evaluations. """ - + async def score_rollout( self, prompt: str | list[ChatMessage], @@ -114,13 +114,13 @@ async def score_rollout( ) -> RolloutScore: """ Route scoring with sparse metrics support for multi-domain environments. - + This method handles scoring by: 1. Routing the task to the appropriate domain-specific environment - 2. Computing metrics using that environment's rubric + 2. Computing metrics using that environment's rubric 3. Filling uncomputed metrics with 0.0 and marking them as sparse 4. Returning results with sparse flags for proper averaging - + Only used when EnvGroup has enable_sparse_metrics=True. """ state = state or {} @@ -153,16 +153,16 @@ async def score_rollout( # mark uncomputed metrics as sparse for exclusion from averaging # example: for chemistry task, physics/finance/consulting rewards marked sparse # this enables mathematically correct domain averaging - uncomputed_metrics = set(self.all_reward_names) - set(env_results.metrics.keys()) + uncomputed_metrics = set(self.all_reward_names) - set( + env_results.metrics.keys() + ) sparse_metrics = uncomputed_metrics if uncomputed_metrics else None # Overall reward comes from the domain environment reward = env_results.reward return RolloutScore( - reward=reward, - metrics=metrics, - sparse_metrics=sparse_metrics + reward=reward, metrics=metrics, sparse_metrics=sparse_metrics ) @@ -174,8 +174,11 @@ class EnvGroup(Environment): """ def __init__( - self, envs: list[Environment], env_names: list[str] | None = None, - enable_sparse_metrics: bool = False, **kwargs + self, + envs: list[Environment], + env_names: list[str] | None = None, + enable_sparse_metrics: bool = False, + **kwargs, ): """ Initialize EnvGroup with a list of environments. 
diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py index 83e0c6af5..c68b7ad5b 100644 --- a/verifiers/envs/environment.py +++ b/verifiers/envs/environment.py @@ -606,7 +606,9 @@ async def generate( # interleaved pipeline: separate semaphores for generation and scoring # pre-allocate metrics using known reward function names reward_func_names = self.rubric.get_reward_func_names() - sparse_flags: dict[str, list[bool]] = {name: [False] * n for name in reward_func_names} + sparse_flags: dict[str, list[bool]] = { + name: [False] * n for name in reward_func_names + } # ^^ initialize sparse tracking flags for each metric # sparse_flags tracks which rollout values should be excluded from averaging # Initially all values are marked as relevant (False = not sparse) @@ -664,10 +666,10 @@ async def run_one(i: int) -> None: # ensure key exists in case of EnvGroup/RubricGroup if k not in results.metrics: results.metrics[k] = [0.0] * n - sparse_flags[k] = [False] * n + sparse_flags[k] = [False] * n # ^^ initialize sparse flags for dynamically discovered metrics results.metrics[k][i] = v - + # process sparse metric flags from rubric scoring # when a rubric marks certain metrics as sparse for this rollout, # we set the corresponding sparse flags to True to exclude them from averaging @@ -737,7 +739,10 @@ async def run_one(i: int) -> None: results.reward = rollout_scores.reward results.metrics = rollout_scores.metrics # pass through sparse_metrics if present - if hasattr(rollout_scores, 'sparse_metrics') and rollout_scores.sparse_metrics: + if ( + hasattr(rollout_scores, "sparse_metrics") + and rollout_scores.sparse_metrics + ): results.sparse_metrics = rollout_scores.sparse_metrics else: results.reward = [] @@ -763,10 +768,14 @@ async def run_one(i: int) -> None: # conditionally add sparse tracking to results # only include sparse_metrics if: # 1. We're using interleaved scoring (where sparse tracking occurs) - # 2. Score rollouts is enabled (metrics are being computed) + # 2. Score rollouts is enabled (metrics are being computed) # 3. 
At least one metric has sparse values (maintains backwards compatibility) # this ensures existing environments without sparse metrics remain unchanged - if interleave_scoring and score_rollouts and any(any(flags) for flags in sparse_flags.values()): + if ( + interleave_scoring + and score_rollouts + and any(any(flags) for flags in sparse_flags.values()) + ): results.sparse_metrics = sparse_flags return results diff --git a/verifiers/rubrics/rubric.py b/verifiers/rubrics/rubric.py index f9b1ced53..c0d08f789 100644 --- a/verifiers/rubrics/rubric.py +++ b/verifiers/rubrics/rubric.py @@ -263,7 +263,9 @@ async def score_rollouts( reward=[], metrics={name: [] for name in reward_func_names}, # return sparse tracking only if needed (this is backwrds compatible) - sparse_metrics={name: [] for name in reward_func_names} if any(r.sparse_metrics for r in rewards if r.sparse_metrics) else None + sparse_metrics={name: [] for name in reward_func_names} + if any(r.sparse_metrics for r in rewards if r.sparse_metrics) + else None, ) # collect all possible metric keys across all rollouts @@ -281,7 +283,7 @@ async def score_rollouts( for k in all_metric_keys: metrics[k] = [] sparse_flags[k] = [] - + for reward in rewards: if k in reward.metrics: # metric computed for this rollout - include the actual value @@ -300,5 +302,7 @@ async def score_rollouts( metrics=metrics, # only include sparse_metrics if at least one metric has sparse values # this maintains backwards compatibility - environments without sparse metrics get None - sparse_metrics=sparse_flags if any(any(flags) for flags in sparse_flags.values()) else None + sparse_metrics=sparse_flags + if any(any(flags) for flags in sparse_flags.values()) + else None, ) diff --git a/verifiers/types.py b/verifiers/types.py index eeef7a091..22b9bb7b2 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -88,7 +88,7 @@ class GenerateOutputs(BaseModel): metadata: GenerateMetadata sparse_metrics: dict[str, list[bool]] | None = Field(default=None) # ^^ pptional sparse tracking for multi-domain environments - # When present, sparse_metrics[metric_name] indicates which rollout values should be + # When present, sparse_metrics[metric_name] indicates which rollout values should be # excluded from averaging (e.g., domain-specific metrics evaluated on irrelevant tasks). 
# True = sparse (exclude from average), False = relevant (include in average) # Example: chemistry_reward=[50.0, 0.0, 75.0] with sparse_metrics={"chemistry_reward": [False, True, False]} diff --git a/verifiers/utils/eval_utils.py b/verifiers/utils/eval_utils.py index 2a9187f56..7211e6ec8 100644 --- a/verifiers/utils/eval_utils.py +++ b/verifiers/utils/eval_utils.py @@ -89,16 +89,22 @@ def print_results(results: GenerateOutputs, num_samples: int = 1): print(out) for k in results.metrics: v = results.metrics[k] - - # selective averaging that excludes sparse values + + # selective averaging that excludes sparse values # only average over relevant (non-sparse) values # instead of including misleading zeros in the calculation - if hasattr(results, 'sparse_metrics') and results.sparse_metrics and k in results.sparse_metrics: + if ( + hasattr(results, "sparse_metrics") + and results.sparse_metrics + and k in results.sparse_metrics + ): # filter out sparse values from averaging calculation # sparse_flags[i] = True means exclude rollout i from averaging sparse_flags = results.sparse_metrics[k] - relevant_values = [val for val, is_sparse in zip(v, sparse_flags) if not is_sparse] - + relevant_values = [ + val for val, is_sparse in zip(v, sparse_flags) if not is_sparse + ] + if relevant_values: # calculate statistics over only the relevant (non-sparse) values # this gives mathematically correct domain-specific averages @@ -113,12 +119,16 @@ def print_results(results: GenerateOutputs, num_samples: int = 1): # standard averaging for non-sparse metrics (backwards compatible) # this preserves existing behavior for environments without sparse metrics print(f"{k}: avg - {sum(v) / len(v):.3f}, std - {np.std(v):.3f}") - + # enhanced rollout display that shows sparsity clearly # Instead of showing misleading 0.0 values, display "-" for sparse metrics # This makes it immediately obvious which rollouts are relevant vs excluded for i in range(r): - if hasattr(results, 'sparse_metrics') and results.sparse_metrics and k in results.sparse_metrics: + if ( + hasattr(results, "sparse_metrics") + and results.sparse_metrics + and k in results.sparse_metrics + ): # For sparse metrics: "-" indicates sparse (irrelevant), numbers show actual values # This visual distinction prevents confusion about which values contribute to averages sparse_flags = results.sparse_metrics[k] @@ -127,10 +137,10 @@ def print_results(results: GenerateOutputs, num_samples: int = 1): idx = (i * n) + j if sparse_flags[idx]: # sparse value - show "-" instead of 0.0 to indicate exclusion from averaging - trials.append("-") + trials.append("-") else: # non-sparse value - show actual computed score - trials.append(round(v[idx], 3)) + trials.append(round(v[idx], 3)) else: # standard rollout printing for non-sparse metrics (backwards compatible) # all values shown as numbers since none are excluded from averaging From 95a954690d80aa056e6ba789cf532a43b20c1dfb Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Tue, 4 Nov 2025 01:40:55 -0800 Subject: [PATCH 4/5] remove uneeded files --- GITHUB_PR_TEMPLATE.md | 66 ------------- PR.md | 212 ------------------------------------------ 2 files changed, 278 deletions(-) delete mode 100644 GITHUB_PR_TEMPLATE.md delete mode 100644 PR.md diff --git a/GITHUB_PR_TEMPLATE.md b/GITHUB_PR_TEMPLATE.md deleted file mode 100644 index 0966ff899..000000000 --- a/GITHUB_PR_TEMPLATE.md +++ /dev/null @@ -1,66 +0,0 @@ -## Description -Add sparse metrics support for mathematically correct domain averaging in multi-domain environments. 
This feature enables selective averaging that excludes irrelevant zero values, solving the domain dilution problem in composite evaluation environments like ProfBench. - -**Key improvements:** -- Chemistry domain: `avg - 72.9 (relevant: 2/12)` instead of diluted `avg - 12.3` -- Physics domain: `avg - 66.2 (relevant: 10/12)` instead of diluted `avg - 56.2` -- Visual distinction: Shows `-` for sparse values instead of misleading `0.0` - -## Type of Change -- [ ] Bug fix (non-breaking change which fixes an issue) -- [x] New feature (non-breaking change which adds functionality) -- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) -- [ ] Documentation update -- [ ] Test improvement - -## Testing -- [ ] All existing tests pass when running `uv run pytest` locally. -- [ ] New tests have been added to cover the changes - -**Manual Testing:** -Tested with ProfBench environment showing correct sparse metrics behavior: -- Domain-specific averages exclude irrelevant metrics -- Sparse values display as `-` in output -- `(relevant: X/Y)` info shows sparsity clearly - -## Checklist -- [ ] My code follows the style guidelines of this project as outlined in [AGENTS.md](https://github.com/PrimeIntellect-ai/verifiers/blob/main/AGENTS.md) -- [x] I have performed a self-review of my own code -- [x] I have commented my code, particularly in hard-to-understand areas -- [ ] I have made corresponding changes to the documentation -- [ ] My changes generate no new warnings -- [ ] Any dependent changes have been merged and published - -## Additional Notes - -### Implementation Overview -- **Types**: Added `sparse_metrics` fields to `RolloutScore`, `RolloutScores`, and `GenerateOutputs` -- **Environment**: Sparse tracking during interleaved scoring in `generate()` method -- **Rubrics**: Batch scoring with missing metrics marked as sparse in `score_rollouts()` -- **EnvGroup**: New `EnvGroupSparseRubric` class with `enable_sparse_metrics=True` opt-in -- **Display**: Sparse-aware averaging and `-` display in `eval_utils.py` - -### Backwards Compatibility -✅ Zero breaking changes - all existing environments work unchanged -✅ Opt-in only - sparse metrics activate only with `enable_sparse_metrics=True` -✅ Default behavior preserved - standard averaging remains identical - -### Testing Instructions -To test with ProfBench: -```bash -# 1. Clone ProfBench with sparse support -# ( this is a env. bounty that is in progrss of being ipmlemented, which needed this PR ) -git clone https://github.com/vxnuaj/prime-environments.git -b vxnuaj/profbench -cd prime-environments - -# 2. Clone this verifiers fork / pr branch -git clone https://github.com/vxnuaj/verifiers.git -b vxnuaj/dynamic-sparse-rewards -cd verifiers - -# 3. Install and test -cd .. -uv pip install -e . -vf-eval -s profbench -m gpt-4.1-mini --env-args '{"judge_model": "openai/gpt-4.1-mini"}' -n 12 -r 1 -``` - -**Expected output:** Domain averages like `chemistry_phd_reward: avg - 72.9 (relevant: 2/12)` with `-` showing sparse values. diff --git a/PR.md b/PR.md deleted file mode 100644 index 174629152..000000000 --- a/PR.md +++ /dev/null @@ -1,212 +0,0 @@ -## Overview - -This PR implements sparse metrics / rubrics, which enables mathematically correct averaging in multi-domain environments. The key change heere is selective averaging that excludes irrelevant zero values, solving the domain dilution problem in composite evaluation environments. 
- -In environments like [`ProfBench`](https://arxiv.org/pdf/2510.18941), domain-specific scores get mixed with irrelevant zeros, making the averages misleading. - -**Example Issue:** -Evaluating GPT-4 on 12 tasks: 10 physics + 2 chemistry tasks - -``` -physics_reward: [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, 0, 0] # zeros for chemistry tasks -chemistry_reward: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 76] # zeros for physics tasks -``` - -- **Before**: `physics_reward: avg - 56.2` (diluted by irrelevant zeros) -- **Before**: `chemistry_reward: avg - 13.7` (misleading!) - -After, - -``` -physics_reward: [65, 72, 58, 81, 45, 67, 73, 59, 68, 74, -, -] # zeros for chemistry tasks -chemistry_reward: [-, -, -, -, -, -, -, -, -, -, 88, 76] # zeros for physics tasks -``` - -- **After**: `chemistry_reward: avg - 82.0 (relevant: 2/12)` (actual chemistry skill) -- **After**: `physics_reward: avg - 66.2 (relevant: 10/12)` (pure physics performance) - -Which can all be done now within an `EnvGroup` with `enable_sparse_metrics=True`. - -we can now - -1. mark irrelevant values as sparse during scoring -2. exclude sparse values from averaging calculations -3. display sparsity clearly with `-` instead of `0.0` -4. maintain backwards compatibility with existing environments - -## Core - -### 1. type extensions @ `types.py` - -**New Fields Added:** - -```python -class RolloutScore(BaseModel): - sparse_metrics: set[str] | None = Field(default=None) - # set of metric names to exclude from averaging for this rollout - -class RolloutScores(BaseModel): - sparse_metrics: dict[str, list[bool]] | None = Field(default=None) - # per-rolout exclusion flags for batch scoring - -class GenerateOutputs(BaseModel): - sparse_metrics: dict[str, list[bool]] | None = Field(default=None) - # final sparse tracking for evaluation results -``` - -THis tracks which metric values should be excluded from averaging calculations. - -### 2. Environment Sparse Tracking @ `envs/environment.py` - -**Key Changes:** -- **Initialize sparse flags** for all metrics during interleaved scoring -- **Track sparse metrics** from rubric scoring results -- **Conditionally assign** sparse_metrics only if sparsity detected (backwards compatible) - -```python -# Initialize sparse tracking -sparse_flags: dict[str, list[bool]] = {name: [False] * n for name in reward_func_names} - -# Process sparse flags from scoring -if rs.sparse_metrics: - for sparse_key in rs.sparse_metrics: - sparse_flags[sparse_key][i] = True - -# Only add if sparsity detected (backwards compatible) -if any(any(flags) for flags in sparse_flags.values()): - results.sparse_metrics = sparse_flags -``` - -this collects and aggregates sparse metadata during evaluation execution. - -### 3. Batch Scoring with Sparse Handling @ `rubrics/rubric.py` - -**Key Changes:** -- **Collect all metric keys** across rollouts (handles mixed metrics) -- **Fill missing metrics** with 0.0 and mark as sparse -- **Track sparsity flags** from individual rollout scores -- **Return sparse metadata** only if sparsity detected - -```python -# Handle missing metrics as sparse -if k in reward.metrics: - metrics[k].append(reward.metrics[k]) - is_sparse = reward.sparse_metrics and k in reward.sparse_metrics - sparse_flags[k].append(is_sparse) -else: - # Missing metric -> sparse 0.0 - metrics[k].append(0.0) - sparse_flags[k].append(True) -``` - -ensure consistent metric structure while preserving sparsity information. - -### 4. 
EnvGroup Sparse Architecture @ `envs/env_group.py`) - -**New Class: `EnvGroupSparseRubric`** - -Extends standard `EnvGroupRubric` with domain-specific sparse marking: - -```python -class EnvGroupSparseRubric(EnvGroupRubric): - async def score_rollout(self, ...): - # Route to domain-specific environment - env_results = await env.rubric.score_rollout(...) - - # Mark uncomputed metrics as sparse - uncomputed_metrics = set(all_rewards) - set(env_results.metrics.keys()) - sparse_metrics = uncomputed_metrics if uncomputed_metrics else None - - return RolloutScore(sparse_metrics=sparse_metrics, ...) -``` - -**Activation Logic:** -```python -# Key decision point for sparse metrics -if enable_sparse_metrics: - rubric = EnvGroupSparseRubric(self.env_map) # Sparse-aware -else: - rubric = EnvGroupRubric(self.env_map) # Standard (backwards compatible) -``` - -automatically mark domain-specific metrics as sparse when irrelevant. - -### 5. Sparse-Aware Display @ `utils/eval_utils.py` - -**Selective Averaging:** -```python -# Filter out sparse values before averaging -sparse_flags = results.sparse_metrics[k] -relevant_values = [val for val, is_sparse in zip(v, sparse_flags) if not is_sparse] - -if relevant_values: - avg = sum(relevant_values) / len(relevant_values) - sparsity_info = f" (relevant: {len(relevant_values)}/{len(v)})" - print(f"{k}: avg - {avg:.3f}{sparsity_info}") -else: - print(f"{k}: no relevant data (all values sparse)") -``` - -**Enhanced Display:** -```python -# Show "-" for sparse values instead of misleading 0.0 -if sparse_flags[idx]: - trials.append("-") # Sparse (excluded from averaging) -else: - trials.append(round(v[idx], 3)) # Actual computed value -``` - -provide mathematically correct averages and clear visual distinction of sparsity. - -## Usage - -```python -# Standard behavior (backwards compatible) -env = vf.EnvGroup(envs, names) # Standard averaging - -# Sparse metrics enabled -env = vf.EnvGroup(envs, names, enable_sparse_metrics=True) # Selective averaging -``` - -```python -def load_environment(enable_sparse_metrics: bool = True): - return vf.EnvGroup( - envs=domain_envs, - env_names=domain_names, - enable_sparse_metrics=enable_sparse_metrics - ) -``` - -## To Test: - -To test sparse metrics with ProfBench: - -1. **Pull the ProfBench environment changes:** - ```bash - git clone https://github.com/vxnuaj/prime-environments.git -b vxnuaj/profbench - cd prime-environments - ``` - -2. **Pull this verifiers fork / pr with sparse metrics implementation:** - ```bash - git clone https://github.com/vxnuaj/verifiers.git -b vxnuaj/dynamic-sparse-rewards - ``` - -3. **Install verifiers in editable mode:** - ```bash - cd verifiers - uv pip install -e . - ``` - -4. 
**Run evaluation to see sparse metrics in action:** - ```bash - vf-eval -s profbench -m gpt-4.1-mini --env-args '{"judge_model": "openai/gpt-4.1-mini"}' -n 12 -r 1 - # -n must be >= 10 for sparsity to be detected, as if we do less, then profbench only loads from the first domain ( i believe physics or chemistry ) - # feel free to do -r x \in R^n - ``` - -**Expected output:** -- Domain-specific averages (e.g., `chemistry_phd_reward: avg - 72.9 (relevant: 2/12)`) -- Sparse values displayed as `-` instead of `0.0` -- Mathematically correct averages excluding irrelevant domain scores - From 12a0aadecb5e5ec03176ddeaa0fcf685c6b2b0cc Mon Sep 17 00:00:00 2001 From: vxnuaj Date: Wed, 5 Nov 2025 11:19:58 -0800 Subject: [PATCH 5/5] remove uneeded pr bloat --- environments/math/math.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/math/math.py b/environments/math/math.py index cff3579d0..f63e6cc02 100644 --- a/environments/math/math.py +++ b/environments/math/math.py @@ -2,7 +2,7 @@ def load_environment(**kwargs) -> vf.Environment: - """ + ''' Loads a custom environment. - """ + ''' raise NotImplementedError("Implement your custom environment here.")