From 563c8dc45e8599c91861d59437dbf5ab744d61ce Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Mon, 29 Dec 2025 20:28:34 -0500 Subject: [PATCH 01/14] Added quality evaluation metrics --- src/cfg/run_quality_evaluation_cfg.yaml | 15 ++++ src/run_quality_evaluation.py | 115 ++++++++++++++++++++++++ src/utils/__init__.py | 6 ++ src/utils/quality_evaluation_utils.py | 79 ++++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 src/cfg/run_quality_evaluation_cfg.yaml create mode 100644 src/run_quality_evaluation.py create mode 100644 src/utils/quality_evaluation_utils.py diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml new file mode 100644 index 0000000..a00624c --- /dev/null +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -0,0 +1,15 @@ +prompt_cfg: + sys_msg: Compute benchmark quality metrics from existing scores. + +quality_eval_cfg: + # Absolute path to the directory that directly contains per-model score folders. + scores_root_dir: "/projects/aieng/public/ace/artifacts/negin_ace/scores" + scores_subdir: "scores" + +exp_cfg: + exp_id: "quality_evaluation" + +defaults: + - _self_ + + diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py new file mode 100644 index 0000000..a7d13e2 --- /dev/null +++ b/src/run_quality_evaluation.py @@ -0,0 +1,115 @@ +"""Script to compute quality metrics (e.g., benchmark difficulty) from existing scores.""" + +import json +import logging +import os +from typing import Dict, List + +import hydra +from omegaconf import DictConfig + +from src.utils import ( + compute_benchmark_difficulty, + compute_benchmark_separability, +) +from src.utils import constants +from src.utils.data_utils import get_run_id + + +logger = logging.getLogger(__name__) + + +def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: + """Extract the accuracy metric from a single Inspect eval JSON file.""" + try: + with open(json_path, "r", encoding="utf-8") as f: + data = 
json.load(f) + except Exception as exc: # noqa: BLE001 + logger.warning("Failed to read %s: %s", json_path, exc) + return None + + try: + scores = data["results"]["scores"] + if not scores: + return None + metrics = scores[0]["metrics"] + acc = metrics["accuracy"]["value"] + return float(acc) + except (KeyError, TypeError, ValueError) as exc: + logger.warning("Failed to extract accuracy from %s: %s", json_path, exc) + return None + + +@hydra.main(version_base=None, config_path="cfg", config_name="run_quality_cfg") +def main(cfg: DictConfig) -> None: + """ + Compute benchmark-level quality metrics from saved capability scores. + """ + run_id = get_run_id(cfg) + + scores_root_dir = getattr(cfg.quality_eval_cfg, "scores_root_dir", None) + if scores_root_dir: + base_scores_dir = scores_root_dir + else: + base_scores_dir = os.path.join( + constants.BASE_ARTIFACTS_DIR, + cfg.quality_eval_cfg.scores_subdir, + run_id, + ) + logger.info("Using fallback scores directory: %s", base_scores_dir) + + if not os.path.isdir(base_scores_dir): + logger.error( + "Scores directory '%s' does not exist. " + "Please ensure scores are generated for run_id '%s'.", + base_scores_dir, + run_id, + ) + return + + logger.info("Loading model accuracies from %s", base_scores_dir) + + # For each model directory, walk all JSON files and average their accuracies. 
+ model_to_accuracy: Dict[str, float] = {} + for model_name in os.listdir(base_scores_dir): + model_dir = os.path.join(base_scores_dir, model_name) + if not os.path.isdir(model_dir): + continue + + accuracies: List[float] = [] + for root, _dirs, files in os.walk(model_dir): + for fname in files: + if not fname.endswith(".json"): + continue + json_path = os.path.join(root, fname) + acc = _extract_accuracy_from_inspect_json(json_path) + if acc is not None: + accuracies.append(acc) + + if not accuracies: + logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) + continue + + avg_acc = sum(accuracies) / len(accuracies) + model_to_accuracy[model_name] = avg_acc + logger.info( + "Model '%s' mean accuracy over %d JSON files: %.4f", + model_name, + len(accuracies), + avg_acc, + ) + + if not model_to_accuracy: + logger.error("No valid model accuracies found in %s", base_scores_dir) + return + + difficulty = compute_benchmark_difficulty(model_to_accuracy) + separability = compute_benchmark_separability(model_to_accuracy) + logger.info("Benchmark difficulty: %.4f", difficulty) + logger.info("Benchmark separability: %.4f", separability) + + +if __name__ == "__main__": + main() + + diff --git a/src/utils/__init__.py b/src/utils/__init__.py index f313105..00d0f19 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -6,3 +6,9 @@ """ from .data_utils import load_data +from .evaluation_utils import ( + compute_benchmark_difficulty_from_accuracies, + compute_benchmark_difficulty_from_model_scores, + compute_benchmark_separability_from_accuracies, + compute_benchmark_separability_from_model_scores, +) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py new file mode 100644 index 0000000..4d0c6d6 --- /dev/null +++ b/src/utils/quality_evaluation_utils.py @@ -0,0 +1,79 @@ +"""Utility functions for evaluating benchmark-level metrics.""" + +from __future__ import annotations + +from typing import Iterable, 
Mapping, Union + + +def compute_benchmark_difficulty( + accuracies: Union[Iterable[float], Mapping[str, float]], +) -> float: + """ + Compute benchmark difficulty given per-model accuracies. + + The difficulty of a benchmark is defined as: + + DIFFICULTY(D_c, M) = 1 - max_{m in M} acc(LM_m, D_c) + + i.e., one minus the highest accuracy achieved by any model on the benchmark. + + Args: + accuracies: Either an iterable of accuracy values in [0.0, 1.0] for each model, + or a mapping from model name to accuracy in [0.0, 1.0]. + + Returns: + A float in [0.0, 1.0] representing the benchmark difficulty. + + Raises: + ValueError: If no accuracies are provided. + """ + # Handle Mapping by extracting values, otherwise treat as iterable + if isinstance(accuracies, Mapping): + accuracies = accuracies.values() + + accuracies = list(accuracies) + if not accuracies: + raise ValueError("Cannot compute difficulty: no accuracies provided.") + + best_acc = max(accuracies) + # Clamp to [0, 1] in case of tiny numerical issues. + best_acc = max(0.0, min(1.0, best_acc)) + return 1.0 - best_acc + + +def compute_benchmark_separability( + accuracies: Union[Iterable[float], Mapping[str, float]], +) -> float: + """ + Compute benchmark separability given per-model accuracies. + + Separability is defined as the mean absolute deviation of model accuracies + around their mean: + + SEP(D_c, M) = mean(|v_c - mean(v_c)|) + + where ``v_c`` are the accuracies of different models on the same dataset. + + Args: + accuracies: Either an iterable of accuracy values in [0.0, 1.0] for each model, + or a mapping from model name to accuracy in [0.0, 1.0]. + + Returns: + A non-negative float representing separability. + + Raises: + ValueError: If no accuracies are provided. 
+ """ + # Handle Mapping by extracting values, otherwise treat as iterable + if isinstance(accuracies, Mapping): + accuracies = accuracies.values() + + accuracies = list(accuracies) + if not accuracies: + raise ValueError("Cannot compute separability: no accuracies provided.") + + mean_acc = sum(accuracies) / len(accuracies) + abs_devs = [abs(a - mean_acc) for a in accuracies] + return sum(abs_devs) / len(abs_devs) + + From 8361b0eda14f99eea62e05ebc13d2573d0790e0f Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Mon, 5 Jan 2026 10:01:21 -0500 Subject: [PATCH 02/14] Addd consistency & novelty --- src/cfg/run_quality_evaluation_cfg.yaml | 7 +- src/run_quality_evaluation.py | 181 +++++++++++++++++--- src/run_quality_evaluation_README.md | 185 +++++++++++++++++++++ src/utils/__init__.py | 10 +- src/utils/quality_evaluation_utils.py | 211 +++++++++++++++++++++++- 5 files changed, 564 insertions(+), 30 deletions(-) create mode 100644 src/run_quality_evaluation_README.md diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index a00624c..5e92360 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -3,8 +3,13 @@ prompt_cfg: quality_eval_cfg: # Absolute path to the directory that directly contains per-model score folders. - scores_root_dir: "/projects/aieng/public/ace/artifacts/negin_ace/scores" + scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" scores_subdir: "scores" + # List of absolute paths to prior datasets for novelty computation. + # Each path should point to a directory containing per-model score folders (same structure as scores_root_dir). + # Models must be consistent across all datasets. 
+ prior_datasets: + - "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample/math-500" exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index a7d13e2..be4c28c 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -9,7 +9,9 @@ from omegaconf import DictConfig from src.utils import ( + compute_benchmark_consistency, compute_benchmark_difficulty, + compute_benchmark_novelty, compute_benchmark_separability, ) from src.utils import constants @@ -19,6 +21,57 @@ logger = logging.getLogger(__name__) +def _collect_accuracies_from_dir(directory: str) -> List[float]: + """ + Collect all accuracy values from JSON files in a directory (recursively). + + Args: + directory: Directory to walk recursively for JSON files. + + Returns: + List of accuracy values found in the directory. + """ + accuracies: List[float] = [] + for root, _dirs, files in os.walk(directory): + for fname in files: + if not fname.endswith(".json"): + continue + json_path = os.path.join(root, fname) + acc = _extract_accuracy_from_inspect_json(json_path) + if acc is not None: + accuracies.append(acc) + return accuracies + + +def _load_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: + """ + Load model accuracies from a directory structure. + + Args: + base_dir: Directory containing per-model subdirectories with JSON files. + + Returns: + Dictionary mapping model name to average accuracy. 
+ """ + model_to_accuracy: Dict[str, float] = {} + + if not os.path.isdir(base_dir): + logger.warning("Directory does not exist: %s", base_dir) + return model_to_accuracy + + for model_name in os.listdir(base_dir): + model_dir = os.path.join(base_dir, model_name) + if not os.path.isdir(model_dir): + continue + + accuracies = _collect_accuracies_from_dir(model_dir) + if accuracies: + avg_acc = sum(accuracies) / len(accuracies) + model_to_accuracy[model_name] = avg_acc + + return model_to_accuracy + + def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: """Extract the accuracy metric from a single Inspect eval JSON file.""" try: @@ -29,6 +82,11 @@ def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: return None try: + # Check if file has results (successful evaluation) or error (failed evaluation) + if "error" in data or "results" not in data: + # File has error or no results, skip it + return None + scores = data["results"]["scores"] if not scores: return None @@ -40,7 +98,7 @@ def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: return None -@hydra.main(version_base=None, config_path="cfg", config_name="run_quality_cfg") +@hydra.main(version_base=None, config_path="cfg", config_name="run_quality_evaluation_cfg") def main(cfg: DictConfig) -> None: """ Compute benchmark-level quality metrics from saved capability scores. @@ -71,33 +129,75 @@ def main(cfg: DictConfig) -> None: # For each model directory, walk all JSON files and average their accuracies. 
model_to_accuracy: Dict[str, float] = {} + # For consistency: map model to list of accuracies per generation + model_to_generation_accuracies: Dict[str, List[float]] = {} + + # Get prior dataset names to exclude them from current dataset + prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) + prior_dataset_names = set() + for prior_path in prior_datasets: + # Extract the directory name from the path + prior_name = os.path.basename(os.path.normpath(prior_path)) + prior_dataset_names.add(prior_name) + for model_name in os.listdir(base_scores_dir): + # Skip if this is a prior dataset directory + if model_name in prior_dataset_names: + logger.debug("Skipping prior dataset directory: %s", model_name) + continue model_dir = os.path.join(base_scores_dir, model_name) if not os.path.isdir(model_dir): continue - accuracies: List[float] = [] - for root, _dirs, files in os.walk(model_dir): - for fname in files: - if not fname.endswith(".json"): - continue - json_path = os.path.join(root, fname) - acc = _extract_accuracy_from_inspect_json(json_path) - if acc is not None: - accuracies.append(acc) - - if not accuracies: - logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) - continue - - avg_acc = sum(accuracies) / len(accuracies) - model_to_accuracy[model_name] = avg_acc - logger.info( - "Model '%s' mean accuracy over %d JSON files: %.4f", - model_name, - len(accuracies), - avg_acc, - ) + # Check if model_dir contains subdirectories (generations/runs) + subdirs = [ + d for d in os.listdir(model_dir) + if os.path.isdir(os.path.join(model_dir, d)) + ] + + if subdirs: + # Structure: model_dir/generation_dir/...json files + # Each subdirectory represents a different dataset generation + generation_accuracies: List[float] = [] + for gen_dir_name in sorted(subdirs): + gen_dir = os.path.join(model_dir, gen_dir_name) + gen_accuracies = _collect_accuracies_from_dir(gen_dir) + + if gen_accuracies: + avg_gen_acc = sum(gen_accuracies) / 
len(gen_accuracies) + generation_accuracies.append(avg_gen_acc) + logger.debug( + "Model '%s' generation '%s': %.4f (from %d JSON files)", + model_name, gen_dir_name, avg_gen_acc, len(gen_accuracies) + ) + + if generation_accuracies: + model_to_generation_accuracies[model_name] = generation_accuracies + # Overall average across all generations + avg_acc = sum(generation_accuracies) / len(generation_accuracies) + model_to_accuracy[model_name] = avg_acc + logger.info( + "Model '%s' mean accuracy over %d generations: %.4f", + model_name, + len(generation_accuracies), + avg_acc, + ) + else: + # Structure: model_dir/...json files (no generation subdirectories) + accuracies = _collect_accuracies_from_dir(model_dir) + + if not accuracies: + logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) + continue + + avg_acc = sum(accuracies) / len(accuracies) + model_to_accuracy[model_name] = avg_acc + logger.info( + "Model '%s' mean accuracy over %d JSON files: %.4f", + model_name, + len(accuracies), + avg_acc, + ) if not model_to_accuracy: logger.error("No valid model accuracies found in %s", base_scores_dir) @@ -107,6 +207,41 @@ def main(cfg: DictConfig) -> None: separability = compute_benchmark_separability(model_to_accuracy) logger.info("Benchmark difficulty: %.4f", difficulty) logger.info("Benchmark separability: %.4f", separability) + + # Compute consistency if we have multiple generations per model + if model_to_generation_accuracies: + try: + consistency = compute_benchmark_consistency(model_to_generation_accuracies) + logger.info("Benchmark consistency: %.4f", consistency) + except ValueError as e: + logger.warning("Could not compute consistency: %s", e) + + # Compute novelty if prior datasets are provided + prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) + if prior_datasets: + try: + logger.info("Loading prior datasets for novelty computation...") + prior_datasets_accuracies: List[Dict[str, float]] = [] + for prior_dir 
in prior_datasets: + prior_acc = _load_model_accuracies_from_dir(prior_dir) + if prior_acc: + prior_datasets_accuracies.append(prior_acc) + logger.info( + "Loaded prior dataset from %s: %d models", + prior_dir, len(prior_acc) + ) + else: + logger.warning("No accuracies found in prior dataset: %s", prior_dir) + + if prior_datasets_accuracies: + novelty = compute_benchmark_novelty(model_to_accuracy, prior_datasets_accuracies) + logger.info("Benchmark novelty: %.4f", novelty) + else: + logger.warning("No valid prior datasets found, skipping novelty computation.") + except ValueError as e: + logger.warning("Could not compute novelty: %s", e) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing novelty: %s", e) if __name__ == "__main__": diff --git a/src/run_quality_evaluation_README.md b/src/run_quality_evaluation_README.md new file mode 100644 index 0000000..6292531 --- /dev/null +++ b/src/run_quality_evaluation_README.md @@ -0,0 +1,185 @@ +# Quality Evaluation Script + +`run_quality_evaluation.py` computes benchmark-level quality metrics from existing evaluation scores. + +## Overview + +This script analyzes model performance scores to compute several quality metrics: + +- **Difficulty**: Measures how hard the benchmark is (`1 - max(accuracy)`) +- **Separability**: Measures how well the benchmark distinguishes between models (mean absolute deviation of accuracies) +- **Consistency**: Measures stability of model performance across different dataset generations (`1 - mean(std(performance across generations))`) +- **Novelty**: Measures how much new information the dataset reveals compared to prior benchmarks (`1 - rank_correlation(predicted, actual)`) + +## Usage + +```bash +python src/run_quality_evaluation.py +``` + +The script uses Hydra for configuration management. Configuration is specified in `src/cfg/run_quality_evaluation_cfg.yaml`. 
+ +## Configuration + +Edit `src/cfg/run_quality_evaluation_cfg.yaml`: + +```yaml +quality_eval_cfg: + # Absolute path to directory containing per-model score folders + scores_root_dir: "/path/to/scores" + + # Fallback: if scores_root_dir not set, uses: + # {BASE_ARTIFACTS_DIR}/{scores_subdir}/{run_id} + scores_subdir: "scores" + + # Optional: List of prior datasets for novelty computation + prior_datasets: + - "/path/to/prior_dataset1" + - "/path/to/prior_dataset2" +``` + +## Data Structure + +The script expects a root directory containing **per-model subdirectories**. Two structures are supported: + +### Structure 1: With Multiple Generations (for Consistency) + +``` +scores_root_dir/ +├── model1/ +│ ├── generation1/ # First dataset generation +│ │ └── .../*.json files (recursively) +│ ├── generation2/ # Second dataset generation +│ │ └── .../*.json files +│ └── generation3/ +│ └── .../*.json files +├── model2/ +│ └── ... (same structure) +``` + +**Behavior:** +- Computes average accuracy **per generation** for each model +- **Consistency** is computed from generation-to-generation variation +- **Difficulty** and **Separability** use the **average across all generations** + +### Structure 2: Without Generations (Single Dataset) + +``` +scores_root_dir/ +├── model1/ +│ └── .../*.json files (recursively, any nesting allowed) +├── model2/ +│ └── .../*.json files +``` + +**Behavior:** +- Walks all JSON files recursively under each model directory +- Computes average accuracy per model +- **Consistency** is NOT computed (no generations available) +- **Difficulty** and **Separability** are computed from average accuracies + +## JSON File Format + +Each `.json` file must follow the Inspect AI evaluation format: + +```json +{ + "results": { + "scores": [ + { + "metrics": { + "accuracy": { + "value": 0.75 + } + } + } + ] + } +} +``` + +## Metrics + +### Difficulty + +Measures how difficult the benchmark is for models: + +``` +difficulty = 1 - max(accuracy across all 
models) +``` + +- Range: [0, 1] +- Higher values = harder benchmark + +### Separability + +Measures how well the benchmark distinguishes between models: + +``` +separability = mean(|accuracy_i - mean(accuracies)|) +``` + +- Range: [0, 1] +- Higher values = better model discrimination + +### Consistency + +Measures stability of model performance across dataset generations: + +``` +consistency = 1 - (1/n) * Σ std(performance(m_i) across generations) +``` + +- Range: [0, 1] +- Higher values = more stable/consistent performance +- **Only computed** when multiple generations are detected + +### Novelty + +Measures how much new information the dataset reveals compared to prior benchmarks: + +``` +1. Predict current accuracies from prior datasets using linear regression +2. Compute rank correlation between predicted and actual rankings +3. novelty = 1 - rank_correlation +``` + +- Range: [0, 1] +- Higher values = more novel/unpredictable performance patterns +- **Only computed** when `prior_datasets` are specified in config + +## Prior Datasets (for Novelty) + +Prior datasets should have the **same structure** as the main dataset. + +**Important:** Prior dataset directories should be **separate** from the main `scores_root_dir` to avoid being treated as models. 
+ +Example: +``` +data/ +├── scores_sample/ # Main dataset +│ ├── model1/ +│ └── model2/ +└── scores_sample/ + └── math-500/ # Prior dataset (separate directory) + ├── model1/ + └── model2/ +``` + +**Requirements:** +- All prior datasets must have the same set of models as the current dataset +- Models must be consistent across all datasets for novelty computation + +## Output + +The script logs all computed metrics: + +``` +[INFO] Model 'model1' mean accuracy over 3 generations: 0.7500 +[INFO] Model 'model2' mean accuracy over 3 generations: 0.6500 +[INFO] Benchmark difficulty: 0.2500 +[INFO] Benchmark separability: 0.0500 +[INFO] Benchmark consistency: 0.9200 +[INFO] Benchmark novelty: 0.5000 +``` + diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 00d0f19..b7be76a 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -6,9 +6,9 @@ """ from .data_utils import load_data -from .evaluation_utils import ( - compute_benchmark_difficulty_from_accuracies, - compute_benchmark_difficulty_from_model_scores, - compute_benchmark_separability_from_accuracies, - compute_benchmark_separability_from_model_scores, +from .quality_evaluation_utils import ( + compute_benchmark_consistency, + compute_benchmark_difficulty, + compute_benchmark_novelty, + compute_benchmark_separability, ) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index 4d0c6d6..0f5c5e0 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -2,7 +2,11 @@ from __future__ import annotations -from typing import Iterable, Mapping, Union +import statistics +from typing import Iterable, List, Mapping, Union + +import numpy as np +from scipy.stats import spearmanr def compute_benchmark_difficulty( @@ -77,3 +81,208 @@ def compute_benchmark_separability( return sum(abs_devs) / len(abs_devs) +def compute_benchmark_consistency( + model_to_generation_accuracies: Mapping[str, Iterable[float]], +) -> float: + """ + 
Compute benchmark consistency given per-model accuracies across multiple dataset generations. + + Consistency measures how stable model performance is across different dataset generations. + The consistency of a benchmark is defined as: + + CONSISTENCY(D_gen, M) = 1 - 1/n * Σ_{i=1}^n std({performance(m_i) | D_gen,j}_{j=1}^k) + + where: + - n is the number of models + - k is the number of dataset generations + - For each model m_i, we compute the standard deviation of its performance + across k dataset generations + - We average these standard deviations across all models + - We subtract from 1 to get a consistency score (higher is better) + + Args: + model_to_generation_accuracies: A mapping from model name to an iterable of + accuracy values, where each accuracy corresponds to the model's performance + on a different dataset generation. Each model should have the same number + of generations (k). + + Returns: + A float in [0.0, 1.0] representing the benchmark consistency. + Higher values indicate more consistent performance across generations. + + Raises: + ValueError: If no models are provided, or if models have inconsistent + numbers of generations, or if any model has fewer than 2 generations + (std requires at least 2 values). + + Example: + >>> model_to_accs = { + ... "model1": [0.8, 0.82, 0.79], + ... "model2": [0.7, 0.71, 0.69], + ... } + >>> consistency = compute_benchmark_consistency(model_to_accs) + """ + if not model_to_generation_accuracies: + raise ValueError("Cannot compute consistency: no models provided.") + + # Convert to lists and validate + model_accuracies = { + model: list(accuracies) + for model, accuracies in model_to_generation_accuracies.items() + } + + # Check that all models have the same number of generations + num_generations = len(next(iter(model_accuracies.values()))) + if num_generations < 2: + raise ValueError( + f"Cannot compute consistency: need at least 2 generations per model, " + f"but found {num_generations}." 
+ ) + + for model, accuracies in model_accuracies.items(): + if len(accuracies) != num_generations: + raise ValueError( + f"Inconsistent number of generations: model '{model}' has " + f"{len(accuracies)} generations, but expected {num_generations}." + ) + + # Compute standard deviation for each model across generations + model_stds = [] + for model, accuracies in model_accuracies.items(): + if len(accuracies) < 2: + raise ValueError( + f"Model '{model}' has fewer than 2 generations, cannot compute std." + ) + std_dev = statistics.stdev(accuracies) + model_stds.append(std_dev) + + # Average the standard deviations across all models + mean_std = sum(model_stds) / len(model_stds) + + # Consistency = 1 - mean_std + # Clamp to [0, 1] in case of numerical issues + consistency = max(0.0, min(1.0, 1.0 - mean_std)) + return consistency + + +def compute_benchmark_novelty( + current_accuracies: Mapping[str, float], + prior_datasets_accuracies: List[Mapping[str, float]], +) -> float: + """ + Compute benchmark novelty by comparing current dataset performance to prior datasets. + + Novelty measures how much new information a dataset reveals about existing models + over existing benchmarks. The formula is: + + NOVELTY(D_c, D_prev, M) = 1 - RANKCORR(v̂_c, v_c) + + where: + - v_c is the current dataset's accuracy vector (M×1) + - V_prev is the prior datasets' accuracy matrix (M×N) + - v̂_c = V_prev * θ* + b* (predicted from linear regression) + - RANKCORR is the rank correlation (Spearman correlation) + + If the new accuracy vector v_c is spanned by existing accuracy vectors, + RANKCORR(v_c, v̂_c) will be close to 1, resulting in low novelty. + If v_c discovers new patterns in model performance, RANKCORR(v_c, v̂_c) + will be low, resulting in high novelty. + + Args: + current_accuracies: A mapping from model name to accuracy on the current + dataset. This is v_c. 
+ prior_datasets_accuracies: A list of mappings, where each mapping contains + model name to accuracy for a prior dataset. This represents V_prev. + All mappings must contain the same set of models, and these models + must match the models in current_accuracies. + + Returns: + A float in [0.0, 1.0] representing the benchmark novelty. + Higher values indicate more novel/unique performance patterns. + + Raises: + ValueError: If no prior datasets provided, models don't match, or + regression fails (e.g., singular matrix). + + Example: + >>> current = {"model1": 0.8, "model2": 0.6, "model3": 0.7} + >>> prior1 = {"model1": 0.75, "model2": 0.65, "model3": 0.72} + >>> prior2 = {"model1": 0.78, "model2": 0.62, "model3": 0.68} + >>> novelty = compute_benchmark_novelty(current, [prior1, prior2]) + """ + if not prior_datasets_accuracies: + raise ValueError("Cannot compute novelty: no prior datasets provided.") + + # Get sorted model names to ensure consistent ordering + current_models = sorted(current_accuracies.keys()) + if not current_models: + raise ValueError("Cannot compute novelty: current_accuracies is empty.") + + # Validate that all prior datasets have the same models + for i, prior_acc in enumerate(prior_datasets_accuracies): + prior_models = sorted(prior_acc.keys()) + if set(prior_models) != set(current_models): + missing = set(current_models) - set(prior_models) + extra = set(prior_models) - set(current_models) + raise ValueError( + f"Prior dataset {i} has mismatched models. 
" + f"Missing: {missing}, Extra: {extra}" + ) + + # Build matrices: V_prev (M×N) and v_c (M×1) + # M = number of models, N = number of prior datasets + num_models = len(current_models) + num_prior = len(prior_datasets_accuracies) + + # V_prev: each column is a prior dataset's accuracies + V_prev = np.zeros((num_models, num_prior)) + for i, prior_acc in enumerate(prior_datasets_accuracies): + for j, model in enumerate(current_models): + V_prev[j, i] = prior_acc[model] + + # v_c: current dataset's accuracies + v_c = np.array([current_accuracies[model] for model in current_models]) + + # Perform linear regression: v_c = V_prev * θ + b + # We solve: min ||V_prev * θ + b - v_c||² + # To use np.linalg.lstsq, we reformulate as: [V_prev, 1] * [θ; b] = v_c + # where 1 is a column vector of ones (for the intercept b) + + # Augment design matrix with column of ones for intercept + ones = np.ones((num_models, 1)) + X = np.hstack([V_prev, ones]) + + try: + # Solve using least squares: X * params = v_c + # params = [θ; b] + params, residuals, rank, s = np.linalg.lstsq(X, v_c, rcond=None) + except np.linalg.LinAlgError as e: + raise ValueError( + f"Linear regression failed (singular matrix): {e}. " + "This may happen if prior datasets are linearly dependent." 
+ ) from e + + # Extract θ and b + theta = params[:-1] # First N elements + b = params[-1] # Last element (intercept) + + # Compute predicted values: v̂_c = V_prev * θ + b + v_pred = V_prev @ theta + b + + # Compute rank correlation (Spearman correlation) using scipy + try: + rank_corr, _p_value = spearmanr(v_c, v_pred) + except Exception as e: + raise ValueError(f"Rank correlation computation failed: {e}") from e + + # Handle edge cases: if correlation is NaN or invalid, novelty is 1.0 + # (NaN occurs when either array has no variation, meaning we can't predict) + if np.isnan(rank_corr) or not np.isfinite(rank_corr): + return 1.0 + + # Novelty = 1 - rank_correlation + # Clamp to [0, 1] in case of numerical issues (e.g., negative correlation) + novelty = max(0.0, min(1.0, 1.0 - rank_corr)) + return novelty + + From 391f3064d25683bf72a0a6f2410eee9d8a195d2c Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 8 Jan 2026 13:07:09 -0500 Subject: [PATCH 03/14] Added SYNQUE diversity metrics --- src/cfg/run_quality_evaluation_cfg.yaml | 34 +- src/run_quality_evaluation.py | 266 ++++++++++++++- src/utils/__init__.py | 3 + src/utils/diversity_metrics_dataloaders.py | 364 +++++++++++++++++++++ src/utils/quality_evaluation_utils.py | 165 ++++++++++ 5 files changed, 827 insertions(+), 5 deletions(-) create mode 100644 src/utils/diversity_metrics_dataloaders.py diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 5e92360..932b102 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -2,14 +2,40 @@ prompt_cfg: sys_msg: Compute benchmark quality metrics from existing scores. quality_eval_cfg: - # Absolute path to the directory that directly contains per-model score folders. scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" scores_subdir: "scores" - # List of absolute paths to prior datasets for novelty computation. 
- # Each path should point to a directory containing per-model score folders (same structure as scores_root_dir). - # Models must be consistent across all datasets. prior_datasets: - "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample/math-500" + + capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" + + real_data_dir: null + + real_dataloader_config: + type: "huggingface" + dataset_name: "HuggingFaceH4/MATH-500" + split: "test" + subset: null + text_field: "problem" + + # embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers + embedding_backend: "openai" + embedding_model: "text-embedding-3-large" + # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) + embedding_dimensions: 3072 + + diversity_metrics: + - "pad" + - "mmd" + - "mdm" + + pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" + + mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" + mmd_degree: 3 + + mdm_n_clusters: 5 + mdm_metric: "euclidean" exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index be4c28c..da23ad5 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -3,19 +3,33 @@ import json import logging import os -from typing import Dict, List +from typing import Any, Dict, List, Optional import hydra +import numpy as np +import torch from omegaconf import DictConfig +from src.generate_embeddings import EmbeddingGenerator, EmbeddingModelName from src.utils import ( compute_benchmark_consistency, compute_benchmark_difficulty, compute_benchmark_novelty, compute_benchmark_separability, + compute_mdm, + compute_mmd, + compute_pad, ) from src.utils import constants from src.utils.data_utils import get_run_id +from src.utils.diversity_metrics_dataloaders import ( + CapabilityDataloader, + 
HuggingFaceDatasetDataloader, + JSONLDataloader, + CSVDataloader, + DatasetDataloader, + load_texts_from_dataloader, +) logger = logging.getLogger(__name__) @@ -72,6 +86,150 @@ def _load_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: return model_to_accuracy +def _create_dataloader_from_config( + data_path: str, + dataloader_config: Dict[str, Any], +) -> DatasetDataloader: + """Create a dataloader from configuration. + + Args: + data_path: Path to the data + dataloader_config: Configuration dict with 'type' and other fields + + Returns: + DatasetDataloader instance + """ + dataloader_type = dataloader_config.get("type", "capability") + + if dataloader_type == "capability": + return CapabilityDataloader(data_path) + + elif dataloader_type == "huggingface": + from datasets import load_dataset + dataset_name = dataloader_config.get("dataset_name") + split = dataloader_config.get("split", "train") + subset = dataloader_config.get("subset", None) + dataset = load_dataset(dataset_name, name=subset, split=split) + + return HuggingFaceDatasetDataloader( + dataset=dataset, + text_field=dataloader_config.get("text_field", "problem"), + ) + + elif dataloader_type == "jsonl": + return JSONLDataloader( + jsonl_path=data_path, + name_field=dataloader_config.get("name_field", "name"), + description_field=dataloader_config.get("description_field", "description"), + area_field=dataloader_config.get("area_field"), + instructions_field=dataloader_config.get("instructions_field"), + task_field=dataloader_config.get("task_field", "problem"), + ) + + elif dataloader_type == "csv": + return CSVDataloader( + csv_path=data_path, + name_field=dataloader_config.get("name_field", "name"), + description_field=dataloader_config.get("description_field", "description"), + area_field=dataloader_config.get("area_field"), + instructions_field=dataloader_config.get("instructions_field"), + task_field=dataloader_config.get("task_field", "problem"), + ) + + else: + raise 
ValueError(f"Unknown dataloader type: {dataloader_type}")
+
+
+def _load_capabilities_and_generate_embeddings(
+    capabilities_dir: str,
+    embedding_model_name: str,
+    embed_dimensions: int,
+    dataloader_config: Optional[Dict[str, Any]] = None,
+    embedding_backend: str = "openai",
+) -> tuple[np.ndarray, List[Any]]:
+    """
+    Load capabilities from directory and generate embeddings.
+
+    Supports both capability format (default) and custom dataloaders.
+    Always uses the dataloader system for consistency.
+
+    Args:
+        capabilities_dir: Directory containing capability subdirectories OR path to data file
+        embedding_model_name: Name of embedding model to use
+        embed_dimensions: Number of embedding dimensions
+        dataloader_config: Optional configuration for custom dataloader.
+            If None, defaults to capability format.
+
+    Returns:
+        Tuple of (embeddings array, item list -- currently always an empty list)
+    """
+    # Use dataloader system: default to capability format if no config provided
+    if dataloader_config:
+        logger.info("Using custom dataloader: %s", dataloader_config.get("type", "unknown"))
+        dataloader = _create_dataloader_from_config(capabilities_dir, dataloader_config)
+    else:
+        # Default: use capability format dataloader
+        if not os.path.isdir(capabilities_dir):
+            logger.error("capabilities_dir must be a directory when using default capability format: %s", capabilities_dir)
+            return np.array([]), []
+        logger.info("Using capability format dataloader for %s", capabilities_dir)
+        dataloader = CapabilityDataloader(capabilities_dir)
+
+    # Extract texts using the dataloader
+    texts = load_texts_from_dataloader(dataloader)
+
+    if not texts:
+        logger.warning("No texts extracted from %s", capabilities_dir)
+        return np.array([]), []
+
+    logger.info("Extracted %d texts for embedding", len(texts))
+
+    # Generate embeddings
+    logger.info(
+        "Generating embeddings using %s (backend=%s)",
+        embedding_model_name,
+        embedding_backend,
+    )
+    if embedding_backend.lower() == "openai":
+        # Use existing 
OpenAI-based EmbeddingGenerator + embedding_generator = EmbeddingGenerator( + model_name=EmbeddingModelName(embedding_model_name), + embed_dimensions=embed_dimensions, + ) + embeddings = embedding_generator.generate_embeddings(texts) + embeddings_array = np.array([emb.numpy() for emb in embeddings]) + elif embedding_backend.lower() == "huggingface": + # Use HuggingFace encoder models such as gte-Qwen + try: + from sentence_transformers import SentenceTransformer # type: ignore[import] + except Exception as exc: # noqa: BLE001 + logger.error( + "Failed to import sentence_transformers for HuggingFace embeddings: %s", + exc, + ) + return np.array([]), [] + + hf_model = SentenceTransformer(embedding_model_name) + embeddings_array = hf_model.encode( + texts, + show_progress_bar=True, + convert_to_numpy=True, + ) + # Optionally warn if requested dim does not match actual dim + if embed_dimensions and embeddings_array.shape[1] != embed_dimensions: + logger.warning( + "Requested embed_dimensions=%d but HuggingFace model produced %d dims; " + "using model's native dimension.", + embed_dimensions, + embeddings_array.shape[1], + ) + else: + logger.error("Unknown embedding_backend: %s", embedding_backend) + return np.array([]), [] + + return embeddings_array, [] + + def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: """Extract the accuracy metric from a single Inspect eval JSON file.""" try: @@ -182,6 +340,8 @@ def main(cfg: DictConfig) -> None: len(generation_accuracies), avg_acc, ) + # Continue to next model if we processed subdirs + continue else: # Structure: model_dir/...json files (no generation subdirectories) accuracies = _collect_accuracies_from_dir(model_dir) @@ -242,6 +402,110 @@ def main(cfg: DictConfig) -> None: logger.warning("Could not compute novelty: %s", e) except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) + + # Compute diversity metrics if capabilities directory is provided + capabilities_dir = 
getattr(cfg.quality_eval_cfg, "capabilities_dir", None) + if capabilities_dir: + metrics_to_compute = getattr(cfg.quality_eval_cfg, "diversity_metrics", ["pad", "mmd", "mdm"]) + embedding_model = getattr(cfg.quality_eval_cfg, "embedding_model", "text-embedding-3-large") + embedding_backend = getattr(cfg.quality_eval_cfg, "embedding_backend", "openai") + embed_dimensions = getattr(cfg.quality_eval_cfg, "embedding_dimensions", 3072) + + # Get dataloader config if provided + synth_dataloader_config = getattr(cfg.quality_eval_cfg, "synthetic_dataloader_config", None) + if synth_dataloader_config: + synth_dataloader_config = dict(synth_dataloader_config) + + logger.info("Computing diversity metrics for capabilities in %s", capabilities_dir) + + # Load capabilities and generate embeddings + synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( + capabilities_dir=capabilities_dir, + embedding_model_name=embedding_model, + embed_dimensions=embed_dimensions, + dataloader_config=synth_dataloader_config, + embedding_backend=embedding_backend, + ) + + if len(synth_embeddings) == 0: + logger.warning("No embeddings generated, skipping diversity metrics") + else: + # Check if real data directory/file is provided for comparison + real_data_dir = getattr(cfg.quality_eval_cfg, "real_data_dir", None) + real_dataloader_config = getattr(cfg.quality_eval_cfg, "real_dataloader_config", None) + + # Check if we have real data: either a valid path OR a dataloader config (for HuggingFace, etc.) 
+ has_real_data = False + # Case 1: local path (capability/JSONL/CSV formats) + if real_data_dir and (os.path.isdir(real_data_dir) or os.path.isfile(real_data_dir)): + has_real_data = True + # Case 2: HuggingFace dataset via dataloader (real_data_dir may be None) + elif real_dataloader_config and real_dataloader_config.get("type") == "huggingface": + has_real_data = True + + if has_real_data: + # Get real data dataloader config if provided + if real_dataloader_config: + real_dataloader_config = dict(real_dataloader_config) + + if real_data_dir: + logger.info("Loading real data embeddings from %s", real_data_dir) + else: + logger.info("Loading real data embeddings using dataloader config (no local path)") + real_embeddings, _ = _load_capabilities_and_generate_embeddings( + # For HuggingFace, the capabilities_dir is unused; fallback to empty string + capabilities_dir=real_data_dir or "", + embedding_model_name=embedding_model, + embed_dimensions=embed_dimensions, + dataloader_config=real_dataloader_config, + embedding_backend=embedding_backend, + ) + + if len(real_embeddings) > 0: + # Compute metrics that require both synthetic and real data + if "pad" in metrics_to_compute: + try: + pad_score = compute_pad( + synth_embeddings, + real_embeddings, + classifier_name=getattr(cfg.quality_eval_cfg, "pad_classifier", "LogisticRegression"), + ) + logger.info("PAD score: %.4f", pad_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing PAD: %s", e) + + if "mmd" in metrics_to_compute: + try: + mmd_kernel = getattr(cfg.quality_eval_cfg, "mmd_kernel", "polynomial") + mmd_degree = getattr(cfg.quality_eval_cfg, "mmd_degree", 3) + mmd_score = compute_mmd( + synth_embeddings, + real_embeddings, + kernel=mmd_kernel, + degree=mmd_degree, + ) + logger.info("MMD score (%s kernel): %.4f", mmd_kernel, mmd_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing MMD: %s", e) + else: + logger.warning("No real data embeddings generated, 
skipping comparison metrics") + else: + logger.info("No real_data_dir provided, skipping PAD and MMD (require real data)") + + # Compute MDM (can be computed without real data - measures internal diversity) + if "mdm" in metrics_to_compute: + try: + mdm_n_clusters = getattr(cfg.quality_eval_cfg, "mdm_n_clusters", 5) + mdm_metric = getattr(cfg.quality_eval_cfg, "mdm_metric", "euclidean") + mdm_score = compute_mdm( + synth_embeddings, + dummy_placeholder=None, + n_clusters=mdm_n_clusters, + metric=mdm_metric, + ) + logger.info("MDM score (%d clusters, %s metric): %.4f", mdm_n_clusters, mdm_metric, mdm_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing MDM: %s", e) if __name__ == "__main__": diff --git a/src/utils/__init__.py b/src/utils/__init__.py index b7be76a..7889911 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -11,4 +11,7 @@ compute_benchmark_difficulty, compute_benchmark_novelty, compute_benchmark_separability, + compute_mdm, + compute_mmd, + compute_pad, ) diff --git a/src/utils/diversity_metrics_dataloaders.py b/src/utils/diversity_metrics_dataloaders.py new file mode 100644 index 0000000..5720203 --- /dev/null +++ b/src/utils/diversity_metrics_dataloaders.py @@ -0,0 +1,364 @@ +"""Dataloaders for extracting text from different dataset formats for diversity metrics. + +This module provides a flexible interface for loading data from different formats +and extracting the text needed for embedding generation. 
+""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional +import json +import os +import logging + +logger = logging.getLogger(__name__) + + +class DatasetDataloader(ABC): + """Abstract base class for dataloaders that extract text from datasets.""" + + @abstractmethod + def get_name(self, item: Any) -> str: + """Extract the name/title from a dataset item.""" + pass + + @abstractmethod + def get_description(self, item: Any) -> str: + """Extract the description from a dataset item.""" + pass + + def get_area(self, item: Any) -> Optional[str]: + """Extract the area/category from a dataset item (optional).""" + return None + + def get_instructions(self, item: Any) -> Optional[str]: + """Extract instructions from a dataset item (optional).""" + return None + + def get_sample_tasks(self, item: Any, max_samples: int = 5) -> List[str]: + """Extract sample tasks/problems from a dataset item (optional). + + Args: + item: The dataset item + max_samples: Maximum number of sample tasks to return + + Returns: + List of task/problem strings + """ + return [] + + def extract_text(self, item: Any, max_task_samples: int = 5) -> str: + """Extract full text representation from a dataset item. 
+ + Args: + item: The dataset item + max_task_samples: Maximum number of sample tasks to include + + Returns: + Text string suitable for embedding generation + """ + text_parts = [ + f"Name: {self.get_name(item)}", + f"Description: {self.get_description(item)}", + ] + + area = self.get_area(item) + if area: + text_parts.append(f"Area: {area}") + + instructions = self.get_instructions(item) + if instructions: + text_parts.append(f"Instructions: {instructions}") + + tasks = self.get_sample_tasks(item, max_samples=max_task_samples) + if tasks: + task_texts = [f"Task: {task}" for task in tasks] + text_parts.append("Tasks: " + " | ".join(task_texts)) + + return " | ".join(text_parts) + + +class CapabilityDataloader(DatasetDataloader): + """Dataloader for capability format (capability.json structure). + + Can handle either: + - A single capability directory (contains capability.json) + - A parent directory containing multiple capability subdirectories + """ + + def __init__(self, capability_dir: str): + """Initialize with a capability directory. + + Args: + capability_dir: Path to capability directory or parent directory with capability subdirectories + """ + self.capability_dir = capability_dir + self.capabilities = self._load_capabilities() + + def _load_capabilities(self) -> List[Dict[str, Any]]: + """Load capabilities from directory. 
+ + Returns: + List of capability data dictionaries + """ + capabilities = [] + + # Check if this is a single capability directory (has capability.json) + single_cap_json = os.path.join(self.capability_dir, "capability.json") + if os.path.exists(single_cap_json): + with open(single_cap_json, 'r') as f: + capabilities.append(json.load(f)) + return capabilities + + # Otherwise, treat as parent directory with multiple capability subdirectories + if not os.path.isdir(self.capability_dir): + raise FileNotFoundError(f"Capability directory does not exist: {self.capability_dir}") + + for item_name in os.listdir(self.capability_dir): + item_path = os.path.join(self.capability_dir, item_name) + if not os.path.isdir(item_path): + continue + + cap_json = os.path.join(item_path, "capability.json") + if os.path.exists(cap_json): + with open(cap_json, 'r') as f: + capabilities.append(json.load(f)) + + if not capabilities: + raise FileNotFoundError(f"No capabilities found in {self.capability_dir}") + + return capabilities + + def get_name(self, item: Dict[str, Any]) -> str: + return item.get("capability_name", "") + + def get_description(self, item: Dict[str, Any]) -> str: + return item.get("capability_description", "") + + def get_area(self, item: Dict[str, Any]) -> Optional[str]: + return item.get("capability_area") + + def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: + return item.get("capability_instructions") + + def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: + tasks = item.get("capability_data", []) + problems = [] + for task in tasks[:max_samples]: + if isinstance(task, dict): + problem = task.get('problem', '') + if problem: + problems.append(problem) + return problems + + +class HuggingFaceDatasetDataloader(DatasetDataloader): + """Dataloader for HuggingFace datasets. + + Simply extracts text from a specified field in each dataset item. 
+ """ + + def __init__(self, dataset, text_field: str = "problem"): + """Initialize with a HuggingFace dataset. + + Args: + dataset: HuggingFace dataset or iterable of dicts + text_field: Field name containing the text to embed (e.g., "problem", "text", "content") + """ + self.dataset = dataset + self.text_field = text_field + + def get_name(self, item: Dict[str, Any]) -> str: + return "" # Not used in simplified version + + def get_description(self, item: Dict[str, Any]) -> str: + return str(item.get(self.text_field, "")) + + def get_area(self, item: Dict[str, Any]) -> Optional[str]: + return None # Not used in simplified version + + def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: + return None # Not used in simplified version + + def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: + return [] # Not used in simplified version + + def extract_text(self, item: Any, max_task_samples: int = 5) -> str: + """Extract text from the specified field. + + Args: + item: The dataset item + max_task_samples: Ignored (kept for interface compatibility) + + Returns: + Text string from the specified field + """ + if isinstance(item, dict): + return str(item.get(self.text_field, "")) + return str(item) + + +class JSONLDataloader(DatasetDataloader): + """Dataloader for JSONL files (one JSON object per line). + + Flexible loader that can handle various JSONL formats by specifying field mappings. + """ + + def __init__(self, jsonl_path: str, name_field: str = "name", + description_field: str = "description", + area_field: Optional[str] = None, + instructions_field: Optional[str] = None, + task_field: Optional[str] = "problem"): + """Initialize with a JSONL file path. 
+ + Args: + jsonl_path: Path to JSONL file + name_field: Field name for name/title + description_field: Field name for description + area_field: Field name for area/category (optional) + instructions_field: Field name for instructions (optional) + task_field: Field name for tasks/problems (optional) + """ + self.jsonl_path = jsonl_path + self.name_field = name_field + self.description_field = description_field + self.area_field = area_field + self.instructions_field = instructions_field + self.task_field = task_field + + def get_name(self, item: Dict[str, Any]) -> str: + return str(item.get(self.name_field, "")) + + def get_description(self, item: Dict[str, Any]) -> str: + return str(item.get(self.description_field, "")) + + def get_area(self, item: Dict[str, Any]) -> Optional[str]: + if self.area_field: + return item.get(self.area_field) + return None + + def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: + if self.instructions_field: + return item.get(self.instructions_field) + return None + + def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: + if self.task_field and self.task_field in item: + task_value = item[self.task_field] + if isinstance(task_value, str): + return [task_value] + elif isinstance(task_value, list): + return [str(t) for t in task_value[:max_samples] if t] + return [] + + def load_items(self) -> List[Dict[str, Any]]: + """Load all items from the JSONL file.""" + items = [] + with open(self.jsonl_path, 'r') as f: + for line in f: + if line.strip(): + items.append(json.loads(line)) + return items + + +class CSVDataloader(DatasetDataloader): + """Dataloader for CSV files.""" + + def __init__(self, csv_path: str, name_field: str = "name", + description_field: str = "description", + area_field: Optional[str] = None, + instructions_field: Optional[str] = None, + task_field: Optional[str] = "problem"): + """Initialize with a CSV file path. 
+ + Args: + csv_path: Path to CSV file + name_field: Column name for name/title + description_field: Column name for description + area_field: Column name for area/category (optional) + instructions_field: Column name for instructions (optional) + task_field: Column name for tasks/problems (optional) + """ + import pandas as pd + self.df = pd.read_csv(csv_path) + self.name_field = name_field + self.description_field = description_field + self.area_field = area_field + self.instructions_field = instructions_field + self.task_field = task_field + + def get_name(self, item: Dict[str, Any]) -> str: + return str(item.get(self.name_field, "")) + + def get_description(self, item: Dict[str, Any]) -> str: + return str(item.get(self.description_field, "")) + + def get_area(self, item: Dict[str, Any]) -> Optional[str]: + if self.area_field and self.area_field in item: + return item.get(self.area_field) + return None + + def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: + if self.instructions_field and self.instructions_field in item: + return item.get(self.instructions_field) + return None + + def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: + if self.task_field and self.task_field in item: + task_value = item[self.task_field] + if isinstance(task_value, str): + return [task_value] + return [] + + def load_items(self) -> List[Dict[str, Any]]: + """Load all items from the CSV file.""" + return self.df.to_dict('records') + + +def load_texts_from_dataloader(dataloader: DatasetDataloader) -> List[str]: + """Extract texts from a dataloader for embedding generation. 
+ + Args: + dataloader: A DatasetDataloader instance + + Returns: + List of text strings ready for embedding + """ + texts = [] + + if isinstance(dataloader, CapabilityDataloader): + # Capability format: iterate over all capabilities + for capability_data in dataloader.capabilities: + texts.append(dataloader.extract_text(capability_data)) + elif isinstance(dataloader, HuggingFaceDatasetDataloader): + # HuggingFace dataset: iterate over items + for item in dataloader.dataset: + texts.append(dataloader.extract_text(item)) + elif isinstance(dataloader, JSONLDataloader): + # JSONL: load all items + items = dataloader.load_items() + for item in items: + texts.append(dataloader.extract_text(item)) + elif isinstance(dataloader, CSVDataloader): + # CSV: load all items + items = dataloader.load_items() + for item in items: + texts.append(dataloader.extract_text(item)) + else: + # Generic: try to iterate + try: + if hasattr(dataloader, 'dataset'): + for item in dataloader.dataset: + texts.append(dataloader.extract_text(item)) + elif hasattr(dataloader, 'load_items'): + items = dataloader.load_items() + for item in items: + texts.append(dataloader.extract_text(item)) + else: + logger.error("Dataloader does not have dataset or load_items method") + raise ValueError("Dataloader must have dataset attribute or load_items method") + except Exception as e: + logger.error(f"Could not extract texts from dataloader: {e}") + raise + + return texts + diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index 0f5c5e0..7419f7e 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -7,6 +7,19 @@ import numpy as np from scipy.stats import spearmanr +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.model_selection import train_test_split +from sklearn.metrics.pairwise import ( + 
polynomial_kernel, + rbf_kernel, + laplacian_kernel, + linear_kernel, + sigmoid_kernel, +) +import kmedoids +from sklearn.metrics import pairwise_distances def compute_benchmark_difficulty( @@ -286,3 +299,155 @@ def compute_benchmark_novelty( return novelty +# =========================== +# ---- Diversity Metrics (PAD, MMD, MDM) +# =========================== + +def compute_pad( + x_syn_emb: np.ndarray, + x_real_emb: np.ndarray, + classifier_name: str = "LogisticRegression", +) -> float: + """ + Compute the Proxy-A-Distance (PAD) between two sets of embeddings. + + PAD measures the distance between synthetic and real data distributions + by training a classifier to distinguish between them. Lower values indicate + more similar distributions. + + Args: + x_syn_emb: Embeddings of synthetic data, shape (n_samples, n_features) + x_real_emb: Embeddings of real data, shape (n_samples, n_features) + classifier_name: Classifier to use ("LogisticRegression", "RandomForest", "MLP") + + Returns: + float: PAD value (typically in range [0, 2], lower is better) + """ + y_syn_train = np.zeros(len(x_syn_emb)) + y_real_train = np.ones(len(x_real_emb)) + x_train = np.concatenate([x_syn_emb, x_real_emb], axis=0) + y_train = np.concatenate([y_syn_train, y_real_train], axis=0) + + # Split into train/validation + x_train, x_val, y_train, y_val = train_test_split( + x_train, y_train, test_size=0.2, random_state=42 + ) + + # Classifier + if classifier_name == "LogisticRegression": + classifier = LogisticRegression(random_state=42, max_iter=1000) + elif classifier_name == "RandomForest": + classifier = RandomForestClassifier(random_state=42) + elif classifier_name == "MLP": + classifier = MLPClassifier( + hidden_layer_sizes=(128, 64), + activation='relu', + max_iter=200, + random_state=42 + ) + else: + raise ValueError(f"Unknown classifier: {classifier_name}") + + classifier.fit(x_train, y_train) + y_pred_proba = classifier.predict_proba(x_val)[:, 1] + average_loss = 
np.mean(np.abs(y_pred_proba - y_val)) + return 2 * (1 - 2 * average_loss) + + +def compute_mmd( + X: np.ndarray, + Y: np.ndarray, + kernel: str = "polynomial", + degree: int = 3, + gamma: float | None = None, + coef0: float = 1, +) -> float: + """ + Compute the Maximum Mean Discrepancy (MMD) between two samples: X and Y. + + MMD measures the distance between two distributions in a reproducing kernel + Hilbert space. Lower values indicate more similar distributions. + + Args: + X: First sample, shape (n_samples_X, n_features) + Y: Second sample, shape (n_samples_Y, n_features) + kernel: Kernel name ("polynomial", "rbf", "laplacian", "linear", "sigmoid") + degree: Degree for polynomial kernel (default: 3) + gamma: Gamma parameter for kernels (default: None, auto) + coef0: Coef0 for polynomial/sigmoid kernel + + Returns: + float: MMD value (non-negative, lower is better) + """ + kernel = kernel.lower() if isinstance(kernel, str) else kernel + if kernel == "polynomial": + kfunc = polynomial_kernel + XX = kfunc(X, X, degree=degree, gamma=gamma, coef0=coef0) + YY = kfunc(Y, Y, degree=degree, gamma=gamma, coef0=coef0) + XY = kfunc(X, Y, degree=degree, gamma=gamma, coef0=coef0) + elif kernel == "rbf": + kfunc = rbf_kernel + XX = kfunc(X, X, gamma=gamma) + YY = kfunc(Y, Y, gamma=gamma) + XY = kfunc(X, Y, gamma=gamma) + elif kernel == "laplacian": + kfunc = laplacian_kernel + XX = kfunc(X, X, gamma=gamma) + YY = kfunc(Y, Y, gamma=gamma) + XY = kfunc(X, Y, gamma=gamma) + elif kernel == "linear": + kfunc = linear_kernel + XX = kfunc(X, X) + YY = kfunc(Y, Y) + XY = kfunc(X, Y) + elif kernel == "sigmoid": + kfunc = sigmoid_kernel + XX = kfunc(X, X, gamma=gamma, coef0=coef0) + YY = kfunc(Y, Y, gamma=gamma, coef0=coef0) + XY = kfunc(X, Y, gamma=gamma, coef0=coef0) + else: + raise ValueError(f"Unknown kernel: {kernel}") + return np.mean(XX) + np.mean(YY) - 2 * np.mean(XY) + + +def compute_mdm( + embeddings: np.ndarray, + dummy_placeholder: any = None, # noqa: ANN001 + n_clusters: 
int = 5, + metric: str = "euclidean", +) -> float: + """ + Compute the mean distance of points in each cluster to its medoid, then average across clusters. + + MDM measures the internal diversity/coherence of a set of embeddings by clustering + them and computing the average distance to cluster medoids. Lower values indicate + more coherent/diverse clusters. + + Args: + embeddings: Embedding matrix of shape (n_samples, n_features) + dummy_placeholder: Dummy placeholder to match the signature (unused) + n_clusters: Number of clusters/medoids to use + metric: Distance metric for KMedoids ('euclidean', 'cosine', etc.) + + Returns: + float: Mean distance to medoid (averaged over all clusters) + """ + n_samples = len(embeddings) + if n_samples < n_clusters: + n_clusters = max(1, n_samples) + + diss = pairwise_distances(embeddings, metric=metric) + pam_result = kmedoids.fasterpam(diss, n_clusters, random_state=42) + labels = pam_result.labels + medoid_indices = pam_result.medoids + + total_dist = 0.0 + for i, medoid_idx in enumerate(medoid_indices): + cluster_points_idx = np.where(labels == i)[0] + if len(cluster_points_idx) == 0: + continue + dists = diss[cluster_points_idx, medoid_idx] + total_dist += np.mean(dists) + return total_dist / n_clusters + + From 3727c44caa175c140a052c8d3fe829d111d322a6 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 8 Jan 2026 13:39:33 -0500 Subject: [PATCH 04/14] Cleaned code --- src/run_quality_evaluation.py | 1 - src/run_quality_evaluation_README.md | 185 --------------------- src/utils/diversity_metrics_dataloaders.py | 30 +--- src/utils/quality_evaluation_utils.py | 2 - 4 files changed, 7 insertions(+), 211 deletions(-) delete mode 100644 src/run_quality_evaluation_README.md diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index da23ad5..5cc3bc9 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -499,7 +499,6 @@ def main(cfg: DictConfig) -> None: mdm_metric = 
getattr(cfg.quality_eval_cfg, "mdm_metric", "euclidean") mdm_score = compute_mdm( synth_embeddings, - dummy_placeholder=None, n_clusters=mdm_n_clusters, metric=mdm_metric, ) diff --git a/src/run_quality_evaluation_README.md b/src/run_quality_evaluation_README.md deleted file mode 100644 index 6292531..0000000 --- a/src/run_quality_evaluation_README.md +++ /dev/null @@ -1,185 +0,0 @@ -# Quality Evaluation Script - -`run_quality_evaluation.py` computes benchmark-level quality metrics from existing evaluation scores. - -## Overview - -This script analyzes model performance scores to compute several quality metrics: - -- **Difficulty**: Measures how hard the benchmark is (`1 - max(accuracy)`) -- **Separability**: Measures how well the benchmark distinguishes between models (mean absolute deviation of accuracies) -- **Consistency**: Measures stability of model performance across different dataset generations (`1 - mean(std(performance across generations))`) -- **Novelty**: Measures how much new information the dataset reveals compared to prior benchmarks (`1 - rank_correlation(predicted, actual)`) - -## Usage - -```bash -python src/run_quality_evaluation.py -``` - -The script uses Hydra for configuration management. Configuration is specified in `src/cfg/run_quality_evaluation_cfg.yaml`. - -## Configuration - -Edit `src/cfg/run_quality_evaluation_cfg.yaml`: - -```yaml -quality_eval_cfg: - # Absolute path to directory containing per-model score folders - scores_root_dir: "/path/to/scores" - - # Fallback: if scores_root_dir not set, uses: - # {BASE_ARTIFACTS_DIR}/{scores_subdir}/{run_id} - scores_subdir: "scores" - - # Optional: List of prior datasets for novelty computation - prior_datasets: - - "/path/to/prior_dataset1" - - "/path/to/prior_dataset2" -``` - -## Data Structure - -The script expects a root directory containing **per-model subdirectories**. 
Two structures are supported: - -### Structure 1: With Multiple Generations (for Consistency) - -``` -scores_root_dir/ -├── model1/ -│ ├── generation1/ # First dataset generation -│ │ └── .../*.json files (recursively) -│ ├── generation2/ # Second dataset generation -│ │ └── .../*.json files -│ └── generation3/ -│ └── .../*.json files -├── model2/ -│ └── ... (same structure) -``` - -**Behavior:** -- Computes average accuracy **per generation** for each model -- **Consistency** is computed from generation-to-generation variation -- **Difficulty** and **Separability** use the **average across all generations** - -### Structure 2: Without Generations (Single Dataset) - -``` -scores_root_dir/ -├── model1/ -│ └── .../*.json files (recursively, any nesting allowed) -├── model2/ -│ └── .../*.json files -``` - -**Behavior:** -- Walks all JSON files recursively under each model directory -- Computes average accuracy per model -- **Consistency** is NOT computed (no generations available) -- **Difficulty** and **Separability** are computed from average accuracies - -## JSON File Format - -Each `.json` file must follow the Inspect AI evaluation format: - -```json -{ - "results": { - "scores": [ - { - "metrics": { - "accuracy": { - "value": 0.75 - } - } - } - ] - } -} -``` - -## Metrics - -### Difficulty - -Measures how difficult the benchmark is for models: - -``` -difficulty = 1 - max(accuracy across all models) -``` - -- Range: [0, 1] -- Higher values = harder benchmark - -### Separability - -Measures how well the benchmark distinguishes between models: - -``` -separability = mean(|accuracy_i - mean(accuracies)|) -``` - -- Range: [0, 1] -- Higher values = better model discrimination - -### Consistency - -Measures stability of model performance across dataset generations: - -``` -consistency = 1 - (1/n) * Σ std(performance(m_i) across generations) -``` - -- Range: [0, 1] -- Higher values = more stable/consistent performance -- **Only computed** when multiple generations are 
detected - -### Novelty - -Measures how much new information the dataset reveals compared to prior benchmarks: - -``` -1. Predict current accuracies from prior datasets using linear regression -2. Compute rank correlation between predicted and actual rankings -3. novelty = 1 - rank_correlation -``` - -- Range: [0, 1] -- Higher values = more novel/unpredictable performance patterns -- **Only computed** when `prior_datasets` are specified in config - -## Prior Datasets (for Novelty) - -Prior datasets should have the **same structure** as the main dataset. - -**Important:** Prior dataset directories should be **separate** from the main `scores_root_dir` to avoid being treated as models. - -Example: -``` -data/ -├── scores_sample/ # Main dataset -│ ├── model1/ -│ └── model2/ -└── scores_sample/ - └── math-500/ # Prior dataset (separate directory) - ├── model1/ - └── model2/ -``` - -**Requirements:** -- All prior datasets must have the same set of models as the current dataset -- Models must be consistent across all datasets for novelty computation - -## Output - -The script logs all computed metrics: - -``` -[INFO] Model 'model1' mean accuracy over 3 generations: 0.7500 -[INFO] Model 'model2' mean accuracy over 3 generations: 0.6500 -[INFO] Benchmark difficulty: 0.2500 -[INFO] Benchmark separability: 0.0500 -[INFO] Benchmark consistency: 0.9200 -[INFO] Benchmark novelty: 0.5000 -``` - diff --git a/src/utils/diversity_metrics_dataloaders.py b/src/utils/diversity_metrics_dataloaders.py index 5720203..5affc62 100644 --- a/src/utils/diversity_metrics_dataloaders.py +++ b/src/utils/diversity_metrics_dataloaders.py @@ -78,38 +78,23 @@ def extract_text(self, item: Any, max_task_samples: int = 5) -> str: class CapabilityDataloader(DatasetDataloader): - """Dataloader for capability format (capability.json structure). 
- - Can handle either: - - A single capability directory (contains capability.json) - - A parent directory containing multiple capability subdirectories - """ + """Dataloader for capability format (capability.json structure).""" def __init__(self, capability_dir: str): - """Initialize with a capability directory. - - Args: - capability_dir: Path to capability directory or parent directory with capability subdirectories - """ + """Initialize with a capability directory or parent directory.""" self.capability_dir = capability_dir self.capabilities = self._load_capabilities() def _load_capabilities(self) -> List[Dict[str, Any]]: - """Load capabilities from directory. - - Returns: - List of capability data dictionaries - """ + """Load capabilities from directory.""" capabilities = [] - # Check if this is a single capability directory (has capability.json) single_cap_json = os.path.join(self.capability_dir, "capability.json") if os.path.exists(single_cap_json): with open(single_cap_json, 'r') as f: capabilities.append(json.load(f)) return capabilities - # Otherwise, treat as parent directory with multiple capability subdirectories if not os.path.isdir(self.capability_dir): raise FileNotFoundError(f"Capability directory does not exist: {self.capability_dir}") @@ -168,19 +153,19 @@ def __init__(self, dataset, text_field: str = "problem"): self.text_field = text_field def get_name(self, item: Dict[str, Any]) -> str: - return "" # Not used in simplified version + return "" def get_description(self, item: Dict[str, Any]) -> str: return str(item.get(self.text_field, "")) def get_area(self, item: Dict[str, Any]) -> Optional[str]: - return None # Not used in simplified version + return None def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: - return None # Not used in simplified version + return None def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: - return [] # Not used in simplified version + return [] def extract_text(self, 
item: Any, max_task_samples: int = 5) -> str: """Extract text from the specified field. @@ -326,7 +311,6 @@ def load_texts_from_dataloader(dataloader: DatasetDataloader) -> List[str]: texts = [] if isinstance(dataloader, CapabilityDataloader): - # Capability format: iterate over all capabilities for capability_data in dataloader.capabilities: texts.append(dataloader.extract_text(capability_data)) elif isinstance(dataloader, HuggingFaceDatasetDataloader): diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index 7419f7e..ea7e75c 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -412,7 +412,6 @@ def compute_mmd( def compute_mdm( embeddings: np.ndarray, - dummy_placeholder: any = None, # noqa: ANN001 n_clusters: int = 5, metric: str = "euclidean", ) -> float: @@ -425,7 +424,6 @@ def compute_mdm( Args: embeddings: Embedding matrix of shape (n_samples, n_features) - dummy_placeholder: Dummy placeholder to match the signature (unused) n_clusters: Number of clusters/medoids to use metric: Distance metric for KMedoids ('euclidean', 'cosine', etc.) 
From 5859941f5fe34741e8655d4293500b6acc2b7dba Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 16 Jan 2026 12:58:32 -0500 Subject: [PATCH 05/14] Added InfoSynth metrics --- src/cfg/run_quality_evaluation_cfg.yaml | 18 ++++- src/run_quality_evaluation.py | 66 ++++++++++----- src/utils/__init__.py | 2 + src/utils/quality_evaluation_utils.py | 103 ++++++++++++++++++++++++ 4 files changed, 165 insertions(+), 24 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 932b102..144ca74 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -24,10 +24,16 @@ quality_eval_cfg: # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) embedding_dimensions: 3072 - diversity_metrics: - - "pad" - - "mmd" - - "mdm" + # Internal diversity metrics (only need synthetic data) + internal_diversity_metrics: + - "mdm" # Mean Distance to Medoid - measures internal coherence + - "entropy" # Differential Entropy - measures diversity/uncertainty + + # Comparison metrics (need both synthetic and real data) + comparison_metrics: + - "pad" # Proxy-A-Distance - measures distribution similarity + - "mmd" # Maximum Mean Discrepancy - measures distribution distance + - "kl_divergence" # KL Divergence - measures novelty (how different from real) pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" @@ -36,6 +42,10 @@ quality_eval_cfg: mdm_n_clusters: 5 mdm_metric: "euclidean" + + entropy_k: 4 # Number of nearest neighbors for differential entropy computation + + kl_k: 4 # Number of nearest neighbors for KL divergence computation exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 5cc3bc9..8bab276 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -16,6 +16,8 @@ compute_benchmark_difficulty, compute_benchmark_novelty, 
compute_benchmark_separability, + compute_differential_entropy, + compute_kl_divergence, compute_mdm, compute_mmd, compute_pad, @@ -346,18 +348,18 @@ def main(cfg: DictConfig) -> None: # Structure: model_dir/...json files (no generation subdirectories) accuracies = _collect_accuracies_from_dir(model_dir) - if not accuracies: - logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) - continue + if not accuracies: + logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) + continue - avg_acc = sum(accuracies) / len(accuracies) - model_to_accuracy[model_name] = avg_acc - logger.info( - "Model '%s' mean accuracy over %d JSON files: %.4f", - model_name, - len(accuracies), - avg_acc, - ) + avg_acc = sum(accuracies) / len(accuracies) + model_to_accuracy[model_name] = avg_acc + logger.info( + "Model '%s' mean accuracy over %d JSON files: %.4f", + model_name, + len(accuracies), + avg_acc, + ) if not model_to_accuracy: logger.error("No valid model accuracies found in %s", base_scores_dir) @@ -403,10 +405,11 @@ def main(cfg: DictConfig) -> None: except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) - # Compute diversity metrics if capabilities directory is provided + # Compute embedding-based metrics if capabilities directory is provided capabilities_dir = getattr(cfg.quality_eval_cfg, "capabilities_dir", None) if capabilities_dir: - metrics_to_compute = getattr(cfg.quality_eval_cfg, "diversity_metrics", ["pad", "mmd", "mdm"]) + internal_diversity_metrics = getattr(cfg.quality_eval_cfg, "internal_diversity_metrics", ["mdm", "entropy"]) + comparison_metrics = getattr(cfg.quality_eval_cfg, "comparison_metrics", ["pad", "mmd", "kl_divergence"]) embedding_model = getattr(cfg.quality_eval_cfg, "embedding_model", "text-embedding-3-large") embedding_backend = getattr(cfg.quality_eval_cfg, "embedding_backend", "openai") embed_dimensions = getattr(cfg.quality_eval_cfg, "embedding_dimensions", 3072) 
@@ -416,7 +419,7 @@ def main(cfg: DictConfig) -> None: if synth_dataloader_config: synth_dataloader_config = dict(synth_dataloader_config) - logger.info("Computing diversity metrics for capabilities in %s", capabilities_dir) + logger.info("Computing embedding-based metrics for capabilities in %s", capabilities_dir) # Load capabilities and generate embeddings synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( @@ -462,8 +465,8 @@ def main(cfg: DictConfig) -> None: ) if len(real_embeddings) > 0: - # Compute metrics that require both synthetic and real data - if "pad" in metrics_to_compute: + # Compute comparison metrics that require both synthetic and real data + if "pad" in comparison_metrics: try: pad_score = compute_pad( synth_embeddings, @@ -474,7 +477,7 @@ def main(cfg: DictConfig) -> None: except Exception as e: # noqa: BLE001 logger.warning("Error computing PAD: %s", e) - if "mmd" in metrics_to_compute: + if "mmd" in comparison_metrics: try: mmd_kernel = getattr(cfg.quality_eval_cfg, "mmd_kernel", "polynomial") mmd_degree = getattr(cfg.quality_eval_cfg, "mmd_degree", 3) @@ -487,13 +490,25 @@ def main(cfg: DictConfig) -> None: logger.info("MMD score (%s kernel): %.4f", mmd_kernel, mmd_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing MMD: %s", e) + + if "kl_divergence" in comparison_metrics: + try: + kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) + kl_score = compute_kl_divergence( + synth_embeddings, + real_embeddings, + k=kl_k, + ) + logger.info("KL divergence score (k=%d): %.4f", kl_k, kl_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing KL divergence: %s", e) else: logger.warning("No real data embeddings generated, skipping comparison metrics") else: - logger.info("No real_data_dir provided, skipping PAD and MMD (require real data)") + logger.info("No real_data_dir provided, skipping comparison metrics (require real data)") - # Compute MDM (can be computed without real data 
- measures internal diversity) - if "mdm" in metrics_to_compute: + # Compute internal diversity metrics (only need synthetic data) + if "mdm" in internal_diversity_metrics: try: mdm_n_clusters = getattr(cfg.quality_eval_cfg, "mdm_n_clusters", 5) mdm_metric = getattr(cfg.quality_eval_cfg, "mdm_metric", "euclidean") @@ -505,6 +520,17 @@ def main(cfg: DictConfig) -> None: logger.info("MDM score (%d clusters, %s metric): %.4f", mdm_n_clusters, mdm_metric, mdm_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing MDM: %s", e) + + if "entropy" in internal_diversity_metrics: + try: + entropy_k = getattr(cfg.quality_eval_cfg, "entropy_k", 4) + entropy_score = compute_differential_entropy( + synth_embeddings, + k=entropy_k, + ) + logger.info("Differential entropy score (k=%d): %.4f", entropy_k, entropy_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing differential entropy: %s", e) if __name__ == "__main__": diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 7889911..3c562f3 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -11,6 +11,8 @@ compute_benchmark_difficulty, compute_benchmark_novelty, compute_benchmark_separability, + compute_differential_entropy, + compute_kl_divergence, compute_mdm, compute_mmd, compute_pad, diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index ea7e75c..623e6e2 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -7,6 +7,7 @@ import numpy as np from scipy.stats import spearmanr +from scipy.special import digamma, gammaln from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier @@ -18,6 +19,7 @@ linear_kernel, sigmoid_kernel, ) +from sklearn.neighbors import NearestNeighbors import kmedoids from sklearn.metrics import pairwise_distances @@ -449,3 +451,104 @@ def compute_mdm( return 
total_dist / n_clusters +# =========================== +# ---- Information-Theoretic Metrics (Entropy, KL-Divergence) +# =========================== + +def compute_differential_entropy( + embeddings: np.ndarray, + k: int = 4, +) -> float: + """ + Compute the differential entropy of a set of embeddings using k-nearest neighbors. + + Differential entropy measures the diversity/uncertainty in the embedding distribution. + Higher values indicate more diverse data. + + This implementation uses the k-NN estimator for differential entropy: + H(X) ≈ digamma(N) - digamma(k) + log(volume) + d * mean(log(eps)) + + where: + - N is the number of samples + - d is the embedding dimension + - k is the number of neighbors + - eps is the distance to the k-th nearest neighbor + + Args: + embeddings: Embedding matrix of shape (n_samples, n_features) + k: Number of nearest neighbors to use (default: 4) + + Returns: + float: Differential entropy value (higher is more diverse) + """ + N, d = embeddings.shape + if N < k + 1: + raise ValueError( + f"Cannot compute entropy: need at least {k + 1} samples, but got {N}." + ) + + nbrs = NearestNeighbors(n_neighbors=k + 1).fit(embeddings) + distances, _ = nbrs.kneighbors(embeddings) + eps = distances[:, -1] + eps[eps == 0] = np.nextafter(0, 1) + + log_vol = (d / 2) * np.log(np.pi) - gammaln(d / 2 + 1) + entropy = digamma(N) - digamma(k) + log_vol + d * np.mean(np.log(eps)) + return float(entropy) + + +def compute_kl_divergence( + p_embeddings: np.ndarray, + q_embeddings: np.ndarray, + k: int = 4, + eps: float = 1e-10, +) -> float: + """ + Compute the KL divergence between two sets of embeddings using k-nearest neighbors. + + KL divergence measures how different distribution P is from distribution Q. + Higher values indicate more novelty (P is more different from Q). 
+ + This implementation uses the k-NN estimator for KL divergence: + KL(P||Q) ≈ (d/n) * sum(log(nu/rho)) + log(m/(n-1)) + + where: + - P is the distribution of p_embeddings (n samples) + - Q is the distribution of q_embeddings (m samples) + - d is the embedding dimension + - rho is the distance to the k-th nearest neighbor in P + - nu is the distance to the k-th nearest neighbor in Q + + Args: + p_embeddings: Embeddings of distribution P, shape (n_samples_p, n_features) + q_embeddings: Embeddings of distribution Q, shape (n_samples_q, n_features) + k: Number of nearest neighbors to use (default: 4) + eps: Small epsilon to avoid division by zero (default: 1e-10) + + Returns: + float: KL divergence value (higher is more novel/different) + """ + n, d = p_embeddings.shape + m, _ = q_embeddings.shape + + if n < k + 1: + raise ValueError( + f"Cannot compute KL divergence: P needs at least {k + 1} samples, but got {n}." + ) + if m < k: + raise ValueError( + f"Cannot compute KL divergence: Q needs at least {k} samples, but got {m}." 
+ ) + + # Find k-th nearest neighbor in P for each point in P + nbrs_p = NearestNeighbors(n_neighbors=k + 1).fit(p_embeddings) + rho = np.maximum(nbrs_p.kneighbors(p_embeddings)[0][:, k], eps) + + # Find k-th nearest neighbor in Q for each point in P + nbrs_q = NearestNeighbors(n_neighbors=k).fit(q_embeddings) + nu = np.maximum(nbrs_q.kneighbors(p_embeddings)[0][:, k - 1], eps) + + kl_div = (d / n) * np.sum(np.log(nu / rho)) + np.log(m / (n - 1)) + return float(kl_div) + + From 24c45f81cad9371ff11d992b970074c32254bb5e Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Wed, 28 Jan 2026 14:34:09 -0500 Subject: [PATCH 06/14] To the PR comments --- src/run_quality_evaluation.py | 26 +++++- src/utils/quality_evaluation_utils.py | 113 +++++++++++++++++++++++++- 2 files changed, 134 insertions(+), 5 deletions(-) diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 8bab276..233c86e 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -59,7 +59,7 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: return accuracies -def _load_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: +def _load_avg_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: """ Load model accuracies from a directory structure. 
@@ -385,7 +385,7 @@ def main(cfg: DictConfig) -> None: logger.info("Loading prior datasets for novelty computation...") prior_datasets_accuracies: List[Dict[str, float]] = [] for prior_dir in prior_datasets: - prior_acc = _load_model_accuracies_from_dir(prior_dir) + prior_acc = _load_avg_model_accuracies_from_dir(prior_dir) if prior_acc: prior_datasets_accuracies.append(prior_acc) logger.info( @@ -494,12 +494,21 @@ def main(cfg: DictConfig) -> None: if "kl_divergence" in comparison_metrics: try: kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) + umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) + umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) + umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) + umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") kl_score = compute_kl_divergence( synth_embeddings, real_embeddings, k=kl_k, + umap_n_components=umap_n_components, + umap_n_neighbors=umap_n_neighbors, + umap_min_dist=umap_min_dist, + umap_metric=umap_metric, ) - logger.info("KL divergence score (k=%d): %.4f", kl_k, kl_score) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing KL divergence: %s", e) else: @@ -524,11 +533,20 @@ def main(cfg: DictConfig) -> None: if "entropy" in internal_diversity_metrics: try: entropy_k = getattr(cfg.quality_eval_cfg, "entropy_k", 4) + umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) + umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) + umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) + umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") entropy_score = compute_differential_entropy( synth_embeddings, k=entropy_k, + umap_n_components=umap_n_components, + umap_n_neighbors=umap_n_neighbors, + 
umap_min_dist=umap_min_dist, + umap_metric=umap_metric, ) - logger.info("Differential entropy score (k=%d): %.4f", entropy_k, entropy_score) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + logger.info("Differential entropy score (k=%d)%s: %.4f", entropy_k, umap_info, entropy_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing differential entropy: %s", e) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index 623e6e2..dcebafb 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -3,7 +3,8 @@ from __future__ import annotations import statistics -from typing import Iterable, List, Mapping, Union +import warnings +from typing import Iterable, List, Mapping, Optional, Union import numpy as np from scipy.stats import spearmanr @@ -23,7 +24,16 @@ import kmedoids from sklearn.metrics import pairwise_distances +# Optional UMAP import +try: + from umap import UMAP + UMAP_AVAILABLE = True +except ImportError: + UMAP_AVAILABLE = False + UMAP = None + +# Source paper: AutoBencher - https://arxiv.org/abs/2407.08351 def compute_benchmark_difficulty( accuracies: Union[Iterable[float], Mapping[str, float]], ) -> float: @@ -60,6 +70,7 @@ def compute_benchmark_difficulty( return 1.0 - best_acc +# Source paper: AutoBencher - https://arxiv.org/abs/2407.08351 def compute_benchmark_separability( accuracies: Union[Iterable[float], Mapping[str, float]], ) -> float: @@ -96,6 +107,7 @@ def compute_benchmark_separability( return sum(abs_devs) / len(abs_devs) +# Source paper: Data Swarms - https://arxiv.org/abs/2506.00741 def compute_benchmark_consistency( model_to_generation_accuracies: Mapping[str, Iterable[float]], ) -> float: @@ -180,6 +192,7 @@ def compute_benchmark_consistency( return consistency +# Source paper: AutoBencher - https://arxiv.org/abs/2407.08351 def compute_benchmark_novelty( current_accuracies: Mapping[str, float], 
prior_datasets_accuracies: List[Mapping[str, float]], @@ -305,6 +318,7 @@ def compute_benchmark_novelty( # ---- Diversity Metrics (PAD, MMD, MDM) # =========================== +# Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_pad( x_syn_emb: np.ndarray, x_real_emb: np.ndarray, @@ -356,6 +370,7 @@ def compute_pad( return 2 * (1 - 2 * average_loss) +# Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_mmd( X: np.ndarray, Y: np.ndarray, @@ -412,6 +427,7 @@ def compute_mmd( return np.mean(XX) + np.mean(YY) - 2 * np.mean(XY) +# Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_mdm( embeddings: np.ndarray, n_clusters: int = 5, @@ -455,9 +471,66 @@ def compute_mdm( # ---- Information-Theoretic Metrics (Entropy, KL-Divergence) # =========================== +def _apply_umap_reduction( + embeddings: np.ndarray, + n_components: Optional[int] = None, + n_neighbors: int = 15, + min_dist: float = 0.1, + metric: str = "cosine", +) -> np.ndarray: + """ + Optionally apply UMAP dimensionality reduction to embeddings. + + Args: + embeddings: Embedding matrix of shape (n_samples, n_features) + n_components: Target dimension. If None, returns original embeddings. + n_neighbors: Number of neighbors for UMAP (default: 15) + min_dist: Minimum distance for UMAP (default: 0.1) + metric: Distance metric for UMAP (default: "cosine") + + Returns: + Reduced embeddings if n_components is provided, otherwise original embeddings + """ + if n_components is None: + return embeddings + + if not UMAP_AVAILABLE: + raise ImportError( + "UMAP is required for dimensionality reduction. 
" + "Install it with: pip install umap-learn" + ) + + if embeddings.shape[1] <= n_components: + # Already at or below target dimension + return embeddings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + umap_model = UMAP( + n_neighbors=n_neighbors, + min_dist=min_dist, + n_components=n_components, + metric=metric, + random_state=42, + ) + reduced_embeddings = umap_model.fit_transform(embeddings) + + # Renormalize (like InfoSynth does) + norms = np.linalg.norm(reduced_embeddings, axis=1, keepdims=True) + eps = 1e-12 + reduced_embeddings = reduced_embeddings / (norms + eps) + + return reduced_embeddings + + +# Source paper: InfoSyth - https://arxiv.org/abs/2601.00575 def compute_differential_entropy( embeddings: np.ndarray, k: int = 4, + umap_n_components: Optional[int] = None, + umap_n_neighbors: int = 15, + umap_min_dist: float = 0.1, + umap_metric: str = "cosine", ) -> float: """ Compute the differential entropy of a set of embeddings using k-nearest neighbors. @@ -477,10 +550,24 @@ def compute_differential_entropy( Args: embeddings: Embedding matrix of shape (n_samples, n_features) k: Number of nearest neighbors to use (default: 4) + umap_n_components: Optional UMAP target dimension. If None, uses original embeddings. 
+ umap_n_neighbors: Number of neighbors for UMAP (default: 15) + umap_min_dist: Minimum distance for UMAP (default: 0.1) + umap_metric: Distance metric for UMAP (default: "cosine") Returns: float: Differential entropy value (higher is more diverse) """ + # Apply UMAP reduction if requested + if umap_n_components is not None: + embeddings = _apply_umap_reduction( + embeddings, + n_components=umap_n_components, + n_neighbors=umap_n_neighbors, + min_dist=umap_min_dist, + metric=umap_metric, + ) + N, d = embeddings.shape if N < k + 1: raise ValueError( @@ -497,11 +584,16 @@ def compute_differential_entropy( return float(entropy) +# Source paper: InfoSynth - https://arxiv.org/abs/2601.00575 def compute_kl_divergence( p_embeddings: np.ndarray, q_embeddings: np.ndarray, k: int = 4, eps: float = 1e-10, + umap_n_components: Optional[int] = None, + umap_n_neighbors: int = 15, + umap_min_dist: float = 0.1, + umap_metric: str = "cosine", ) -> float: """ Compute the KL divergence between two sets of embeddings using k-nearest neighbors. @@ -524,10 +616,29 @@ def compute_kl_divergence( q_embeddings: Embeddings of distribution Q, shape (n_samples_q, n_features) k: Number of nearest neighbors to use (default: 4) eps: Small epsilon to avoid division by zero (default: 1e-10) + umap_n_components: Optional UMAP target dimension. If None, uses original embeddings. 
+ umap_n_neighbors: Number of neighbors for UMAP (default: 15) + umap_min_dist: Minimum distance for UMAP (default: 0.1) + umap_metric: Distance metric for UMAP (default: "cosine") Returns: float: KL divergence value (higher is more novel/different) """ + # Apply UMAP reduction if requested (apply to both embeddings together for consistency) + if umap_n_components is not None: + # Stack embeddings, apply UMAP, then split back + # This ensures both distributions are reduced in the same space + combined_embeddings = np.vstack([p_embeddings, q_embeddings]) + reduced_combined = _apply_umap_reduction( + combined_embeddings, + n_components=umap_n_components, + n_neighbors=umap_n_neighbors, + min_dist=umap_min_dist, + metric=umap_metric, + ) + p_embeddings = reduced_combined[:len(p_embeddings)] + q_embeddings = reduced_combined[len(p_embeddings):] + n, d = p_embeddings.shape m, _ = q_embeddings.shape From 9cec4f88c1aa099574d18a4be2976c7c9eeffb4b Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 30 Jan 2026 08:40:55 -0500 Subject: [PATCH 07/14] Updated UMAP --- src/run_quality_evaluation.py | 184 ++++++++++++++------------ src/utils/__init__.py | 1 + src/utils/quality_evaluation_utils.py | 130 ++++++------------ 3 files changed, 146 insertions(+), 169 deletions(-) diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 233c86e..3be6a25 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -21,6 +21,7 @@ compute_mdm, compute_mmd, compute_pad, + fit_umap_shared, ) from src.utils import constants from src.utils.data_utils import get_run_id @@ -40,10 +41,10 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: """ Collect all accuracy values from JSON files in a directory (recursively). - + Args: directory: Directory to walk recursively for JSON files. - + Returns: List of accuracy values found in the directory. 
""" @@ -62,29 +63,29 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: def _load_avg_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: """ Load model accuracies from a directory structure. - + Args: base_dir: Directory containing per-model subdirectories with JSON files. - + Returns: Dictionary mapping model name to average accuracy. """ model_to_accuracy: Dict[str, float] = {} - + if not os.path.isdir(base_dir): logger.warning("Directory does not exist: %s", base_dir) return model_to_accuracy - + for model_name in os.listdir(base_dir): model_dir = os.path.join(base_dir, model_name) if not os.path.isdir(model_dir): continue - + accuracies = _collect_accuracies_from_dir(model_dir) if accuracies: avg_acc = sum(accuracies) / len(accuracies) model_to_accuracy[model_name] = avg_acc - + return model_to_accuracy @@ -93,31 +94,31 @@ def _create_dataloader_from_config( dataloader_config: Dict[str, Any], ) -> DatasetDataloader: """Create a dataloader from configuration. 
- + Args: data_path: Path to the data dataloader_config: Configuration dict with 'type' and other fields - + Returns: DatasetDataloader instance """ dataloader_type = dataloader_config.get("type", "capability") - + if dataloader_type == "capability": return CapabilityDataloader(data_path) - + elif dataloader_type == "huggingface": from datasets import load_dataset dataset_name = dataloader_config.get("dataset_name") split = dataloader_config.get("split", "train") subset = dataloader_config.get("subset", None) dataset = load_dataset(dataset_name, name=subset, split=split) - + return HuggingFaceDatasetDataloader( dataset=dataset, text_field=dataloader_config.get("text_field", "problem"), ) - + elif dataloader_type == "jsonl": return JSONLDataloader( jsonl_path=data_path, @@ -127,7 +128,7 @@ def _create_dataloader_from_config( instructions_field=dataloader_config.get("instructions_field"), task_field=dataloader_config.get("task_field", "problem"), ) - + elif dataloader_type == "csv": return CSVDataloader( csv_path=data_path, @@ -137,7 +138,7 @@ def _create_dataloader_from_config( instructions_field=dataloader_config.get("instructions_field"), task_field=dataloader_config.get("task_field", "problem"), ) - + else: raise ValueError(f"Unknown dataloader type: {dataloader_type}") @@ -148,22 +149,22 @@ def _load_capabilities_and_generate_embeddings( embed_dimensions: int, dataloader_config: Optional[Dict[str, Any]] = None, embedding_backend: str = "openai", -) -> tuple[np.ndarray, List[Any]]: +) -> tuple[np.ndarray, List[str]]: """ Load capabilities from directory and generate embeddings. - + Supports both capability format (default) and custom dataloaders. Always uses the dataloader system for consistency. - + Args: capabilities_dir: Directory containing capability subdirectories OR path to data file embedding_model_name: Name of embedding model to use embed_dimensions: Number of embedding dimensions dataloader_config: Optional configuration for custom dataloader. 
If None, defaults to capability format. - + Returns: - Tuple of (embeddings array, list of items/capabilities) + Tuple of (embeddings array, list of extracted texts) """ # Use dataloader system: default to capability format if no config provided if dataloader_config: @@ -176,16 +177,16 @@ def _load_capabilities_and_generate_embeddings( return np.array([]), [] logger.info("Using capability format dataloader for %s", capabilities_dir) dataloader = CapabilityDataloader(capabilities_dir) - + # Extract texts using the dataloader texts = load_texts_from_dataloader(dataloader) - + if not texts: logger.warning("No texts extracted from %s", capabilities_dir) return np.array([]), [] - + logger.info("Extracted %d texts for embedding", len(texts)) - + # Generate embeddings logger.info( "Generating embeddings using %s (backend=%s)", @@ -210,7 +211,7 @@ def _load_capabilities_and_generate_embeddings( exc, ) return np.array([]), [] - + hf_model = SentenceTransformer(embedding_model_name) embeddings_array = hf_model.encode( texts, @@ -228,8 +229,8 @@ def _load_capabilities_and_generate_embeddings( else: logger.error("Unknown embedding_backend: %s", embedding_backend) return np.array([]), [] - - return embeddings_array, [] + + return embeddings_array, texts def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: @@ -246,7 +247,7 @@ def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: if "error" in data or "results" not in data: # File has error or no results, skip it return None - + scores = data["results"]["scores"] if not scores: return None @@ -291,7 +292,7 @@ def main(cfg: DictConfig) -> None: model_to_accuracy: Dict[str, float] = {} # For consistency: map model to list of accuracies per generation model_to_generation_accuracies: Dict[str, List[float]] = {} - + # Get prior dataset names to exclude them from current dataset prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) prior_dataset_names = set() @@ -299,7 +300,7 @@ def 
main(cfg: DictConfig) -> None: # Extract the directory name from the path prior_name = os.path.basename(os.path.normpath(prior_path)) prior_dataset_names.add(prior_name) - + for model_name in os.listdir(base_scores_dir): # Skip if this is a prior dataset directory if model_name in prior_dataset_names: @@ -314,7 +315,7 @@ def main(cfg: DictConfig) -> None: d for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) ] - + if subdirs: # Structure: model_dir/generation_dir/...json files # Each subdirectory represents a different dataset generation @@ -322,7 +323,7 @@ def main(cfg: DictConfig) -> None: for gen_dir_name in sorted(subdirs): gen_dir = os.path.join(model_dir, gen_dir_name) gen_accuracies = _collect_accuracies_from_dir(gen_dir) - + if gen_accuracies: avg_gen_acc = sum(gen_accuracies) / len(gen_accuracies) generation_accuracies.append(avg_gen_acc) @@ -330,7 +331,7 @@ def main(cfg: DictConfig) -> None: "Model '%s' generation '%s': %.4f (from %d JSON files)", model_name, gen_dir_name, avg_gen_acc, len(gen_accuracies) ) - + if generation_accuracies: model_to_generation_accuracies[model_name] = generation_accuracies # Overall average across all generations @@ -369,7 +370,7 @@ def main(cfg: DictConfig) -> None: separability = compute_benchmark_separability(model_to_accuracy) logger.info("Benchmark difficulty: %.4f", difficulty) logger.info("Benchmark separability: %.4f", separability) - + # Compute consistency if we have multiple generations per model if model_to_generation_accuracies: try: @@ -377,7 +378,7 @@ def main(cfg: DictConfig) -> None: logger.info("Benchmark consistency: %.4f", consistency) except ValueError as e: logger.warning("Could not compute consistency: %s", e) - + # Compute novelty if prior datasets are provided prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) if prior_datasets: @@ -394,7 +395,7 @@ def main(cfg: DictConfig) -> None: ) else: logger.warning("No accuracies found in prior dataset: %s", prior_dir) 
- + if prior_datasets_accuracies: novelty = compute_benchmark_novelty(model_to_accuracy, prior_datasets_accuracies) logger.info("Benchmark novelty: %.4f", novelty) @@ -404,7 +405,7 @@ def main(cfg: DictConfig) -> None: logger.warning("Could not compute novelty: %s", e) except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) - + # Compute embedding-based metrics if capabilities directory is provided capabilities_dir = getattr(cfg.quality_eval_cfg, "capabilities_dir", None) if capabilities_dir: @@ -413,14 +414,14 @@ def main(cfg: DictConfig) -> None: embedding_model = getattr(cfg.quality_eval_cfg, "embedding_model", "text-embedding-3-large") embedding_backend = getattr(cfg.quality_eval_cfg, "embedding_backend", "openai") embed_dimensions = getattr(cfg.quality_eval_cfg, "embedding_dimensions", 3072) - + # Get dataloader config if provided synth_dataloader_config = getattr(cfg.quality_eval_cfg, "synthetic_dataloader_config", None) if synth_dataloader_config: synth_dataloader_config = dict(synth_dataloader_config) - + logger.info("Computing embedding-based metrics for capabilities in %s", capabilities_dir) - + # Load capabilities and generate embeddings synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( capabilities_dir=capabilities_dir, @@ -429,14 +430,15 @@ def main(cfg: DictConfig) -> None: dataloader_config=synth_dataloader_config, embedding_backend=embedding_backend, ) - + if len(synth_embeddings) == 0: logger.warning("No embeddings generated, skipping diversity metrics") else: + real_embeddings = None # Check if real data directory/file is provided for comparison real_data_dir = getattr(cfg.quality_eval_cfg, "real_data_dir", None) real_dataloader_config = getattr(cfg.quality_eval_cfg, "real_dataloader_config", None) - + # Check if we have real data: either a valid path OR a dataloader config (for HuggingFace, etc.) 
has_real_data = False # Case 1: local path (capability/JSONL/CSV formats) @@ -445,12 +447,12 @@ def main(cfg: DictConfig) -> None: # Case 2: HuggingFace dataset via dataloader (real_data_dir may be None) elif real_dataloader_config and real_dataloader_config.get("type") == "huggingface": has_real_data = True - + if has_real_data: # Get real data dataloader config if provided if real_dataloader_config: real_dataloader_config = dict(real_dataloader_config) - + if real_data_dir: logger.info("Loading real data embeddings from %s", real_data_dir) else: @@ -463,7 +465,7 @@ def main(cfg: DictConfig) -> None: dataloader_config=real_dataloader_config, embedding_backend=embedding_backend, ) - + if len(real_embeddings) > 0: # Compute comparison metrics that require both synthetic and real data if "pad" in comparison_metrics: @@ -476,7 +478,7 @@ def main(cfg: DictConfig) -> None: logger.info("PAD score: %.4f", pad_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing PAD: %s", e) - + if "mmd" in comparison_metrics: try: mmd_kernel = getattr(cfg.quality_eval_cfg, "mmd_kernel", "polynomial") @@ -490,32 +492,60 @@ def main(cfg: DictConfig) -> None: logger.info("MMD score (%s kernel): %.4f", mmd_kernel, mmd_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing MMD: %s", e) - - if "kl_divergence" in comparison_metrics: - try: - kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) - umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) - umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) - umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) - umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") - kl_score = compute_kl_divergence( - synth_embeddings, - real_embeddings, - k=kl_k, - umap_n_components=umap_n_components, - umap_n_neighbors=umap_n_neighbors, - umap_min_dist=umap_min_dist, - umap_metric=umap_metric, - ) - umap_info = f" (UMAP: {umap_n_components}D)" if 
umap_n_components else "" - logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing KL divergence: %s", e) else: logger.warning("No real data embeddings generated, skipping comparison metrics") else: logger.info("No real_data_dir provided, skipping comparison metrics (require real data)") - + + # Joint UMAP (InfoSynth-style): fit on all datasets so entropy/KL are in a shared space + umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) + umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) + umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) + umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") + need_joint_umap = ( + umap_n_components is not None + and ( + "entropy" in internal_diversity_metrics + or ( + "kl_divergence" in comparison_metrics + and real_embeddings is not None + and len(real_embeddings) > 0 + ) + ) + ) + synth_reduced = None + real_reduced = None + if need_joint_umap: + all_emb = [synth_embeddings] + if real_embeddings is not None and len(real_embeddings) > 0: + all_emb.append(real_embeddings) + reduced_list = fit_umap_shared( + all_emb, + umap_n_components, + n_neighbors=umap_n_neighbors, + min_dist=umap_min_dist, + metric=umap_metric, + ) + synth_reduced = reduced_list[0] + if len(reduced_list) > 1: + real_reduced = reduced_list[1] + + # KL divergence (uses joint UMAP when need_joint_umap so synth and real share a space) + if ( + "kl_divergence" in comparison_metrics + and real_embeddings is not None + and len(real_embeddings) > 0 + ): + try: + kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) + kl_synth = synth_reduced if real_reduced is not None else synth_embeddings + kl_real = real_reduced if real_reduced is not None else real_embeddings + kl_score = compute_kl_divergence(kl_synth, kl_real, k=kl_k) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + 
logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing KL divergence: %s", e) + # Compute internal diversity metrics (only need synthetic data) if "mdm" in internal_diversity_metrics: try: @@ -529,22 +559,12 @@ def main(cfg: DictConfig) -> None: logger.info("MDM score (%d clusters, %s metric): %.4f", mdm_n_clusters, mdm_metric, mdm_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing MDM: %s", e) - + if "entropy" in internal_diversity_metrics: try: entropy_k = getattr(cfg.quality_eval_cfg, "entropy_k", 4) - umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) - umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) - umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) - umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") - entropy_score = compute_differential_entropy( - synth_embeddings, - k=entropy_k, - umap_n_components=umap_n_components, - umap_n_neighbors=umap_n_neighbors, - umap_min_dist=umap_min_dist, - umap_metric=umap_metric, - ) + entropy_emb = synth_reduced if synth_reduced is not None else synth_embeddings + entropy_score = compute_differential_entropy(entropy_emb, k=entropy_k) umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" logger.info("Differential entropy score (k=%d)%s: %.4f", entropy_k, umap_info, entropy_score) except Exception as e: # noqa: BLE001 diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 3c562f3..02d2cef 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -16,4 +16,5 @@ compute_mdm, compute_mmd, compute_pad, + fit_umap_shared, ) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index dcebafb..e31456a 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -471,39 +471,40 @@ def compute_mdm( # ---- 
Information-Theoretic Metrics (Entropy, KL-Divergence) # =========================== -def _apply_umap_reduction( - embeddings: np.ndarray, - n_components: Optional[int] = None, +def fit_umap_shared( + embeddings_list: List[np.ndarray], + n_components: int, n_neighbors: int = 15, min_dist: float = 0.1, metric: str = "cosine", -) -> np.ndarray: +) -> List[np.ndarray]: """ - Optionally apply UMAP dimensionality reduction to embeddings. - + Fit UMAP on the concatenation of all embedding arrays, then split back (InfoSynth-style). + + This ensures entropy and KL divergence are comparable across datasets by using + a single shared low-dimensional space. + Args: - embeddings: Embedding matrix of shape (n_samples, n_features) - n_components: Target dimension. If None, returns original embeddings. - n_neighbors: Number of neighbors for UMAP (default: 15) - min_dist: Minimum distance for UMAP (default: 0.1) - metric: Distance metric for UMAP (default: "cosine") - + embeddings_list: List of embedding matrices, each shape (n_i, n_features). + n_components: UMAP target dimension. + n_neighbors: Number of neighbors for UMAP (default: 15). + min_dist: Minimum distance for UMAP (default: 0.1). + metric: Distance metric for UMAP (default: "cosine"). + Returns: - Reduced embeddings if n_components is provided, otherwise original embeddings + List of reduced embedding arrays in the same order as embeddings_list. """ - if n_components is None: - return embeddings - if not UMAP_AVAILABLE: raise ImportError( - "UMAP is required for dimensionality reduction. " - "Install it with: pip install umap-learn" + "UMAP is required. 
Install it with: pip install umap-learn" ) - - if embeddings.shape[1] <= n_components: - # Already at or below target dimension - return embeddings - + if not embeddings_list: + return [] + counts = [emb.shape[0] for emb in embeddings_list] + split_indices = np.cumsum(counts)[:-1] + combined = np.vstack(embeddings_list) + if combined.shape[1] <= n_components: + return [emb.copy() for emb in embeddings_list] with warnings.catch_warnings(): warnings.simplefilter("ignore") umap_model = UMAP( @@ -513,61 +514,38 @@ def _apply_umap_reduction( metric=metric, random_state=42, ) - reduced_embeddings = umap_model.fit_transform(embeddings) - - # Renormalize (like InfoSynth does) - norms = np.linalg.norm(reduced_embeddings, axis=1, keepdims=True) + reduced = umap_model.fit_transform(combined) + norms = np.linalg.norm(reduced, axis=1, keepdims=True) eps = 1e-12 - reduced_embeddings = reduced_embeddings / (norms + eps) - - return reduced_embeddings + reduced = reduced / (norms + eps) + return np.split(reduced, split_indices, axis=0) # Source paper: InfoSyth - https://arxiv.org/abs/2601.00575 -def compute_differential_entropy( - embeddings: np.ndarray, - k: int = 4, - umap_n_components: Optional[int] = None, - umap_n_neighbors: int = 15, - umap_min_dist: float = 0.1, - umap_metric: str = "cosine", -) -> float: +def compute_differential_entropy(embeddings: np.ndarray, k: int = 4) -> float: """ Compute the differential entropy of a set of embeddings using k-nearest neighbors. - + Differential entropy measures the diversity/uncertainty in the embedding distribution. - Higher values indicate more diverse data. - + Higher values indicate more diverse data. For a shared space across datasets, apply + UMAP (e.g. fit_umap_shared) to embeddings before calling this function. 
+ This implementation uses the k-NN estimator for differential entropy: H(X) ≈ digamma(N) - digamma(k) + log(volume) + d * mean(log(eps)) - + where: - N is the number of samples - d is the embedding dimension - k is the number of neighbors - eps is the distance to the k-th nearest neighbor - + Args: embeddings: Embedding matrix of shape (n_samples, n_features) k: Number of nearest neighbors to use (default: 4) - umap_n_components: Optional UMAP target dimension. If None, uses original embeddings. - umap_n_neighbors: Number of neighbors for UMAP (default: 15) - umap_min_dist: Minimum distance for UMAP (default: 0.1) - umap_metric: Distance metric for UMAP (default: "cosine") - + Returns: float: Differential entropy value (higher is more diverse) """ - # Apply UMAP reduction if requested - if umap_n_components is not None: - embeddings = _apply_umap_reduction( - embeddings, - n_components=umap_n_components, - n_neighbors=umap_n_neighbors, - min_dist=umap_min_dist, - metric=umap_metric, - ) - N, d = embeddings.shape if N < k + 1: raise ValueError( @@ -590,55 +568,33 @@ def compute_kl_divergence( q_embeddings: np.ndarray, k: int = 4, eps: float = 1e-10, - umap_n_components: Optional[int] = None, - umap_n_neighbors: int = 15, - umap_min_dist: float = 0.1, - umap_metric: str = "cosine", ) -> float: """ Compute the KL divergence between two sets of embeddings using k-nearest neighbors. - + KL divergence measures how different distribution P is from distribution Q. - Higher values indicate more novelty (P is more different from Q). - + Higher values indicate more novelty (P is more different from Q). For a shared + space, apply UMAP (e.g. fit_umap_shared) to [P, Q] before calling this function. 
+ This implementation uses the k-NN estimator for KL divergence: KL(P||Q) ≈ (d/n) * sum(log(nu/rho)) + log(m/(n-1)) - + where: - P is the distribution of p_embeddings (n samples) - Q is the distribution of q_embeddings (m samples) - d is the embedding dimension - rho is the distance to the k-th nearest neighbor in P - nu is the distance to the k-th nearest neighbor in Q - + Args: p_embeddings: Embeddings of distribution P, shape (n_samples_p, n_features) q_embeddings: Embeddings of distribution Q, shape (n_samples_q, n_features) k: Number of nearest neighbors to use (default: 4) eps: Small epsilon to avoid division by zero (default: 1e-10) - umap_n_components: Optional UMAP target dimension. If None, uses original embeddings. - umap_n_neighbors: Number of neighbors for UMAP (default: 15) - umap_min_dist: Minimum distance for UMAP (default: 0.1) - umap_metric: Distance metric for UMAP (default: "cosine") - + Returns: float: KL divergence value (higher is more novel/different) """ - # Apply UMAP reduction if requested (apply to both embeddings together for consistency) - if umap_n_components is not None: - # Stack embeddings, apply UMAP, then split back - # This ensures both distributions are reduced in the same space - combined_embeddings = np.vstack([p_embeddings, q_embeddings]) - reduced_combined = _apply_umap_reduction( - combined_embeddings, - n_components=umap_n_components, - n_neighbors=umap_n_neighbors, - min_dist=umap_min_dist, - metric=umap_metric, - ) - p_embeddings = reduced_combined[:len(p_embeddings)] - q_embeddings = reduced_combined[len(p_embeddings):] - n, d = p_embeddings.shape m, _ = q_embeddings.shape From 5a2a4a42e551325abe21419baa866aae9819f475 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 30 Jan 2026 11:45:47 -0500 Subject: [PATCH 08/14] Removed default values --- src/cfg/run_quality_evaluation_cfg.yaml | 31 +-- src/run_quality_evaluation.py | 241 +++++++++++++++--------- src/utils/quality_evaluation_utils.py | 230 +++++++++++----------- 3 
files changed, 288 insertions(+), 214 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 144ca74..f92b0a9 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -6,51 +6,56 @@ quality_eval_cfg: scores_subdir: "scores" prior_datasets: - "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample/math-500" - + capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" - + synthetic_dataloader_config: null # Optional: custom dataloader for capabilities_dir (e.g. jsonl, csv, huggingface) + real_data_dir: null - + real_dataloader_config: type: "huggingface" dataset_name: "HuggingFaceH4/MATH-500" split: "test" subset: null text_field: "problem" - + # embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers embedding_backend: "openai" embedding_model: "text-embedding-3-large" # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) embedding_dimensions: 3072 - + # Internal diversity metrics (only need synthetic data) internal_diversity_metrics: - "mdm" # Mean Distance to Medoid - measures internal coherence - "entropy" # Differential Entropy - measures diversity/uncertainty - + # Comparison metrics (need both synthetic and real data) comparison_metrics: - "pad" # Proxy-A-Distance - measures distribution similarity - "mmd" # Maximum Mean Discrepancy - measures distribution distance - "kl_divergence" # KL Divergence - measures novelty (how different from real) - + pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" - + mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" mmd_degree: 3 - + mdm_n_clusters: 5 mdm_metric: "euclidean" - + entropy_k: 4 # Number of nearest neighbors for differential entropy computation - + kl_k: 4 # Number of nearest neighbors for KL divergence computation 
+ # Optional UMAP dimensionality reduction (like InfoSynth) + umap_n_components: 10 # Set to null to disable and use original embeddings + umap_n_neighbors: 15 # Number of neighbors for UMAP + umap_min_dist: 0.1 # Minimum distance for UMAP + umap_metric: "cosine" # Distance metric for UMAP + exp_cfg: exp_id: "quality_evaluation" defaults: - _self_ - - diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 3be6a25..82e8228 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -1,13 +1,12 @@ -"""Script to compute quality metrics (e.g., benchmark difficulty) from existing scores.""" +"""Compute quality metrics (e.g., benchmark difficulty) from existing scores.""" import json import logging import os -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Mapping, Optional, cast import hydra import numpy as np -import torch from omegaconf import DictConfig from src.generate_embeddings import EmbeddingGenerator, EmbeddingModelName @@ -21,16 +20,16 @@ compute_mdm, compute_mmd, compute_pad, + constants, fit_umap_shared, ) -from src.utils import constants from src.utils.data_utils import get_run_id from src.utils.diversity_metrics_dataloaders import ( CapabilityDataloader, - HuggingFaceDatasetDataloader, - JSONLDataloader, CSVDataloader, DatasetDataloader, + HuggingFaceDatasetDataloader, + JSONLDataloader, load_texts_from_dataloader, ) @@ -45,7 +44,8 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: Args: directory: Directory to walk recursively for JSON files. - Returns: + Returns + ------- List of accuracy values found in the directory. """ accuracies: List[float] = [] @@ -67,7 +67,8 @@ def _load_avg_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: Args: base_dir: Directory containing per-model subdirectories with JSON files. - Returns: + Returns + ------- Dictionary mapping model name to average accuracy. 
""" model_to_accuracy: Dict[str, float] = {} @@ -99,7 +100,8 @@ def _create_dataloader_from_config( data_path: Path to the data dataloader_config: Configuration dict with 'type' and other fields - Returns: + Returns + ------- DatasetDataloader instance """ dataloader_type = dataloader_config.get("type", "capability") @@ -107,11 +109,12 @@ def _create_dataloader_from_config( if dataloader_type == "capability": return CapabilityDataloader(data_path) - elif dataloader_type == "huggingface": + if dataloader_type == "huggingface": from datasets import load_dataset + dataset_name = dataloader_config.get("dataset_name") split = dataloader_config.get("split", "train") - subset = dataloader_config.get("subset", None) + subset = dataloader_config.get("subset") dataset = load_dataset(dataset_name, name=subset, split=split) return HuggingFaceDatasetDataloader( @@ -119,7 +122,7 @@ def _create_dataloader_from_config( text_field=dataloader_config.get("text_field", "problem"), ) - elif dataloader_type == "jsonl": + if dataloader_type == "jsonl": return JSONLDataloader( jsonl_path=data_path, name_field=dataloader_config.get("name_field", "name"), @@ -129,7 +132,7 @@ def _create_dataloader_from_config( task_field=dataloader_config.get("task_field", "problem"), ) - elif dataloader_type == "csv": + if dataloader_type == "csv": return CSVDataloader( csv_path=data_path, name_field=dataloader_config.get("name_field", "name"), @@ -139,8 +142,7 @@ def _create_dataloader_from_config( task_field=dataloader_config.get("task_field", "problem"), ) - else: - raise ValueError(f"Unknown dataloader type: {dataloader_type}") + raise ValueError(f"Unknown dataloader type: {dataloader_type}") def _load_capabilities_and_generate_embeddings( @@ -157,23 +159,29 @@ def _load_capabilities_and_generate_embeddings( Always uses the dataloader system for consistency. 
Args: - capabilities_dir: Directory containing capability subdirectories OR path to data file + capabilities_dir: Dir with capability subdirs or path to data file embedding_model_name: Name of embedding model to use embed_dimensions: Number of embedding dimensions dataloader_config: Optional configuration for custom dataloader. If None, defaults to capability format. - Returns: + Returns + ------- Tuple of (embeddings array, list of extracted texts) """ # Use dataloader system: default to capability format if no config provided if dataloader_config: - logger.info("Using custom dataloader: %s", dataloader_config.get("type", "unknown")) + logger.info( + "Using custom dataloader: %s", dataloader_config.get("type", "unknown") + ) dataloader = _create_dataloader_from_config(capabilities_dir, dataloader_config) else: # Default: use capability format dataloader if not os.path.isdir(capabilities_dir): - logger.error("capabilities_dir must be a directory when using default capability format: %s", capabilities_dir) + logger.error( + "capabilities_dir must be a directory when using default capability format: %s", + capabilities_dir, + ) return np.array([]), [] logger.info("Using capability format dataloader for %s", capabilities_dir) dataloader = CapabilityDataloader(capabilities_dir) @@ -204,7 +212,7 @@ def _load_capabilities_and_generate_embeddings( elif embedding_backend.lower() == "huggingface": # Use HuggingFace encoder models such as gte-Qwen try: - from sentence_transformers import SentenceTransformer # type: ignore[import] + from sentence_transformers import SentenceTransformer except Exception as exc: # noqa: BLE001 logger.error( "Failed to import sentence_transformers for HuggingFace embeddings: %s", @@ -259,14 +267,14 @@ def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: return None -@hydra.main(version_base=None, config_path="cfg", config_name="run_quality_evaluation_cfg") +@hydra.main( + version_base=None, config_path="cfg", 
config_name="run_quality_evaluation_cfg" +) def main(cfg: DictConfig) -> None: - """ - Compute benchmark-level quality metrics from saved capability scores. - """ + """Compute benchmark-level quality metrics from saved capability scores.""" run_id = get_run_id(cfg) - scores_root_dir = getattr(cfg.quality_eval_cfg, "scores_root_dir", None) + scores_root_dir = cfg.quality_eval_cfg.scores_root_dir if scores_root_dir: base_scores_dir = scores_root_dir else: @@ -294,7 +302,7 @@ def main(cfg: DictConfig) -> None: model_to_generation_accuracies: Dict[str, List[float]] = {} # Get prior dataset names to exclude them from current dataset - prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) + prior_datasets = cfg.quality_eval_cfg.prior_datasets prior_dataset_names = set() for prior_path in prior_datasets: # Extract the directory name from the path @@ -312,7 +320,8 @@ def main(cfg: DictConfig) -> None: # Check if model_dir contains subdirectories (generations/runs) subdirs = [ - d for d in os.listdir(model_dir) + d + for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) ] @@ -329,7 +338,10 @@ def main(cfg: DictConfig) -> None: generation_accuracies.append(avg_gen_acc) logger.debug( "Model '%s' generation '%s': %.4f (from %d JSON files)", - model_name, gen_dir_name, avg_gen_acc, len(gen_accuracies) + model_name, + gen_dir_name, + avg_gen_acc, + len(gen_accuracies), ) if generation_accuracies: @@ -345,12 +357,13 @@ def main(cfg: DictConfig) -> None: ) # Continue to next model if we processed subdirs continue - else: - # Structure: model_dir/...json files (no generation subdirectories) - accuracies = _collect_accuracies_from_dir(model_dir) + # Structure: model_dir/...json files (no generation subdirectories) + accuracies = _collect_accuracies_from_dir(model_dir) if not accuracies: - logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) + logger.warning( + "No accuracies found for model '%s' in %s", model_name, 
model_dir + ) continue avg_acc = sum(accuracies) / len(accuracies) @@ -380,7 +393,7 @@ def main(cfg: DictConfig) -> None: logger.warning("Could not compute consistency: %s", e) # Compute novelty if prior datasets are provided - prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) + prior_datasets = cfg.quality_eval_cfg.prior_datasets if prior_datasets: try: logger.info("Loading prior datasets for novelty computation...") @@ -391,36 +404,46 @@ def main(cfg: DictConfig) -> None: prior_datasets_accuracies.append(prior_acc) logger.info( "Loaded prior dataset from %s: %d models", - prior_dir, len(prior_acc) + prior_dir, + len(prior_acc), ) else: - logger.warning("No accuracies found in prior dataset: %s", prior_dir) + logger.warning( + "No accuracies found in prior dataset: %s", prior_dir + ) if prior_datasets_accuracies: - novelty = compute_benchmark_novelty(model_to_accuracy, prior_datasets_accuracies) + novelty = compute_benchmark_novelty( + model_to_accuracy, + cast(List[Mapping[str, float]], prior_datasets_accuracies), + ) logger.info("Benchmark novelty: %.4f", novelty) else: - logger.warning("No valid prior datasets found, skipping novelty computation.") + logger.warning( + "No valid prior datasets found, skipping novelty computation." 
+ ) except ValueError as e: logger.warning("Could not compute novelty: %s", e) except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) # Compute embedding-based metrics if capabilities directory is provided - capabilities_dir = getattr(cfg.quality_eval_cfg, "capabilities_dir", None) + capabilities_dir = cfg.quality_eval_cfg.capabilities_dir if capabilities_dir: - internal_diversity_metrics = getattr(cfg.quality_eval_cfg, "internal_diversity_metrics", ["mdm", "entropy"]) - comparison_metrics = getattr(cfg.quality_eval_cfg, "comparison_metrics", ["pad", "mmd", "kl_divergence"]) - embedding_model = getattr(cfg.quality_eval_cfg, "embedding_model", "text-embedding-3-large") - embedding_backend = getattr(cfg.quality_eval_cfg, "embedding_backend", "openai") - embed_dimensions = getattr(cfg.quality_eval_cfg, "embedding_dimensions", 3072) + internal_diversity_metrics = cfg.quality_eval_cfg.internal_diversity_metrics + comparison_metrics = cfg.quality_eval_cfg.comparison_metrics + embedding_model = cfg.quality_eval_cfg.embedding_model + embedding_backend = cfg.quality_eval_cfg.embedding_backend + embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions # Get dataloader config if provided - synth_dataloader_config = getattr(cfg.quality_eval_cfg, "synthetic_dataloader_config", None) + synth_dataloader_config = cfg.quality_eval_cfg.synthetic_dataloader_config if synth_dataloader_config: synth_dataloader_config = dict(synth_dataloader_config) - logger.info("Computing embedding-based metrics for capabilities in %s", capabilities_dir) + logger.info( + "Computing embedding-based metrics for capabilities in %s", capabilities_dir + ) # Load capabilities and generate embeddings synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( @@ -436,16 +459,18 @@ def main(cfg: DictConfig) -> None: else: real_embeddings = None # Check if real data directory/file is provided for comparison - real_data_dir = getattr(cfg.quality_eval_cfg, 
"real_data_dir", None) - real_dataloader_config = getattr(cfg.quality_eval_cfg, "real_dataloader_config", None) + real_data_dir = cfg.quality_eval_cfg.real_data_dir + real_dataloader_config = cfg.quality_eval_cfg.real_dataloader_config - # Check if we have real data: either a valid path OR a dataloader config (for HuggingFace, etc.) + # Real data: valid path or dataloader config (e.g. HuggingFace) has_real_data = False # Case 1: local path (capability/JSONL/CSV formats) - if real_data_dir and (os.path.isdir(real_data_dir) or os.path.isfile(real_data_dir)): - has_real_data = True - # Case 2: HuggingFace dataset via dataloader (real_data_dir may be None) - elif real_dataloader_config and real_dataloader_config.get("type") == "huggingface": + if ( + real_data_dir + and (os.path.isdir(real_data_dir) or os.path.isfile(real_data_dir)) + or real_dataloader_config + and real_dataloader_config.get("type") == "huggingface" + ): has_real_data = True if has_real_data: @@ -456,9 +481,11 @@ def main(cfg: DictConfig) -> None: if real_data_dir: logger.info("Loading real data embeddings from %s", real_data_dir) else: - logger.info("Loading real data embeddings using dataloader config (no local path)") + logger.info( + "Loading real data embeddings using dataloader config (no local path)" + ) real_embeddings, _ = _load_capabilities_and_generate_embeddings( - # For HuggingFace, the capabilities_dir is unused; fallback to empty string + # HuggingFace: capabilities_dir unused, pass empty string capabilities_dir=real_data_dir or "", embedding_model_name=embedding_model, embed_dimensions=embed_dimensions, @@ -467,13 +494,13 @@ def main(cfg: DictConfig) -> None: ) if len(real_embeddings) > 0: - # Compute comparison metrics that require both synthetic and real data + # Comparison metrics (need both synth and real) if "pad" in comparison_metrics: try: pad_score = compute_pad( synth_embeddings, real_embeddings, - classifier_name=getattr(cfg.quality_eval_cfg, "pad_classifier", 
"LogisticRegression"), + classifier_name=cfg.quality_eval_cfg.pad_classifier, ) logger.info("PAD score: %.4f", pad_score) except Exception as e: # noqa: BLE001 @@ -481,36 +508,39 @@ def main(cfg: DictConfig) -> None: if "mmd" in comparison_metrics: try: - mmd_kernel = getattr(cfg.quality_eval_cfg, "mmd_kernel", "polynomial") - mmd_degree = getattr(cfg.quality_eval_cfg, "mmd_degree", 3) + mmd_kernel = cfg.quality_eval_cfg.mmd_kernel + mmd_degree = cfg.quality_eval_cfg.mmd_degree mmd_score = compute_mmd( synth_embeddings, real_embeddings, kernel=mmd_kernel, degree=mmd_degree, ) - logger.info("MMD score (%s kernel): %.4f", mmd_kernel, mmd_score) + logger.info( + "MMD score (%s kernel): %.4f", mmd_kernel, mmd_score + ) except Exception as e: # noqa: BLE001 logger.warning("Error computing MMD: %s", e) else: - logger.warning("No real data embeddings generated, skipping comparison metrics") - else: - logger.info("No real_data_dir provided, skipping comparison metrics (require real data)") - - # Joint UMAP (InfoSynth-style): fit on all datasets so entropy/KL are in a shared space - umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) - umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) - umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) - umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") - need_joint_umap = ( - umap_n_components is not None - and ( - "entropy" in internal_diversity_metrics - or ( - "kl_divergence" in comparison_metrics - and real_embeddings is not None - and len(real_embeddings) > 0 + logger.warning( + "No real data embeddings generated, skipping comparison metrics" ) + else: + logger.info( + "No real_data_dir provided, skipping comparison metrics (require real data)" + ) + + # Joint UMAP + umap_n_components = cfg.quality_eval_cfg.umap_n_components + umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors + umap_min_dist = cfg.quality_eval_cfg.umap_min_dist + umap_metric = 
cfg.quality_eval_cfg.umap_metric + need_joint_umap = umap_n_components is not None and ( + "entropy" in internal_diversity_metrics + or ( + "kl_divergence" in comparison_metrics + and real_embeddings is not None + and len(real_embeddings) > 0 ) ) synth_reduced = None @@ -530,48 +560,73 @@ def main(cfg: DictConfig) -> None: if len(reduced_list) > 1: real_reduced = reduced_list[1] - # KL divergence (uses joint UMAP when need_joint_umap so synth and real share a space) + # KL divergence (joint UMAP so synth and real share a space) if ( "kl_divergence" in comparison_metrics and real_embeddings is not None and len(real_embeddings) > 0 ): try: - kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) - kl_synth = synth_reduced if real_reduced is not None else synth_embeddings - kl_real = real_reduced if real_reduced is not None else real_embeddings - kl_score = compute_kl_divergence(kl_synth, kl_real, k=kl_k) - umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) + kl_k = cfg.quality_eval_cfg.kl_k + kl_synth = ( + synth_reduced if real_reduced is not None else synth_embeddings + ) + kl_real = ( + real_reduced if real_reduced is not None else real_embeddings + ) + if kl_synth is not None and kl_real is not None: + kl_score = compute_kl_divergence(kl_synth, kl_real, k=kl_k) + else: + kl_score = 0.0 + umap_info = ( + f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + ) + logger.info( + "KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score + ) except Exception as e: # noqa: BLE001 logger.warning("Error computing KL divergence: %s", e) # Compute internal diversity metrics (only need synthetic data) if "mdm" in internal_diversity_metrics: try: - mdm_n_clusters = getattr(cfg.quality_eval_cfg, "mdm_n_clusters", 5) - mdm_metric = getattr(cfg.quality_eval_cfg, "mdm_metric", "euclidean") + mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters + mdm_metric = 
cfg.quality_eval_cfg.mdm_metric mdm_score = compute_mdm( synth_embeddings, n_clusters=mdm_n_clusters, metric=mdm_metric, ) - logger.info("MDM score (%d clusters, %s metric): %.4f", mdm_n_clusters, mdm_metric, mdm_score) + logger.info( + "MDM score (%d clusters, %s metric): %.4f", + mdm_n_clusters, + mdm_metric, + mdm_score, + ) except Exception as e: # noqa: BLE001 logger.warning("Error computing MDM: %s", e) if "entropy" in internal_diversity_metrics: try: - entropy_k = getattr(cfg.quality_eval_cfg, "entropy_k", 4) - entropy_emb = synth_reduced if synth_reduced is not None else synth_embeddings - entropy_score = compute_differential_entropy(entropy_emb, k=entropy_k) - umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - logger.info("Differential entropy score (k=%d)%s: %.4f", entropy_k, umap_info, entropy_score) + entropy_k = cfg.quality_eval_cfg.entropy_k + entropy_emb = ( + synth_reduced if synth_reduced is not None else synth_embeddings + ) + entropy_score = compute_differential_entropy( + entropy_emb, k=entropy_k + ) + umap_info = ( + f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + ) + logger.info( + "Differential entropy score (k=%d)%s: %.4f", + entropy_k, + umap_info, + entropy_score, + ) except Exception as e: # noqa: BLE001 logger.warning("Error computing differential entropy: %s", e) if __name__ == "__main__": main() - - diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index e31456a..f5a9154 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -4,29 +4,31 @@ import statistics import warnings -from typing import Iterable, List, Mapping, Optional, Union +from typing import Iterable, List, Mapping, Union +import kmedoids import numpy as np -from scipy.stats import spearmanr from scipy.special import digamma, gammaln -from sklearn.linear_model import LogisticRegression +from scipy.stats import spearmanr from sklearn.ensemble import 
RandomForestClassifier -from sklearn.neural_network import MLPClassifier -from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import pairwise_distances from sklearn.metrics.pairwise import ( - polynomial_kernel, - rbf_kernel, laplacian_kernel, linear_kernel, + polynomial_kernel, + rbf_kernel, sigmoid_kernel, ) +from sklearn.model_selection import train_test_split from sklearn.neighbors import NearestNeighbors -import kmedoids -from sklearn.metrics import pairwise_distances +from sklearn.neural_network import MLPClassifier + # Optional UMAP import try: from umap import UMAP + UMAP_AVAILABLE = True except ImportError: UMAP_AVAILABLE = False @@ -50,16 +52,18 @@ def compute_benchmark_difficulty( accuracies: Either an iterable of accuracy values in [0.0, 1.0] for each model, or a mapping from model name to accuracy in [0.0, 1.0]. - Returns: + Returns + ------- A float in [0.0, 1.0] representing the benchmark difficulty. - Raises: + Raises + ------ ValueError: If no accuracies are provided. """ # Handle Mapping by extracting values, otherwise treat as iterable if isinstance(accuracies, Mapping): accuracies = accuracies.values() - + accuracies = list(accuracies) if not accuracies: raise ValueError("Cannot compute difficulty: no accuracies provided.") @@ -88,16 +92,18 @@ def compute_benchmark_separability( accuracies: Either an iterable of accuracy values in [0.0, 1.0] for each model, or a mapping from model name to accuracy in [0.0, 1.0]. - Returns: + Returns + ------- A non-negative float representing separability. - Raises: + Raises + ------ ValueError: If no accuracies are provided. 
""" # Handle Mapping by extracting values, otherwise treat as iterable if isinstance(accuracies, Mapping): accuracies = accuracies.values() - + accuracies = list(accuracies) if not accuracies: raise ValueError("Cannot compute separability: no accuracies provided.") @@ -112,12 +118,14 @@ def compute_benchmark_consistency( model_to_generation_accuracies: Mapping[str, Iterable[float]], ) -> float: """ - Compute benchmark consistency given per-model accuracies across multiple dataset generations. + Compute benchmark consistency from per-model accuracies across generations. - Consistency measures how stable model performance is across different dataset generations. + Consistency measures how stable model performance is across + different dataset generations. The consistency of a benchmark is defined as: - CONSISTENCY(D_gen, M) = 1 - 1/n * Σ_{i=1}^n std({performance(m_i) | D_gen,j}_{j=1}^k) + CONSISTENCY(D_gen, M) = 1 - 1/n * Σ_i std(performance(m_i)) + over dataset generations j=1..k where: - n is the number of models @@ -133,11 +141,13 @@ def compute_benchmark_consistency( on a different dataset generation. Each model should have the same number of generations (k). - Returns: + Returns + ------- A float in [0.0, 1.0] representing the benchmark consistency. Higher values indicate more consistent performance across generations. - Raises: + Raises + ------ ValueError: If no models are provided, or if models have inconsistent numbers of generations, or if any model has fewer than 2 generations (std requires at least 2 values). 
@@ -186,10 +196,8 @@ def compute_benchmark_consistency( # Average the standard deviations across all models mean_std = sum(model_stds) / len(model_stds) - # Consistency = 1 - mean_std # Clamp to [0, 1] in case of numerical issues - consistency = max(0.0, min(1.0, 1.0 - mean_std)) - return consistency + return max(0.0, min(1.0, 1.0 - mean_std)) # Source paper: AutoBencher - https://arxiv.org/abs/2407.08351 @@ -198,9 +206,10 @@ def compute_benchmark_novelty( prior_datasets_accuracies: List[Mapping[str, float]], ) -> float: """ - Compute benchmark novelty by comparing current dataset performance to prior datasets. + Compute benchmark novelty by comparing current to prior dataset performance. - Novelty measures how much new information a dataset reveals about existing models + Novelty measures how much new information a dataset reveals + about existing models over existing benchmarks. The formula is: NOVELTY(D_c, D_prev, M) = 1 - RANKCORR(v̂_c, v_c) @@ -224,11 +233,13 @@ def compute_benchmark_novelty( All mappings must contain the same set of models, and these models must match the models in current_accuracies. - Returns: + Returns + ------- A float in [0.0, 1.0] representing the benchmark novelty. Higher values indicate more novel/unique performance patterns. - Raises: + Raises + ------ ValueError: If no prior datasets provided, models don't match, or regression fails (e.g., singular matrix). 
@@ -262,24 +273,24 @@ def compute_benchmark_novelty( num_models = len(current_models) num_prior = len(prior_datasets_accuracies) - # V_prev: each column is a prior dataset's accuracies - V_prev = np.zeros((num_models, num_prior)) + # v_prev: each column is a prior dataset's accuracies + v_prev = np.zeros((num_models, num_prior)) for i, prior_acc in enumerate(prior_datasets_accuracies): for j, model in enumerate(current_models): - V_prev[j, i] = prior_acc[model] + v_prev[j, i] = prior_acc[model] # v_c: current dataset's accuracies v_c = np.array([current_accuracies[model] for model in current_models]) - # Perform linear regression: v_c = V_prev * θ + b - # We solve: min ||V_prev * θ + b - v_c||² - # To use np.linalg.lstsq, we reformulate as: [V_prev, 1] * [θ; b] = v_c + # Perform linear regression: v_c = v_prev * θ + b + # We solve: min ||v_prev * θ + b - v_c||² + # To use np.linalg.lstsq, we reformulate as: [v_prev, 1] * [θ; b] = v_c # where 1 is a column vector of ones (for the intercept b) - + # Augment design matrix with column of ones for intercept ones = np.ones((num_models, 1)) - X = np.hstack([V_prev, ones]) - + X = np.hstack([v_prev, ones]) + try: # Solve using least squares: X * params = v_c # params = [θ; b] @@ -294,8 +305,8 @@ def compute_benchmark_novelty( theta = params[:-1] # First N elements b = params[-1] # Last element (intercept) - # Compute predicted values: v̂_c = V_prev * θ + b - v_pred = V_prev @ theta + b + # Compute predicted values: v̂_c = v_prev * θ + b + v_pred = v_prev @ theta + b # Compute rank correlation (Spearman correlation) using scipy try: @@ -308,16 +319,15 @@ def compute_benchmark_novelty( if np.isnan(rank_corr) or not np.isfinite(rank_corr): return 1.0 - # Novelty = 1 - rank_correlation # Clamp to [0, 1] in case of numerical issues (e.g., negative correlation) - novelty = max(0.0, min(1.0, 1.0 - rank_corr)) - return novelty + return float(max(0.0, min(1.0, 1.0 - rank_corr))) # =========================== # ---- Diversity 
Metrics (PAD, MMD, MDM) # =========================== + # Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_pad( x_syn_emb: np.ndarray, @@ -326,29 +336,30 @@ def compute_pad( ) -> float: """ Compute the Proxy-A-Distance (PAD) between two sets of embeddings. - + PAD measures the distance between synthetic and real data distributions by training a classifier to distinguish between them. Lower values indicate more similar distributions. - + Args: x_syn_emb: Embeddings of synthetic data, shape (n_samples, n_features) x_real_emb: Embeddings of real data, shape (n_samples, n_features) classifier_name: Classifier to use ("LogisticRegression", "RandomForest", "MLP") - - Returns: + + Returns + ------- float: PAD value (typically in range [0, 2], lower is better) """ y_syn_train = np.zeros(len(x_syn_emb)) y_real_train = np.ones(len(x_real_emb)) x_train = np.concatenate([x_syn_emb, x_real_emb], axis=0) y_train = np.concatenate([y_syn_train, y_real_train], axis=0) - + # Split into train/validation x_train, x_val, y_train, y_val = train_test_split( x_train, y_train, test_size=0.2, random_state=42 ) - + # Classifier if classifier_name == "LogisticRegression": classifier = LogisticRegression(random_state=42, max_iter=1000) @@ -357,74 +368,75 @@ def compute_pad( elif classifier_name == "MLP": classifier = MLPClassifier( hidden_layer_sizes=(128, 64), - activation='relu', + activation="relu", max_iter=200, - random_state=42 + random_state=42, ) else: raise ValueError(f"Unknown classifier: {classifier_name}") - + classifier.fit(x_train, y_train) y_pred_proba = classifier.predict_proba(x_val)[:, 1] average_loss = np.mean(np.abs(y_pred_proba - y_val)) - return 2 * (1 - 2 * average_loss) + return float(2 * (1 - 2 * average_loss)) # Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_mmd( - X: np.ndarray, - Y: np.ndarray, + x: np.ndarray, + y: np.ndarray, kernel: str = "polynomial", degree: int = 3, gamma: float | None = None, coef0: float = 1, ) -> 
float: """ - Compute the Maximum Mean Discrepancy (MMD) between two samples: X and Y. - + Compute the Maximum Mean Discrepancy (MMD) between two samples: x and y. + MMD measures the distance between two distributions in a reproducing kernel Hilbert space. Lower values indicate more similar distributions. - + Args: - X: First sample, shape (n_samples_X, n_features) - Y: Second sample, shape (n_samples_Y, n_features) + x: First sample, shape (n_samples_x, n_features) + y: Second sample, shape (n_samples_y, n_features) kernel: Kernel name ("polynomial", "rbf", "laplacian", "linear", "sigmoid") degree: Degree for polynomial kernel (default: 3) gamma: Gamma parameter for kernels (default: None, auto) coef0: Coef0 for polynomial/sigmoid kernel - - Returns: + + Returns + ------- float: MMD value (non-negative, lower is better) """ kernel = kernel.lower() if isinstance(kernel, str) else kernel if kernel == "polynomial": kfunc = polynomial_kernel - XX = kfunc(X, X, degree=degree, gamma=gamma, coef0=coef0) - YY = kfunc(Y, Y, degree=degree, gamma=gamma, coef0=coef0) - XY = kfunc(X, Y, degree=degree, gamma=gamma, coef0=coef0) + xx = kfunc(x, x, degree=degree, gamma=gamma, coef0=coef0) + yy = kfunc(y, y, degree=degree, gamma=gamma, coef0=coef0) + xy = kfunc(x, y, degree=degree, gamma=gamma, coef0=coef0) elif kernel == "rbf": kfunc = rbf_kernel - XX = kfunc(X, X, gamma=gamma) - YY = kfunc(Y, Y, gamma=gamma) - XY = kfunc(X, Y, gamma=gamma) + xx = kfunc(x, x, gamma=gamma) + yy = kfunc(y, y, gamma=gamma) + xy = kfunc(x, y, gamma=gamma) elif kernel == "laplacian": kfunc = laplacian_kernel - XX = kfunc(X, X, gamma=gamma) - YY = kfunc(Y, Y, gamma=gamma) - XY = kfunc(X, Y, gamma=gamma) + xx = kfunc(x, x, gamma=gamma) + yy = kfunc(y, y, gamma=gamma) + xy = kfunc(x, y, gamma=gamma) elif kernel == "linear": kfunc = linear_kernel - XX = kfunc(X, X) - YY = kfunc(Y, Y) - XY = kfunc(X, Y) + xx = kfunc(x, x) + yy = kfunc(y, y) + xy = kfunc(x, y) elif kernel == "sigmoid": kfunc = sigmoid_kernel 
- XX = kfunc(X, X, gamma=gamma, coef0=coef0) - YY = kfunc(Y, Y, gamma=gamma, coef0=coef0) - XY = kfunc(X, Y, gamma=gamma, coef0=coef0) + xx = kfunc(x, x, gamma=gamma, coef0=coef0) + yy = kfunc(y, y, gamma=gamma, coef0=coef0) + xy = kfunc(x, y, gamma=gamma, coef0=coef0) else: raise ValueError(f"Unknown kernel: {kernel}") - return np.mean(XX) + np.mean(YY) - 2 * np.mean(XY) + return float(np.mean(xx) + np.mean(yy) - 2 * np.mean(xy)) # Source paper: SynQue - https://arxiv.org/abs/2511.03928 @@ -434,29 +446,31 @@ def compute_mdm( metric: str = "euclidean", ) -> float: """ - Compute the mean distance of points in each cluster to its medoid, then average across clusters. - - MDM measures the internal diversity/coherence of a set of embeddings by clustering + Compute mean distance to medoid per cluster, then average across clusters. + + MDM measures the internal diversity/coherence of a set of embeddings + by clustering them and computing the average distance to cluster medoids. Lower values indicate more coherent/diverse clusters. - + Args: embeddings: Embedding matrix of shape (n_samples, n_features) n_clusters: Number of clusters/medoids to use metric: Distance metric for KMedoids ('euclidean', 'cosine', etc.) 
- - Returns: + + Returns + ------- float: Mean distance to medoid (averaged over all clusters) """ n_samples = len(embeddings) if n_samples < n_clusters: n_clusters = max(1, n_samples) - + diss = pairwise_distances(embeddings, metric=metric) pam_result = kmedoids.fasterpam(diss, n_clusters, random_state=42) labels = pam_result.labels medoid_indices = pam_result.medoids - + total_dist = 0.0 for i, medoid_idx in enumerate(medoid_indices): cluster_points_idx = np.where(labels == i)[0] @@ -471,6 +485,7 @@ def compute_mdm( # ---- Information-Theoretic Metrics (Entropy, KL-Divergence) # =========================== + def fit_umap_shared( embeddings_list: List[np.ndarray], n_components: int, @@ -479,7 +494,7 @@ def fit_umap_shared( metric: str = "cosine", ) -> List[np.ndarray]: """ - Fit UMAP on the concatenation of all embedding arrays, then split back (InfoSynth-style). + Fit UMAP on the concatenation of all embedding arrays, then split back. This ensures entropy and KL divergence are comparable across datasets by using a single shared low-dimensional space. @@ -491,13 +506,12 @@ def fit_umap_shared( min_dist: Minimum distance for UMAP (default: 0.1). metric: Distance metric for UMAP (default: "cosine"). - Returns: + Returns + ------- List of reduced embedding arrays in the same order as embeddings_list. """ if not UMAP_AVAILABLE: - raise ImportError( - "UMAP is required. Install it with: pip install umap-learn" - ) + raise ImportError("UMAP is required. Install it with: pip install umap-learn") if not embeddings_list: return [] counts = [emb.shape[0] for emb in embeddings_list] @@ -526,9 +540,9 @@ def compute_differential_entropy(embeddings: np.ndarray, k: int = 4) -> float: """ Compute the differential entropy of a set of embeddings using k-nearest neighbors. - Differential entropy measures the diversity/uncertainty in the embedding distribution. - Higher values indicate more diverse data. For a shared space across datasets, apply - UMAP (e.g. 
fit_umap_shared) to embeddings before calling this function. + Differential entropy measures the diversity/uncertainty in the + embedding distribution. + Higher values indicate more diverse data. This implementation uses the k-NN estimator for differential entropy: H(X) ≈ digamma(N) - digamma(k) + log(volume) + d * mean(log(eps)) @@ -543,22 +557,24 @@ def compute_differential_entropy(embeddings: np.ndarray, k: int = 4) -> float: embeddings: Embedding matrix of shape (n_samples, n_features) k: Number of nearest neighbors to use (default: 4) - Returns: + Returns + ------- float: Differential entropy value (higher is more diverse) """ - N, d = embeddings.shape - if N < k + 1: + n_samples, d = embeddings.shape + if k + 1 > n_samples: raise ValueError( - f"Cannot compute entropy: need at least {k + 1} samples, but got {N}." + f"Cannot compute entropy: need at least {k + 1} samples, " + f"but got {n_samples}." ) - + nbrs = NearestNeighbors(n_neighbors=k + 1).fit(embeddings) distances, _ = nbrs.kneighbors(embeddings) eps = distances[:, -1] eps[eps == 0] = np.nextafter(0, 1) - + log_vol = (d / 2) * np.log(np.pi) - gammaln(d / 2 + 1) - entropy = digamma(N) - digamma(k) + log_vol + d * np.mean(np.log(eps)) + entropy = digamma(n_samples) - digamma(k) + log_vol + d * np.mean(np.log(eps)) return float(entropy) @@ -573,8 +589,7 @@ def compute_kl_divergence( Compute the KL divergence between two sets of embeddings using k-nearest neighbors. KL divergence measures how different distribution P is from distribution Q. - Higher values indicate more novelty (P is more different from Q). For a shared - space, apply UMAP (e.g. fit_umap_shared) to [P, Q] before calling this function. + Higher values indicate more novelty (P is more different from Q). 
This implementation uses the k-NN estimator for KL divergence: KL(P||Q) ≈ (d/n) * sum(log(nu/rho)) + log(m/(n-1)) @@ -592,12 +607,13 @@ def compute_kl_divergence( k: Number of nearest neighbors to use (default: 4) eps: Small epsilon to avoid division by zero (default: 1e-10) - Returns: + Returns + ------- float: KL divergence value (higher is more novel/different) """ n, d = p_embeddings.shape m, _ = q_embeddings.shape - + if n < k + 1: raise ValueError( f"Cannot compute KL divergence: P needs at least {k + 1} samples, but got {n}." @@ -606,16 +622,14 @@ def compute_kl_divergence( raise ValueError( f"Cannot compute KL divergence: Q needs at least {k} samples, but got {m}." ) - + # Find k-th nearest neighbor in P for each point in P nbrs_p = NearestNeighbors(n_neighbors=k + 1).fit(p_embeddings) rho = np.maximum(nbrs_p.kneighbors(p_embeddings)[0][:, k], eps) - + # Find k-th nearest neighbor in Q for each point in P nbrs_q = NearestNeighbors(n_neighbors=k).fit(q_embeddings) nu = np.maximum(nbrs_q.kneighbors(p_embeddings)[0][:, k - 1], eps) - + kl_div = (d / n) * np.sum(np.log(nu / rho)) + np.log(m / (n - 1)) return float(kl_div) - - From 579cedf51db4cd383fd95a2ce8fc53406a471b79 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 5 Feb 2026 08:56:09 -0500 Subject: [PATCH 09/14] Fixed real data logic --- src/cfg/run_quality_evaluation_cfg.yaml | 70 +++++- src/run_quality_evaluation.py | 308 ++++++++++++++++-------- src/utils/__init__.py | 2 +- src/utils/quality_evaluation_utils.py | 2 +- 4 files changed, 261 insertions(+), 121 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index f92b0a9..e183c0c 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -2,22 +2,55 @@ prompt_cfg: sys_msg: Compute benchmark quality metrics from existing scores. 
quality_eval_cfg: - scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" - scores_subdir: "scores" - prior_datasets: - - "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample/math-500" + # Synthetic benchmark source (scores + capabilities) + synthetic_source: + # Root directory containing per-model score subdirs for the synthetic benchmark + scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" + # Optional subdirectory name when falling back to BASE_ARTIFACTS_DIR + scores_subdir: "scores" + # Capability directory for the synthetic benchmark + capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" - capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" - synthetic_dataloader_config: null # Optional: custom dataloader for capabilities_dir (e.g. jsonl, csv, huggingface) + # Novelty: "combined" = one score from all real sources (linear regression on all); + # "per_dataset" = one novelty per prior (how novel vs each benchmark separately); + # "both" = report combined and per-dataset. + novelty_mode: "combined" # "combined" | "per_dataset" | "both" - real_data_dir: null + # Source(s) of REAL data used for comparison metrics (PAD, MMD, KL). + # real_data_source can be: + # - a single mapping {path, dataloader, name}, OR + # - a list of such mappings when you have multiple real datasets. + # + # When multiple sources are provided, real_comparison_mode controls whether + # they are pooled together into one real distribution ("pooled") or compared + # pairwise against the synthetic data ("per_dataset") for PAD/MMD. + real_comparison_mode: "pooled" # or "per_dataset" - real_dataloader_config: - type: "huggingface" - dataset_name: "HuggingFaceH4/MATH-500" - split: "test" - subset: null - text_field: "problem" + # Example: multiple real datasets (HuggingFace math benchmarks). 
+ # Novelty uses score dirs from each source: set scores_dir explicitly, or + # we use scores_root_dir/ when name is set. + real_data_source: + - name: "MATH-500" + path: null + # Optional: explicit scores directory for novelty; otherwise uses + # scores_root_dir/name + scores_dir: null + dataloader: + type: "huggingface" + dataset_name: "HuggingFaceH4/MATH-500" + split: "test" + subset: null + text_field: "problem" + + - name: "MATH-Hard" + path: null + scores_dir: null + dataloader: + type: "huggingface" + dataset_name: "lighteval/MATH-Hard" + split: "test" + subset: null + text_field: "problem" # embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers embedding_backend: "openai" @@ -54,6 +87,17 @@ quality_eval_cfg: umap_min_dist: 0.1 # Minimum distance for UMAP umap_metric: "cosine" # Distance metric for UMAP + # Evaluation settings to use if we need to (re-)evaluate prior or real datasets. + # These mirror the subject_llm settings in src/cfg/run_cfg.yaml. 
+ evaluation_cfg: + subject_llm: + name: "o1-mini" + provider: "openai" + generation_cfg: + temperature: 0.7 + max_tokens: 2048 + seed: 42 + exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 82e8228..812c367 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -7,10 +7,10 @@ import hydra import numpy as np -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from src.generate_embeddings import EmbeddingGenerator, EmbeddingModelName -from src.utils import ( +from src.utils.quality_evaluation_utils import ( compute_benchmark_consistency, compute_benchmark_difficulty, compute_benchmark_novelty, @@ -20,8 +20,7 @@ compute_mdm, compute_mmd, compute_pad, - constants, - fit_umap_shared, + fit_umap, ) from src.utils.data_utils import get_run_id from src.utils.diversity_metrics_dataloaders import ( @@ -274,13 +273,17 @@ def main(cfg: DictConfig) -> None: """Compute benchmark-level quality metrics from saved capability scores.""" run_id = get_run_id(cfg) - scores_root_dir = cfg.quality_eval_cfg.scores_root_dir + # Synthetic benchmark source (scores + capabilities) + synthetic_cfg = cfg.quality_eval_cfg.synthetic_source + scores_root_dir = synthetic_cfg.get("scores_root_dir") + scores_subdir = synthetic_cfg.get("scores_subdir", "scores") + if scores_root_dir: base_scores_dir = scores_root_dir else: base_scores_dir = os.path.join( constants.BASE_ARTIFACTS_DIR, - cfg.quality_eval_cfg.scores_subdir, + scores_subdir, run_id, ) logger.info("Using fallback scores directory: %s", base_scores_dir) @@ -301,19 +304,7 @@ def main(cfg: DictConfig) -> None: # For consistency: map model to list of accuracies per generation model_to_generation_accuracies: Dict[str, List[float]] = {} - # Get prior dataset names to exclude them from current dataset - prior_datasets = cfg.quality_eval_cfg.prior_datasets - prior_dataset_names = set() - for prior_path in prior_datasets: - # 
Extract the directory name from the path - prior_name = os.path.basename(os.path.normpath(prior_path)) - prior_dataset_names.add(prior_name) - for model_name in os.listdir(base_scores_dir): - # Skip if this is a prior dataset directory - if model_name in prior_dataset_names: - logger.debug("Skipping prior dataset directory: %s", model_name) - continue model_dir = os.path.join(base_scores_dir, model_name) if not os.path.isdir(model_dir): continue @@ -392,16 +383,38 @@ def main(cfg: DictConfig) -> None: except ValueError as e: logger.warning("Could not compute consistency: %s", e) - # Compute novelty if prior datasets are provided - prior_datasets = cfg.quality_eval_cfg.prior_datasets - if prior_datasets: + # Compute novelty using score dirs derived from real_data_source. + novelty_score_dirs: List[str] = [] + real_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + real_source_configs: List[Mapping[str, Any]] = [] + if real_source_cfg is not None: + cfg_container = OmegaConf.to_container(real_source_cfg, resolve=True) + if isinstance(cfg_container, list): + real_source_configs = cfg_container # type: ignore[list-item] + elif isinstance(cfg_container, Mapping): + real_source_configs = [cfg_container] # type: ignore[list-item] + + # Use synthetic_source.scores_root_dir for deriving default score dirs + scores_root_dir = synthetic_cfg.get("scores_root_dir") + for src in real_source_configs: + scores_dir = src.get("scores_dir") + if not scores_dir: + src_name = src.get("name") + if scores_root_dir and src_name: + scores_dir = os.path.join(scores_root_dir, src_name) + if scores_dir: + novelty_score_dirs.append(str(scores_dir)) + + if novelty_score_dirs: try: logger.info("Loading prior datasets for novelty computation...") prior_datasets_accuracies: List[Dict[str, float]] = [] - for prior_dir in prior_datasets: + prior_labels: List[str] = [] + for prior_dir in novelty_score_dirs: prior_acc = _load_avg_model_accuracies_from_dir(prior_dir) if prior_acc: 
prior_datasets_accuracies.append(prior_acc) + prior_labels.append(os.path.basename(os.path.normpath(prior_dir))) logger.info( "Loaded prior dataset from %s: %d models", prior_dir, @@ -413,22 +426,38 @@ def main(cfg: DictConfig) -> None: ) if prior_datasets_accuracies: - novelty = compute_benchmark_novelty( - model_to_accuracy, - cast(List[Mapping[str, float]], prior_datasets_accuracies), - ) - logger.info("Benchmark novelty: %.4f", novelty) + novelty_mode = str( + cfg.quality_eval_cfg.get("novelty_mode", "combined") + ).lower() + if novelty_mode in ("combined", "both"): + novelty = compute_benchmark_novelty( + model_to_accuracy, + cast( + List[Mapping[str, float]], prior_datasets_accuracies + ), + ) + logger.info("Benchmark novelty (combined): %.4f", novelty) + if novelty_mode in ("per_dataset", "both"): + for label, prior_acc in zip( + prior_labels, prior_datasets_accuracies + ): + n_per = compute_benchmark_novelty( + model_to_accuracy, [prior_acc] + ) + logger.info( + "Novelty[%s]: %.4f", label, n_per + ) else: logger.warning( - "No valid prior datasets found, skipping novelty computation." + "No valid real data score dirs found (real_data_source with scores_dir or name), skipping novelty computation." 
) except ValueError as e: logger.warning("Could not compute novelty: %s", e) except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) - # Compute embedding-based metrics if capabilities directory is provided - capabilities_dir = cfg.quality_eval_cfg.capabilities_dir + # Compute embedding-based metrics if synthetic capabilities directory is provided + capabilities_dir = synthetic_cfg.get("capabilities_dir") if capabilities_dir: internal_diversity_metrics = cfg.quality_eval_cfg.internal_diversity_metrics comparison_metrics = cfg.quality_eval_cfg.comparison_metrics @@ -436,11 +465,6 @@ def main(cfg: DictConfig) -> None: embedding_backend = cfg.quality_eval_cfg.embedding_backend embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions - # Get dataloader config if provided - synth_dataloader_config = cfg.quality_eval_cfg.synthetic_dataloader_config - if synth_dataloader_config: - synth_dataloader_config = dict(synth_dataloader_config) - logger.info( "Computing embedding-based metrics for capabilities in %s", capabilities_dir ) @@ -450,7 +474,7 @@ def main(cfg: DictConfig) -> None: capabilities_dir=capabilities_dir, embedding_model_name=embedding_model, embed_dimensions=embed_dimensions, - dataloader_config=synth_dataloader_config, + dataloader_config=None, embedding_backend=embedding_backend, ) @@ -458,58 +482,136 @@ def main(cfg: DictConfig) -> None: logger.warning("No embeddings generated, skipping diversity metrics") else: real_embeddings = None - # Check if real data directory/file is provided for comparison - real_data_dir = cfg.quality_eval_cfg.real_data_dir - real_dataloader_config = cfg.quality_eval_cfg.real_dataloader_config - - # Real data: valid path or dataloader config (e.g. 
HuggingFace) - has_real_data = False - # Case 1: local path (capability/JSONL/CSV formats) - if ( - real_data_dir - and (os.path.isdir(real_data_dir) or os.path.isfile(real_data_dir)) - or real_dataloader_config - and real_dataloader_config.get("type") == "huggingface" - ): - has_real_data = True - - if has_real_data: - # Get real data dataloader config if provided - if real_dataloader_config: - real_dataloader_config = dict(real_dataloader_config) - - if real_data_dir: - logger.info("Loading real data embeddings from %s", real_data_dir) + # Real data sources for comparison metrics (PAD, MMD, KL) + real_mode = str( + cfg.quality_eval_cfg.get("real_comparison_mode", "pooled") + ).lower() + real_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + + # Normalize to a list of source configs: each with optional name, path, dataloader. + # real_data_source can be a single mapping or a list of mappings. + real_source_configs: List[Dict[str, Any]] = [] + if real_source_cfg is None: + logger.info( + "real_data_source is not set in config; skipping comparison metrics (PAD, MMD, KL)." 
+ ) + else: + cfg_container = OmegaConf.to_container(real_source_cfg, resolve=True) + if isinstance(cfg_container, list): + raw_list: List[Any] = cfg_container + elif isinstance(cfg_container, Mapping): + raw_list = [cfg_container] + else: + raw_list = [] + for i, src in enumerate(raw_list): + src_dict = dict(src) + src_dict.setdefault("name", f"real_{i}") + real_source_configs.append(src_dict) + + real_embeddings_list: List[np.ndarray] = [] + real_names: List[str] = [] + + # Load embeddings for each real source + for src in real_source_configs: + name = src.get("name", "real") + real_data_path = src.get("path") + real_dataloader_cfg = src.get("dataloader") + if real_dataloader_cfg is not None and not isinstance( + real_dataloader_cfg, dict + ): + real_dataloader_cfg = dict( + OmegaConf.to_container(real_dataloader_cfg, resolve=True) + ) + + has_real_data = False + if real_data_path and ( + os.path.isdir(real_data_path) or os.path.isfile(real_data_path) + ): + has_real_data = True + elif real_dataloader_cfg and real_dataloader_cfg.get( + "type" + ) == "huggingface": + has_real_data = True + + if not has_real_data: + logger.info( + "Skipping real source %s: no valid path or dataloader (type=huggingface) provided", + name, + ) + continue + + if real_dataloader_cfg is None: + real_dataloader_cfg = {} + + if real_data_path: + logger.info("Loading real data embeddings from %s", real_data_path) else: logger.info( - "Loading real data embeddings using dataloader config (no local path)" + "Loading real data embeddings for %s using dataloader config (no local path)", + name, ) - real_embeddings, _ = _load_capabilities_and_generate_embeddings( - # HuggingFace: capabilities_dir unused, pass empty string - capabilities_dir=real_data_dir or "", + + emb_real, _ = _load_capabilities_and_generate_embeddings( + capabilities_dir=real_data_path or "", embedding_model_name=embedding_model, embed_dimensions=embed_dimensions, - dataloader_config=real_dataloader_config, + 
dataloader_config=real_dataloader_cfg, embedding_backend=embedding_backend, ) - - if len(real_embeddings) > 0: - # Comparison metrics (need both synth and real) - if "pad" in comparison_metrics: - try: + if emb_real is None or len(emb_real) == 0: + logger.warning( + "No real data embeddings generated for source %s, skipping it", + name, + ) + continue + + real_embeddings_list.append(emb_real) + real_names.append(name) + + if real_embeddings_list: + # Pooled real embeddings (used for KL + joint UMAP, and for PAD/MMD in 'pooled' mode) + real_embeddings = np.vstack(real_embeddings_list) + + # Comparison metrics (need both synth and real) + if "pad" in comparison_metrics: + try: + if real_mode == "per_dataset" and len(real_embeddings_list) > 1: + for name, emb_real in zip(real_names, real_embeddings_list): + pad_score = compute_pad( + synth_embeddings, + emb_real, + classifier_name=cfg.quality_eval_cfg.pad_classifier, + ) + logger.info("PAD[%s]: %.4f", name, pad_score) + else: pad_score = compute_pad( synth_embeddings, real_embeddings, classifier_name=cfg.quality_eval_cfg.pad_classifier, ) - logger.info("PAD score: %.4f", pad_score) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing PAD: %s", e) - - if "mmd" in comparison_metrics: - try: - mmd_kernel = cfg.quality_eval_cfg.mmd_kernel - mmd_degree = cfg.quality_eval_cfg.mmd_degree + logger.info("PAD (pooled real): %.4f", pad_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing PAD: %s", e) + + if "mmd" in comparison_metrics: + try: + mmd_kernel = cfg.quality_eval_cfg.mmd_kernel + mmd_degree = cfg.quality_eval_cfg.mmd_degree + if real_mode == "per_dataset" and len(real_embeddings_list) > 1: + for name, emb_real in zip(real_names, real_embeddings_list): + mmd_score = compute_mmd( + synth_embeddings, + emb_real, + kernel=mmd_kernel, + degree=mmd_degree, + ) + logger.info( + "MMD[%s] (%s kernel): %.4f", + name, + mmd_kernel, + mmd_score, + ) + else: mmd_score = compute_mmd( 
synth_embeddings, real_embeddings, @@ -517,55 +619,49 @@ def main(cfg: DictConfig) -> None: degree=mmd_degree, ) logger.info( - "MMD score (%s kernel): %.4f", mmd_kernel, mmd_score + "MMD (pooled real, %s kernel): %.4f", + mmd_kernel, + mmd_score, ) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing MMD: %s", e) - else: - logger.warning( - "No real data embeddings generated, skipping comparison metrics" - ) - else: - logger.info( - "No real_data_dir provided, skipping comparison metrics (require real data)" + except Exception as e: # noqa: BLE001 + logger.warning("Error computing MMD: %s", e) + elif real_source_configs: + logger.warning( + "No real data embeddings could be generated for any source. " + "Check dataloader config (e.g. dataset_name, text_field) and embedding API/network." ) + # When real_source_configs is empty we already logged that real_data_source is not set - # Joint UMAP + # Joint UMAP (for entropy and/or KL in shared space) + has_real = ( + real_embeddings is not None and len(real_embeddings) > 0 + ) umap_n_components = cfg.quality_eval_cfg.umap_n_components umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors umap_min_dist = cfg.quality_eval_cfg.umap_min_dist umap_metric = cfg.quality_eval_cfg.umap_metric - need_joint_umap = umap_n_components is not None and ( + need_umap = umap_n_components is not None and ( "entropy" in internal_diversity_metrics - or ( - "kl_divergence" in comparison_metrics - and real_embeddings is not None - and len(real_embeddings) > 0 - ) + or ("kl_divergence" in comparison_metrics and has_real) ) synth_reduced = None real_reduced = None - if need_joint_umap: - all_emb = [synth_embeddings] - if real_embeddings is not None and len(real_embeddings) > 0: - all_emb.append(real_embeddings) - reduced_list = fit_umap_shared( - all_emb, + if need_umap: + embeddings_to_reduce = [synth_embeddings] + if has_real: + embeddings_to_reduce.append(real_embeddings) + reduced_list = fit_umap( + 
embeddings_to_reduce, umap_n_components, n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, metric=umap_metric, ) synth_reduced = reduced_list[0] - if len(reduced_list) > 1: - real_reduced = reduced_list[1] + real_reduced = reduced_list[1] if len(reduced_list) > 1 else None # KL divergence (joint UMAP so synth and real share a space) - if ( - "kl_divergence" in comparison_metrics - and real_embeddings is not None - and len(real_embeddings) > 0 - ): + if "kl_divergence" in comparison_metrics and has_real: try: kl_k = cfg.quality_eval_cfg.kl_k kl_synth = ( diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 02d2cef..5c96a0c 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -16,5 +16,5 @@ compute_mdm, compute_mmd, compute_pad, - fit_umap_shared, + fit_umap, ) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index f5a9154..e7719c5 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -486,7 +486,7 @@ def compute_mdm( # =========================== -def fit_umap_shared( +def fit_umap( embeddings_list: List[np.ndarray], n_components: int, n_neighbors: int = 15, From 10bd2e6b31478b0569cac871f4ad230adc2923e4 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 6 Feb 2026 23:52:15 -0500 Subject: [PATCH 10/14] Refactored main function --- src/cfg/run_quality_evaluation_cfg.yaml | 31 +- src/run_quality_evaluation.py | 975 ++++++++++++++---------- 2 files changed, 574 insertions(+), 432 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index e183c0c..00acebf 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -11,14 +11,30 @@ quality_eval_cfg: # Capability directory for the synthetic benchmark capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" + # List of metrics to compute. 
Available metrics: + # - Benchmark metrics: "difficulty", "separability", "consistency" + # - Novelty: "novelty" + # - Internal diversity: "mdm", "entropy" + # - Comparison metrics: "pad", "mmd", "kl_divergence" + metrics_to_compute: + - "difficulty" + - "separability" + - "consistency" + # - "novelty" + - "mdm" + - "entropy" + - "pad" + - "mmd" + - "kl_divergence" + # Novelty: "combined" = one score from all real sources (linear regression on all); # "per_dataset" = one novelty per prior (how novel vs each benchmark separately); # "both" = report combined and per-dataset. novelty_mode: "combined" # "combined" | "per_dataset" | "both" - # Source(s) of REAL data used for comparison metrics (PAD, MMD, KL). + # Source(s) of REAL data used for comparison metrics (PAD, MMD, KL) and novelty. # real_data_source can be: - # - a single mapping {path, dataloader, name}, OR + # - a single mapping {path, dataloader, name, scores_dir}, OR # - a list of such mappings when you have multiple real datasets. 
# # When multiple sources are provided, real_comparison_mode controls whether @@ -58,17 +74,6 @@ quality_eval_cfg: # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) embedding_dimensions: 3072 - # Internal diversity metrics (only need synthetic data) - internal_diversity_metrics: - - "mdm" # Mean Distance to Medoid - measures internal coherence - - "entropy" # Differential Entropy - measures diversity/uncertainty - - # Comparison metrics (need both synthetic and real data) - comparison_metrics: - - "pad" # Proxy-A-Distance - measures distribution similarity - - "mmd" # Maximum Mean Discrepancy - measures distribution distance - - "kl_divergence" # KL Divergence - measures novelty (how different from real) - pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 812c367..a761de9 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -3,7 +3,7 @@ import json import logging import os -from typing import Any, Dict, List, Mapping, Optional, cast +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import hydra import numpy as np @@ -23,6 +23,7 @@ fit_umap, ) from src.utils.data_utils import get_run_id +from src.utils import constants from src.utils.diversity_metrics_dataloaders import ( CapabilityDataloader, CSVDataloader, @@ -36,16 +37,243 @@ logger = logging.getLogger(__name__) -def _collect_accuracies_from_dir(directory: str) -> List[float]: +def _validate_metric_requirements(cfg: DictConfig) -> None: """ - Collect all accuracy values from JSON files in a directory (recursively). + Validate that all required data is provided for the requested metrics. - Args: - directory: Directory to walk recursively for JSON files. + Raises ValueError if any required data is missing. 
+ """ + metrics_to_compute = cfg.quality_eval_cfg.get("metrics_to_compute", []) + if not metrics_to_compute: + raise ValueError( + "metrics_to_compute must be specified in config. " + "Available metrics: difficulty, separability, consistency, novelty, " + "mdm, entropy, pad, mmd, kl_divergence" + ) - Returns - ------- - List of accuracy values found in the directory. + benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + + # Benchmark metrics (difficulty, separability, consistency) need scores + benchmark_metrics = {"difficulty", "separability", "consistency"} + if benchmark_metrics.intersection(metrics_to_compute): + scores_root_dir = benchmark_source_cfg.get("scores_root_dir") + scores_subdir = benchmark_source_cfg.get("scores_subdir", "scores") + run_id = get_run_id(cfg) + + if scores_root_dir: + base_scores_dir = scores_root_dir + else: + base_scores_dir = os.path.join( + constants.BASE_ARTIFACTS_DIR, scores_subdir, run_id + ) + + if not os.path.isdir(base_scores_dir): + raise ValueError( + f"Benchmark metrics ({benchmark_metrics.intersection(metrics_to_compute)}) " + f"require scores directory to exist. " + f"benchmark scores_root_dir or fallback directory not found: {base_scores_dir}" + ) + + # Check that scores directory contains at least one model subdirectory + model_dirs = [ + d for d in os.listdir(base_scores_dir) + if os.path.isdir(os.path.join(base_scores_dir, d)) + ] + if not model_dirs: + raise ValueError( + f"Scores directory '{base_scores_dir}' exists but contains no model subdirectories. " + "Please ensure scores are generated for at least one model." 
+ ) + + # For consistency metric, check that at least one model has generation subdirectories + if "consistency" in metrics_to_compute: + has_generations = False + for model_name in model_dirs: + model_dir = os.path.join(base_scores_dir, model_name) + subdirs = [ + d for d in os.listdir(model_dir) + if os.path.isdir(os.path.join(model_dir, d)) + ] + if subdirs: + has_generations = True + break + if not has_generations: + raise ValueError( + f"Consistency metric requires multiple generations per model " + f"(subdirectories in model directories), but none found in {base_scores_dir}" + ) + + # Internal diversity metrics (mdm, entropy) need capabilities_dir + internal_metrics = {"mdm", "entropy"} + if internal_metrics.intersection(metrics_to_compute): + capabilities_dir = benchmark_source_cfg.get("capabilities_dir") + if not capabilities_dir: + raise ValueError( + f"Internal diversity metrics ({internal_metrics.intersection(metrics_to_compute)}) " + "require benchmark capabilities_dir" + ) + if not os.path.isdir(capabilities_dir): + raise ValueError( + f"benchmark capabilities_dir does not exist: {capabilities_dir}" + ) + # Check that capabilities_dir contains at least one capability.json file + single_cap_json = os.path.join(capabilities_dir, "capability.json") + if not os.path.exists(single_cap_json): + # Check subdirectories + has_capability = False + for item_name in os.listdir(capabilities_dir): + item_path = os.path.join(capabilities_dir, item_name) + if os.path.isdir(item_path): + cap_json = os.path.join(item_path, "capability.json") + if os.path.exists(cap_json): + has_capability = True + break + if not has_capability: + raise ValueError( + f"benchmark capabilities_dir '{capabilities_dir}' exists but contains " + "no capability.json files (neither directly nor in subdirectories)" + ) + + # Comparison metrics (pad, mmd, kl_divergence) need benchmark + reference data + comparison_metrics = {"pad", "mmd", "kl_divergence"} + if 
comparison_metrics.intersection(metrics_to_compute): + capabilities_dir = benchmark_source_cfg.get("capabilities_dir") + if not capabilities_dir: + raise ValueError( + f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " + "require benchmark capabilities_dir" + ) + if not os.path.isdir(capabilities_dir): + raise ValueError( + f"benchmark capabilities_dir does not exist: {capabilities_dir}" + ) + # Check that capabilities_dir contains at least one capability.json file + single_cap_json = os.path.join(capabilities_dir, "capability.json") + if not os.path.exists(single_cap_json): + # Check subdirectories + has_capability = False + for item_name in os.listdir(capabilities_dir): + item_path = os.path.join(capabilities_dir, item_name) + if os.path.isdir(item_path): + cap_json = os.path.join(item_path, "capability.json") + if os.path.exists(cap_json): + has_capability = True + break + if not has_capability: + raise ValueError( + f"benchmark capabilities_dir '{capabilities_dir}' exists but contains " + "no capability.json files (neither directly nor in subdirectories)" + ) + + if reference_data_source_cfg is None: + raise ValueError( + f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " + "require reference_data_source to be configured" + ) + + # Validate each reference source has either path or dataloader + cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) + sources = [] + if isinstance(cfg_container, list): + sources = cfg_container + elif isinstance(cfg_container, Mapping): + sources = [cfg_container] + + if not sources: + raise ValueError( + f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " + "require at least one reference_data_source entry" + ) + + for i, src in enumerate(sources): + src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + name = src_dict.get("name", f"reference_{i}") + path = src_dict.get("path") 
+ dataloader = src_dict.get("dataloader") + + has_path = path and (os.path.isdir(path) or os.path.isfile(path)) + has_dataloader = dataloader and dataloader.get("type") == "huggingface" + + if not (has_path or has_dataloader): + raise ValueError( + f"reference_data_source[{i}] ({name}) must have either a valid 'path' " + "(existing file/directory) or 'dataloader' with type='huggingface'" + ) + + # Novelty needs reference_data_source with score directories (prior accuracies) + if "novelty" in metrics_to_compute: + if reference_data_source_cfg is None: + raise ValueError("Novelty metric requires reference_data_source (prior accuracies) to be configured") + + cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) + sources = [] + if isinstance(cfg_container, list): + sources = cfg_container + elif isinstance(cfg_container, Mapping): + sources = [cfg_container] + + if not sources: + raise ValueError("Novelty metric requires at least one reference_data_source entry (for prior accuracies)") + + scores_root_dir = benchmark_source_cfg.get("scores_root_dir") + has_valid_score_dir = False + checked: List[str] = [] + for i, src in enumerate(sources): + src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + scores_dir = src_dict.get("scores_dir") + if not scores_dir: + src_name = src_dict.get("name") + if scores_root_dir and src_name: + scores_dir = os.path.join(scores_root_dir, src_name) + else: + if not scores_root_dir: + checked.append( + f"entry {i} (name={src_dict.get('name')}): no scores_dir and scores_root_dir not set" + ) + elif not src_name: + checked.append(f"entry {i}: no scores_dir and no name to derive from scores_root_dir") + continue + if not scores_dir: + continue + if not os.path.isdir(scores_dir): + checked.append(f"{scores_dir!r} (does not exist)") + continue + model_dirs = [ + d for d in os.listdir(scores_dir) + if os.path.isdir(os.path.join(scores_dir, d)) + ] + if not model_dirs: + 
checked.append(f"{scores_dir!r} (exists but has no model subdirectories)") + continue + has_json = False + for model_name in model_dirs: + model_dir = os.path.join(scores_dir, model_name) + json_files = [f for f in os.listdir(model_dir) if f.endswith(".json")] + if json_files: + has_json = True + break + if has_json: + has_valid_score_dir = True + break + checked.append(f"{scores_dir!r} (exists, has model subdirs but no .json score files)") + + if not has_valid_score_dir: + detail = "; ".join(checked) if checked else "no scores_dir/name derived paths to check" + raise ValueError( + "Novelty uses real/reference data via prior accuracies: model scores from evaluating " + "models on those reference datasets (e.g. MATH-500, MATH-Hard). You must have run that " + "evaluation and saved scores so they exist at scores_dir (or scores_root_dir/). " + "Each directory must contain one subdir per model with Inspect eval JSON score files. " + f"Checked: {detail}. " + "Either run evaluation on the reference datasets and save scores there, or remove 'novelty' from metrics_to_compute." + ) + + +def _collect_accuracies_from_inspect_eval_dir(directory: str) -> List[float]: + """ + Collect all accuracy values from Inspect eval JSON files in a directory (recursively). + Single primitive: one dir -> list of accuracies. 
""" accuracies: List[float] = [] for root, _dirs, files in os.walk(directory): @@ -53,39 +281,42 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: if not fname.endswith(".json"): continue json_path = os.path.join(root, fname) - acc = _extract_accuracy_from_inspect_json(json_path) - if acc is not None: + try: + with open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + except Exception as exc: # noqa: BLE001 + logger.warning("Failed to read %s: %s", json_path, exc) + continue + try: + if "error" in data or "results" not in data: + continue + scores = data["results"]["scores"] + if not scores: + continue + acc = float(scores[0]["metrics"]["accuracy"]["value"]) accuracies.append(acc) + except (KeyError, TypeError, ValueError) as exc: + logger.warning("Failed to extract accuracy from %s: %s", json_path, exc) return accuracies -def _load_avg_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: +def _load_average_accuracy_per_model_from_scores_dir(base_dir: str) -> Dict[str, float]: """ - Load model accuracies from a directory structure. - - Args: - base_dir: Directory containing per-model subdirectories with JSON files. - - Returns - ------- - Dictionary mapping model name to average accuracy. + Load a scores directory with one subdir per model (each containing Inspect eval JSONs) + and return model name -> average accuracy. Used for prior (reference) score dirs (e.g. novelty). + Returns empty dict if base_dir does not exist. 
""" model_to_accuracy: Dict[str, float] = {} - if not os.path.isdir(base_dir): logger.warning("Directory does not exist: %s", base_dir) return model_to_accuracy - for model_name in os.listdir(base_dir): model_dir = os.path.join(base_dir, model_name) if not os.path.isdir(model_dir): continue - - accuracies = _collect_accuracies_from_dir(model_dir) + accuracies = _collect_accuracies_from_inspect_eval_dir(model_dir) if accuracies: - avg_acc = sum(accuracies) / len(accuracies) - model_to_accuracy[model_name] = avg_acc - + model_to_accuracy[model_name] = sum(accuracies) / len(accuracies) return model_to_accuracy @@ -240,43 +471,12 @@ def _load_capabilities_and_generate_embeddings( return embeddings_array, texts -def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: - """Extract the accuracy metric from a single Inspect eval JSON file.""" - try: - with open(json_path, "r", encoding="utf-8") as f: - data = json.load(f) - except Exception as exc: # noqa: BLE001 - logger.warning("Failed to read %s: %s", json_path, exc) - return None - - try: - # Check if file has results (successful evaluation) or error (failed evaluation) - if "error" in data or "results" not in data: - # File has error or no results, skip it - return None - - scores = data["results"]["scores"] - if not scores: - return None - metrics = scores[0]["metrics"] - acc = metrics["accuracy"]["value"] - return float(acc) - except (KeyError, TypeError, ValueError) as exc: - logger.warning("Failed to extract accuracy from %s: %s", json_path, exc) - return None - - -@hydra.main( - version_base=None, config_path="cfg", config_name="run_quality_evaluation_cfg" -) -def main(cfg: DictConfig) -> None: - """Compute benchmark-level quality metrics from saved capability scores.""" +def _load_benchmark_scores(cfg: DictConfig) -> Tuple[Dict[str, float], Dict[str, List[float]]]: + """Load model accuracies from the benchmark (evaluated) scores directory. 
Validation has already run.""" run_id = get_run_id(cfg) - - # Synthetic benchmark source (scores + capabilities) - synthetic_cfg = cfg.quality_eval_cfg.synthetic_source - scores_root_dir = synthetic_cfg.get("scores_root_dir") - scores_subdir = synthetic_cfg.get("scores_subdir", "scores") + benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + scores_root_dir = benchmark_source_cfg.get("scores_root_dir") + scores_subdir = benchmark_source_cfg.get("scores_subdir", "scores") if scores_root_dir: base_scores_dir = scores_root_dir @@ -286,22 +486,9 @@ def main(cfg: DictConfig) -> None: scores_subdir, run_id, ) - logger.info("Using fallback scores directory: %s", base_scores_dir) - - if not os.path.isdir(base_scores_dir): - logger.error( - "Scores directory '%s' does not exist. " - "Please ensure scores are generated for run_id '%s'.", - base_scores_dir, - run_id, - ) - return logger.info("Loading model accuracies from %s", base_scores_dir) - - # For each model directory, walk all JSON files and average their accuracies. 
model_to_accuracy: Dict[str, float] = {} - # For consistency: map model to list of accuracies per generation model_to_generation_accuracies: Dict[str, List[float]] = {} for model_name in os.listdir(base_scores_dir): @@ -309,7 +496,6 @@ def main(cfg: DictConfig) -> None: if not os.path.isdir(model_dir): continue - # Check if model_dir contains subdirectories (generations/runs) subdirs = [ d for d in os.listdir(model_dir) @@ -317,13 +503,10 @@ def main(cfg: DictConfig) -> None: ] if subdirs: - # Structure: model_dir/generation_dir/...json files - # Each subdirectory represents a different dataset generation generation_accuracies: List[float] = [] for gen_dir_name in sorted(subdirs): gen_dir = os.path.join(model_dir, gen_dir_name) - gen_accuracies = _collect_accuracies_from_dir(gen_dir) - + gen_accuracies = _collect_accuracies_from_inspect_eval_dir(gen_dir) if gen_accuracies: avg_gen_acc = sum(gen_accuracies) / len(gen_accuracies) generation_accuracies.append(avg_gen_acc) @@ -334,10 +517,8 @@ def main(cfg: DictConfig) -> None: avg_gen_acc, len(gen_accuracies), ) - if generation_accuracies: model_to_generation_accuracies[model_name] = generation_accuracies - # Overall average across all generations avg_acc = sum(generation_accuracies) / len(generation_accuracies) model_to_accuracy[model_name] = avg_acc logger.info( @@ -346,17 +527,11 @@ def main(cfg: DictConfig) -> None: len(generation_accuracies), avg_acc, ) - # Continue to next model if we processed subdirs continue - # Structure: model_dir/...json files (no generation subdirectories) - accuracies = _collect_accuracies_from_dir(model_dir) + accuracies = _collect_accuracies_from_inspect_eval_dir(model_dir) if not accuracies: - logger.warning( - "No accuracies found for model '%s' in %s", model_name, model_dir - ) continue - avg_acc = sum(accuracies) / len(accuracies) model_to_accuracy[model_name] = avg_acc logger.info( @@ -367,361 +542,323 @@ def main(cfg: DictConfig) -> None: ) if not model_to_accuracy: - 
logger.error("No valid model accuracies found in %s", base_scores_dir) + raise RuntimeError( + f"Unexpected: No valid model accuracies found in {base_scores_dir} " + "despite validation passing. This may indicate a race condition or file system issue." + ) + return model_to_accuracy, model_to_generation_accuracies + + +def _compute_benchmark_metrics( + model_to_accuracy: Dict[str, float], + model_to_generation_accuracies: Dict[str, List[float]], + metrics_to_compute: set, +) -> None: + """Compute difficulty, separability, and consistency from model accuracies.""" + if "difficulty" in metrics_to_compute: + difficulty = compute_benchmark_difficulty(model_to_accuracy) + logger.info("Benchmark difficulty: %.4f", difficulty) + + if "separability" in metrics_to_compute: + separability = compute_benchmark_separability(model_to_accuracy) + logger.info("Benchmark separability: %.4f", separability) + + if "consistency" in metrics_to_compute: + if not model_to_generation_accuracies: + raise RuntimeError( + "Unexpected: No model generation accuracies found despite validation passing. " + "This may indicate a race condition or file system issue." + ) + consistency = compute_benchmark_consistency(model_to_generation_accuracies) + logger.info("Benchmark consistency: %.4f", consistency) + + +def _compute_novelty_metrics( + cfg: DictConfig, + benchmark_source_cfg: DictConfig, + model_to_accuracy: Dict[str, float], + metrics_to_compute: set, +) -> None: + """Load previous (prior) accuracies and compute novelty vs benchmark. 
Combined and/or per-dataset.""" + if "novelty" not in metrics_to_compute: return - difficulty = compute_benchmark_difficulty(model_to_accuracy) - separability = compute_benchmark_separability(model_to_accuracy) - logger.info("Benchmark difficulty: %.4f", difficulty) - logger.info("Benchmark separability: %.4f", separability) - - # Compute consistency if we have multiple generations per model - if model_to_generation_accuracies: - try: - consistency = compute_benchmark_consistency(model_to_generation_accuracies) - logger.info("Benchmark consistency: %.4f", consistency) - except ValueError as e: - logger.warning("Could not compute consistency: %s", e) - - # Compute novelty using score dirs derived from real_data_source. - novelty_score_dirs: List[str] = [] - real_source_cfg = cfg.quality_eval_cfg.get("real_data_source") - real_source_configs: List[Mapping[str, Any]] = [] - if real_source_cfg is not None: - cfg_container = OmegaConf.to_container(real_source_cfg, resolve=True) - if isinstance(cfg_container, list): - real_source_configs = cfg_container # type: ignore[list-item] - elif isinstance(cfg_container, Mapping): - real_source_configs = [cfg_container] # type: ignore[list-item] + reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) + prior_source_configs = ( + cfg_container if isinstance(cfg_container, list) else [cfg_container] + ) - # Use synthetic_source.scores_root_dir for deriving default score dirs - scores_root_dir = synthetic_cfg.get("scores_root_dir") - for src in real_source_configs: - scores_dir = src.get("scores_dir") + scores_root_dir = benchmark_source_cfg.get("scores_root_dir") + prior_score_dirs: List[str] = [] + for src in prior_source_configs: + src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + scores_dir = src_dict.get("scores_dir") if not scores_dir: - src_name = src.get("name") + src_name = 
src_dict.get("name") if scores_root_dir and src_name: scores_dir = os.path.join(scores_root_dir, src_name) if scores_dir: - novelty_score_dirs.append(str(scores_dir)) + prior_score_dirs.append(str(scores_dir)) + + logger.info("Loading prior (previous) accuracies for novelty computation...") + prior_datasets_accuracies: List[Dict[str, float]] = [] + prior_labels: List[str] = [] + for prior_dir in prior_score_dirs: + prior_acc = _load_average_accuracy_per_model_from_scores_dir(prior_dir) + if not prior_acc: + raise RuntimeError( + f"Unexpected: No accuracies found in prior dataset {prior_dir} " + "despite validation passing. This may indicate a race condition or file system issue." + ) + prior_datasets_accuracies.append(prior_acc) + prior_labels.append(os.path.basename(os.path.normpath(prior_dir))) + logger.info( + "Loaded prior dataset from %s: %d models", + prior_dir, + len(prior_acc), + ) - if novelty_score_dirs: - try: - logger.info("Loading prior datasets for novelty computation...") - prior_datasets_accuracies: List[Dict[str, float]] = [] - prior_labels: List[str] = [] - for prior_dir in novelty_score_dirs: - prior_acc = _load_avg_model_accuracies_from_dir(prior_dir) - if prior_acc: - prior_datasets_accuracies.append(prior_acc) - prior_labels.append(os.path.basename(os.path.normpath(prior_dir))) - logger.info( - "Loaded prior dataset from %s: %d models", - prior_dir, - len(prior_acc), - ) - else: - logger.warning( - "No accuracies found in prior dataset: %s", prior_dir - ) + novelty_mode = str(cfg.quality_eval_cfg.get("novelty_mode", "combined")).lower() + if novelty_mode in ("combined", "both"): + novelty = compute_benchmark_novelty( + model_to_accuracy, + cast(List[Mapping[str, float]], prior_datasets_accuracies), + ) + logger.info("Benchmark novelty (combined): %.4f", novelty) + if novelty_mode in ("per_dataset", "both"): + for label, prior_acc in zip(prior_labels, prior_datasets_accuracies): + n_per = compute_benchmark_novelty(model_to_accuracy, 
[prior_acc]) + logger.info("Novelty[%s]: %.4f", label, n_per) + + +def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) -> None: + """Load benchmark and reference embeddings; compute PAD, MMD, KL, MDM, entropy.""" + internal_metrics = {"mdm", "entropy"} + comparison_metrics = {"pad", "mmd", "kl_divergence"} + needs_embeddings = bool( + internal_metrics.intersection(metrics_to_compute) + or comparison_metrics.intersection(metrics_to_compute) + ) + if not needs_embeddings: + return - if prior_datasets_accuracies: - novelty_mode = str( - cfg.quality_eval_cfg.get("novelty_mode", "combined") - ).lower() - if novelty_mode in ("combined", "both"): - novelty = compute_benchmark_novelty( - model_to_accuracy, - cast( - List[Mapping[str, float]], prior_datasets_accuracies - ), - ) - logger.info("Benchmark novelty (combined): %.4f", novelty) - if novelty_mode in ("per_dataset", "both"): - for label, prior_acc in zip( - prior_labels, prior_datasets_accuracies - ): - n_per = compute_benchmark_novelty( - model_to_accuracy, [prior_acc] - ) - logger.info( - "Novelty[%s]: %.4f", label, n_per - ) - else: - logger.warning( - "No valid real data score dirs found (real_data_source with scores_dir or name), skipping novelty computation." 
- ) - except ValueError as e: - logger.warning("Could not compute novelty: %s", e) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing novelty: %s", e) - - # Compute embedding-based metrics if synthetic capabilities directory is provided - capabilities_dir = synthetic_cfg.get("capabilities_dir") - if capabilities_dir: - internal_diversity_metrics = cfg.quality_eval_cfg.internal_diversity_metrics - comparison_metrics = cfg.quality_eval_cfg.comparison_metrics - embedding_model = cfg.quality_eval_cfg.embedding_model - embedding_backend = cfg.quality_eval_cfg.embedding_backend - embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions + benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + capabilities_dir = benchmark_source_cfg.get("capabilities_dir") + embedding_model = cfg.quality_eval_cfg.embedding_model + embedding_backend = cfg.quality_eval_cfg.embedding_backend + embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions - logger.info( - "Computing embedding-based metrics for capabilities in %s", capabilities_dir + logger.info( + "Computing embedding-based metrics for capabilities in %s", capabilities_dir + ) + benchmark_embeddings, capabilities = _load_capabilities_and_generate_embeddings( + capabilities_dir=capabilities_dir, + embedding_model_name=embedding_model, + embed_dimensions=embed_dimensions, + dataloader_config=None, + embedding_backend=embedding_backend, + ) + if len(benchmark_embeddings) == 0: + raise RuntimeError( + f"Unexpected: No embeddings generated from {capabilities_dir} " + "despite validation passing. This may indicate an embedding API/network issue." 
) - # Load capabilities and generate embeddings - synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( - capabilities_dir=capabilities_dir, - embedding_model_name=embedding_model, - embed_dimensions=embed_dimensions, - dataloader_config=None, - embedding_backend=embedding_backend, + reference_embeddings = None + reference_embeddings_list: List[np.ndarray] = [] + reference_names: List[str] = [] + + if comparison_metrics.intersection(metrics_to_compute): + reference_comparison_mode = str( + cfg.quality_eval_cfg.get("real_comparison_mode", "pooled") + ).lower() + reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) + raw_list = ( + cfg_container if isinstance(cfg_container, list) else [cfg_container] ) + reference_source_configs: List[Dict[str, Any]] = [] + for i, src in enumerate(raw_list): + src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict.setdefault("name", f"reference_{i}") + reference_source_configs.append(src_dict) + + for src in reference_source_configs: + name = src.get("name", "reference") + reference_data_path = src.get("path") + reference_dataloader_cfg = src.get("dataloader") + if reference_dataloader_cfg is not None and not isinstance(reference_dataloader_cfg, dict): + reference_dataloader_cfg = dict( + OmegaConf.to_container(reference_dataloader_cfg, resolve=True) + ) + if reference_dataloader_cfg is None: + reference_dataloader_cfg = {} - if len(synth_embeddings) == 0: - logger.warning("No embeddings generated, skipping diversity metrics") - else: - real_embeddings = None - # Real data sources for comparison metrics (PAD, MMD, KL) - real_mode = str( - cfg.quality_eval_cfg.get("real_comparison_mode", "pooled") - ).lower() - real_source_cfg = cfg.quality_eval_cfg.get("real_data_source") - - # Normalize to a list of source configs: each with optional name, path, 
dataloader. - # real_data_source can be a single mapping or a list of mappings. - real_source_configs: List[Dict[str, Any]] = [] - if real_source_cfg is None: + if reference_data_path: + logger.info("Loading reference data embeddings from %s", reference_data_path) + else: logger.info( - "real_data_source is not set in config; skipping comparison metrics (PAD, MMD, KL)." + "Loading reference data embeddings for %s using dataloader config (no local path)", + name, ) - else: - cfg_container = OmegaConf.to_container(real_source_cfg, resolve=True) - if isinstance(cfg_container, list): - raw_list: List[Any] = cfg_container - elif isinstance(cfg_container, Mapping): - raw_list = [cfg_container] + ref_emb, _ = _load_capabilities_and_generate_embeddings( + capabilities_dir=reference_data_path or "", + embedding_model_name=embedding_model, + embed_dimensions=embed_dimensions, + dataloader_config=reference_dataloader_cfg, + embedding_backend=embedding_backend, + ) + if ref_emb is None or len(ref_emb) == 0: + raise RuntimeError( + f"Failed to generate embeddings for reference source {name}. " + "Config validation passed, but embedding generation failed. " + "Check embedding API/network connectivity and dataloader configuration." 
+ ) + reference_embeddings_list.append(ref_emb) + reference_names.append(name) + + if reference_embeddings_list: + reference_embeddings = np.vstack(reference_embeddings_list) + + if "pad" in metrics_to_compute: + if reference_comparison_mode == "per_dataset" and len(reference_embeddings_list) > 1: + for name, ref_emb in zip(reference_names, reference_embeddings_list): + pad_score = compute_pad( + benchmark_embeddings, + ref_emb, + classifier_name=cfg.quality_eval_cfg.pad_classifier, + ) + logger.info("PAD[%s]: %.4f", name, pad_score) else: - raw_list = [] - for i, src in enumerate(raw_list): - src_dict = dict(src) - src_dict.setdefault("name", f"real_{i}") - real_source_configs.append(src_dict) - - real_embeddings_list: List[np.ndarray] = [] - real_names: List[str] = [] - - # Load embeddings for each real source - for src in real_source_configs: - name = src.get("name", "real") - real_data_path = src.get("path") - real_dataloader_cfg = src.get("dataloader") - if real_dataloader_cfg is not None and not isinstance( - real_dataloader_cfg, dict - ): - real_dataloader_cfg = dict( - OmegaConf.to_container(real_dataloader_cfg, resolve=True) + pad_score = compute_pad( + benchmark_embeddings, + reference_embeddings, + classifier_name=cfg.quality_eval_cfg.pad_classifier, + ) + logger.info("PAD (pooled reference): %.4f", pad_score) + + if "mmd" in metrics_to_compute: + mmd_kernel = cfg.quality_eval_cfg.mmd_kernel + mmd_degree = cfg.quality_eval_cfg.mmd_degree + if reference_comparison_mode == "per_dataset" and len(reference_embeddings_list) > 1: + for name, ref_emb in zip(reference_names, reference_embeddings_list): + mmd_score = compute_mmd( + benchmark_embeddings, + ref_emb, + kernel=mmd_kernel, + degree=mmd_degree, + ) + logger.info( + "MMD[%s] (%s kernel): %.4f", + name, + mmd_kernel, + mmd_score, + ) + else: + mmd_score = compute_mmd( + benchmark_embeddings, + reference_embeddings, + kernel=mmd_kernel, + degree=mmd_degree, ) - - has_real_data = False - if real_data_path 
and ( - os.path.isdir(real_data_path) or os.path.isfile(real_data_path) - ): - has_real_data = True - elif real_dataloader_cfg and real_dataloader_cfg.get( - "type" - ) == "huggingface": - has_real_data = True - - if not has_real_data: logger.info( - "Skipping real source %s: no valid path or dataloader (type=huggingface) provided", - name, + "MMD (pooled reference, %s kernel): %.4f", + mmd_kernel, + mmd_score, ) - continue - if real_dataloader_cfg is None: - real_dataloader_cfg = {} + has_reference = reference_embeddings is not None and len(reference_embeddings) > 0 + umap_n_components = cfg.quality_eval_cfg.umap_n_components + umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors + umap_min_dist = cfg.quality_eval_cfg.umap_min_dist + umap_metric = cfg.quality_eval_cfg.umap_metric + need_umap = umap_n_components is not None and ( + "entropy" in metrics_to_compute + or ("kl_divergence" in metrics_to_compute and has_reference) + ) + benchmark_reduced = None + reference_reduced = None + if need_umap: + embeddings_to_reduce = [benchmark_embeddings] + if has_reference: + embeddings_to_reduce.append(reference_embeddings) + reduced_list = fit_umap( + embeddings_to_reduce, + umap_n_components, + n_neighbors=umap_n_neighbors, + min_dist=umap_min_dist, + metric=umap_metric, + ) + benchmark_reduced = reduced_list[0] + reference_reduced = reduced_list[1] if len(reduced_list) > 1 else None - if real_data_path: - logger.info("Loading real data embeddings from %s", real_data_path) - else: - logger.info( - "Loading real data embeddings for %s using dataloader config (no local path)", - name, - ) + if "kl_divergence" in metrics_to_compute: + kl_k = cfg.quality_eval_cfg.kl_k + kl_benchmark = ( + benchmark_reduced if reference_reduced is not None else benchmark_embeddings + ) + kl_reference = ( + reference_reduced if reference_reduced is not None else reference_embeddings + ) + kl_score = compute_kl_divergence(kl_benchmark, kl_reference, k=kl_k) + umap_info = ( + f" (UMAP: 
{umap_n_components}D)" if umap_n_components else "" + ) + logger.info( + "KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score + ) - emb_real, _ = _load_capabilities_and_generate_embeddings( - capabilities_dir=real_data_path or "", - embedding_model_name=embedding_model, - embed_dimensions=embed_dimensions, - dataloader_config=real_dataloader_cfg, - embedding_backend=embedding_backend, - ) - if emb_real is None or len(emb_real) == 0: - logger.warning( - "No real data embeddings generated for source %s, skipping it", - name, - ) - continue + if "mdm" in metrics_to_compute: + mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters + mdm_metric = cfg.quality_eval_cfg.mdm_metric + mdm_score = compute_mdm( + benchmark_embeddings, + n_clusters=mdm_n_clusters, + metric=mdm_metric, + ) + logger.info( + "MDM score (%d clusters, %s metric): %.4f", + mdm_n_clusters, + mdm_metric, + mdm_score, + ) - real_embeddings_list.append(emb_real) - real_names.append(name) - - if real_embeddings_list: - # Pooled real embeddings (used for KL + joint UMAP, and for PAD/MMD in 'pooled' mode) - real_embeddings = np.vstack(real_embeddings_list) - - # Comparison metrics (need both synth and real) - if "pad" in comparison_metrics: - try: - if real_mode == "per_dataset" and len(real_embeddings_list) > 1: - for name, emb_real in zip(real_names, real_embeddings_list): - pad_score = compute_pad( - synth_embeddings, - emb_real, - classifier_name=cfg.quality_eval_cfg.pad_classifier, - ) - logger.info("PAD[%s]: %.4f", name, pad_score) - else: - pad_score = compute_pad( - synth_embeddings, - real_embeddings, - classifier_name=cfg.quality_eval_cfg.pad_classifier, - ) - logger.info("PAD (pooled real): %.4f", pad_score) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing PAD: %s", e) - - if "mmd" in comparison_metrics: - try: - mmd_kernel = cfg.quality_eval_cfg.mmd_kernel - mmd_degree = cfg.quality_eval_cfg.mmd_degree - if real_mode == "per_dataset" and 
len(real_embeddings_list) > 1: - for name, emb_real in zip(real_names, real_embeddings_list): - mmd_score = compute_mmd( - synth_embeddings, - emb_real, - kernel=mmd_kernel, - degree=mmd_degree, - ) - logger.info( - "MMD[%s] (%s kernel): %.4f", - name, - mmd_kernel, - mmd_score, - ) - else: - mmd_score = compute_mmd( - synth_embeddings, - real_embeddings, - kernel=mmd_kernel, - degree=mmd_degree, - ) - logger.info( - "MMD (pooled real, %s kernel): %.4f", - mmd_kernel, - mmd_score, - ) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing MMD: %s", e) - elif real_source_configs: - logger.warning( - "No real data embeddings could be generated for any source. " - "Check dataloader config (e.g. dataset_name, text_field) and embedding API/network." - ) - # When real_source_configs is empty we already logged that real_data_source is not set + if "entropy" in metrics_to_compute: + entropy_k = cfg.quality_eval_cfg.entropy_k + entropy_emb = ( + benchmark_reduced if benchmark_reduced is not None else benchmark_embeddings + ) + entropy_score = compute_differential_entropy(entropy_emb, k=entropy_k) + umap_info = ( + f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + ) + logger.info( + "Differential entropy score (k=%d)%s: %.4f", + entropy_k, + umap_info, + entropy_score, + ) - # Joint UMAP (for entropy and/or KL in shared space) - has_real = ( - real_embeddings is not None and len(real_embeddings) > 0 - ) - umap_n_components = cfg.quality_eval_cfg.umap_n_components - umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors - umap_min_dist = cfg.quality_eval_cfg.umap_min_dist - umap_metric = cfg.quality_eval_cfg.umap_metric - need_umap = umap_n_components is not None and ( - "entropy" in internal_diversity_metrics - or ("kl_divergence" in comparison_metrics and has_real) - ) - synth_reduced = None - real_reduced = None - if need_umap: - embeddings_to_reduce = [synth_embeddings] - if has_real: - embeddings_to_reduce.append(real_embeddings) - 
reduced_list = fit_umap( - embeddings_to_reduce, - umap_n_components, - n_neighbors=umap_n_neighbors, - min_dist=umap_min_dist, - metric=umap_metric, - ) - synth_reduced = reduced_list[0] - real_reduced = reduced_list[1] if len(reduced_list) > 1 else None - - # KL divergence (joint UMAP so synth and real share a space) - if "kl_divergence" in comparison_metrics and has_real: - try: - kl_k = cfg.quality_eval_cfg.kl_k - kl_synth = ( - synth_reduced if real_reduced is not None else synth_embeddings - ) - kl_real = ( - real_reduced if real_reduced is not None else real_embeddings - ) - if kl_synth is not None and kl_real is not None: - kl_score = compute_kl_divergence(kl_synth, kl_real, k=kl_k) - else: - kl_score = 0.0 - umap_info = ( - f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - ) - logger.info( - "KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score - ) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing KL divergence: %s", e) - - # Compute internal diversity metrics (only need synthetic data) - if "mdm" in internal_diversity_metrics: - try: - mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters - mdm_metric = cfg.quality_eval_cfg.mdm_metric - mdm_score = compute_mdm( - synth_embeddings, - n_clusters=mdm_n_clusters, - metric=mdm_metric, - ) - logger.info( - "MDM score (%d clusters, %s metric): %.4f", - mdm_n_clusters, - mdm_metric, - mdm_score, - ) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing MDM: %s", e) - - if "entropy" in internal_diversity_metrics: - try: - entropy_k = cfg.quality_eval_cfg.entropy_k - entropy_emb = ( - synth_reduced if synth_reduced is not None else synth_embeddings - ) - entropy_score = compute_differential_entropy( - entropy_emb, k=entropy_k - ) - umap_info = ( - f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - ) - logger.info( - "Differential entropy score (k=%d)%s: %.4f", - entropy_k, - umap_info, - entropy_score, - ) - except Exception as e: # 
noqa: BLE001 - logger.warning("Error computing differential entropy: %s", e) + +@hydra.main( + version_base=None, config_path="cfg", config_name="run_quality_evaluation_cfg" +) +def main(cfg: DictConfig) -> None: + """Compute benchmark-level quality metrics from saved capability scores.""" + _validate_metric_requirements(cfg) + + metrics_to_compute = set(cfg.quality_eval_cfg.metrics_to_compute) + benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + + model_to_accuracy, model_to_generation_accuracies = _load_benchmark_scores(cfg) + _compute_benchmark_metrics( + model_to_accuracy, + model_to_generation_accuracies, + metrics_to_compute, + ) + _compute_novelty_metrics(cfg, benchmark_source_cfg, model_to_accuracy, metrics_to_compute) + _compute_embedding_based_metrics(cfg, metrics_to_compute) if __name__ == "__main__": From 31d73c4bef9c99e199cbed0ad7d236ed5497a53d Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 12 Feb 2026 00:03:55 -0500 Subject: [PATCH 11/14] Updated quality evaluation --- src/cfg/run_quality_evaluation_cfg.yaml | 27 +-- src/run_quality_evaluation.py | 271 +++++++++++++----------- 2 files changed, 149 insertions(+), 149 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 00acebf..9682bfd 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -2,13 +2,8 @@ prompt_cfg: sys_msg: Compute benchmark quality metrics from existing scores. 
quality_eval_cfg: - # Synthetic benchmark source (scores + capabilities) - synthetic_source: - # Root directory containing per-model score subdirs for the synthetic benchmark + new_data_source: scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" - # Optional subdirectory name when falling back to BASE_ARTIFACTS_DIR - scores_subdir: "scores" - # Capability directory for the synthetic benchmark capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" # List of metrics to compute. Available metrics: @@ -20,32 +15,24 @@ quality_eval_cfg: - "difficulty" - "separability" - "consistency" - # - "novelty" + - "novelty" - "mdm" - "entropy" - "pad" - "mmd" - "kl_divergence" - # Novelty: "combined" = one score from all real sources (linear regression on all); - # "per_dataset" = one novelty per prior (how novel vs each benchmark separately); - # "both" = report combined and per-dataset. - novelty_mode: "combined" # "combined" | "per_dataset" | "both" - - # Source(s) of REAL data used for comparison metrics (PAD, MMD, KL) and novelty. - # real_data_source can be: + # Source(s) of previous data used for comparison metrics (PAD, MMD, KL) and novelty. + # previous_data_sources can be: # - a single mapping {path, dataloader, name, scores_dir}, OR # - a list of such mappings when you have multiple real datasets. - # - # When multiple sources are provided, real_comparison_mode controls whether - # they are pooled together into one real distribution ("pooled") or compared - # pairwise against the synthetic data ("per_dataset") for PAD/MMD. - real_comparison_mode: "pooled" # or "per_dataset" + # PAD and MMD are always reported per previous-data source. + # UMAP (for entropy/KL) is fit on all new + all previous data combined. # Example: multiple real datasets (HuggingFace math benchmarks). # Novelty uses score dirs from each source: set scores_dir explicitly, or # we use scores_root_dir/ when name is set. 
- real_data_source: + previous_data_sources: - name: "MATH-500" path: null # Optional: explicit scores directory for novelty; otherwise uses diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index a761de9..957b945 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -10,6 +10,14 @@ from omegaconf import DictConfig, OmegaConf from src.generate_embeddings import EmbeddingGenerator, EmbeddingModelName +from src.utils.diversity_metrics_dataloaders import ( + CapabilityDataloader, + CSVDataloader, + DatasetDataloader, + HuggingFaceDatasetDataloader, + JSONLDataloader, + load_texts_from_dataloader, +) from src.utils.quality_evaluation_utils import ( compute_benchmark_consistency, compute_benchmark_difficulty, @@ -22,21 +30,26 @@ compute_pad, fit_umap, ) -from src.utils.data_utils import get_run_id -from src.utils import constants -from src.utils.diversity_metrics_dataloaders import ( - CapabilityDataloader, - CSVDataloader, - DatasetDataloader, - HuggingFaceDatasetDataloader, - JSONLDataloader, - load_texts_from_dataloader, -) logger = logging.getLogger(__name__) +def _as_dict(obj: Any) -> Dict[str, Any]: + """ + Convert an OmegaConf container-like object to a plain dict. + + Raises if the object cannot be represented as a mapping. + """ + if isinstance(obj, dict): + return obj + container = OmegaConf.to_container(obj, resolve=True) + if isinstance(container, Mapping): + mapping = cast(Mapping[str, Any], container) + return dict(mapping) + raise TypeError(f"Expected mapping-like config, got: {type(container)}") + + def _validate_metric_requirements(cfg: DictConfig) -> None: """ Validate that all required data is provided for the requested metrics. 
@@ -51,33 +64,37 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: "mdm, entropy, pad, mmd, kl_divergence" ) - benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source - reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source + reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") # Benchmark metrics (difficulty, separability, consistency) need scores benchmark_metrics = {"difficulty", "separability", "consistency"} if benchmark_metrics.intersection(metrics_to_compute): scores_root_dir = benchmark_source_cfg.get("scores_root_dir") - scores_subdir = benchmark_source_cfg.get("scores_subdir", "scores") - run_id = get_run_id(cfg) - - if scores_root_dir: - base_scores_dir = scores_root_dir - else: - base_scores_dir = os.path.join( - constants.BASE_ARTIFACTS_DIR, scores_subdir, run_id + + if not scores_root_dir: + raise ValueError( + "Benchmark metrics " + f"({benchmark_metrics.intersection(metrics_to_compute)}) " + "require 'scores_root_dir' to be set in " + "quality_eval_cfg.new_data_source. " + "Please provide the path to the directory containing one " + "subdirectory per subject model." ) - + + base_scores_dir = scores_root_dir + if not os.path.isdir(base_scores_dir): raise ValueError( f"Benchmark metrics ({benchmark_metrics.intersection(metrics_to_compute)}) " f"require scores directory to exist. " f"benchmark scores_root_dir or fallback directory not found: {base_scores_dir}" ) - + # Check that scores directory contains at least one model subdirectory model_dirs = [ - d for d in os.listdir(base_scores_dir) + d + for d in os.listdir(base_scores_dir) if os.path.isdir(os.path.join(base_scores_dir, d)) ] if not model_dirs: @@ -85,14 +102,16 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: f"Scores directory '{base_scores_dir}' exists but contains no model subdirectories. " "Please ensure scores are generated for at least one model." 
) - - # For consistency metric, check that at least one model has generation subdirectories + + # For consistency metric, check that at least one model has generation + # subdirectories. if "consistency" in metrics_to_compute: has_generations = False for model_name in model_dirs: model_dir = os.path.join(base_scores_dir, model_name) subdirs = [ - d for d in os.listdir(model_dir) + d + for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) ] if subdirs: @@ -169,7 +188,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if reference_data_source_cfg is None: raise ValueError( f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " - "require reference_data_source to be configured" + "require previous_data_sources to be configured" ) # Validate each reference source has either path or dataloader @@ -183,11 +202,11 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not sources: raise ValueError( f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " - "require at least one reference_data_source entry" + "require at least one previous_data_sources entry" ) for i, src in enumerate(sources): - src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict = _as_dict(src) name = src_dict.get("name", f"reference_{i}") path = src_dict.get("path") dataloader = src_dict.get("dataloader") @@ -197,14 +216,16 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not (has_path or has_dataloader): raise ValueError( - f"reference_data_source[{i}] ({name}) must have either a valid 'path' " + f"previous_data_sources[{i}] ({name}) must have either a valid 'path' " "(existing file/directory) or 'dataloader' with type='huggingface'" ) - # Novelty needs reference_data_source with score directories (prior accuracies) + # Novelty needs previous_data_sources with score directories (prior accuracies) if "novelty" in metrics_to_compute: if 
reference_data_source_cfg is None: - raise ValueError("Novelty metric requires reference_data_source (prior accuracies) to be configured") + raise ValueError( + "Novelty metric requires previous_data_sources (prior accuracies) to be configured" + ) cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) sources = [] @@ -214,13 +235,15 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: sources = [cfg_container] if not sources: - raise ValueError("Novelty metric requires at least one reference_data_source entry (for prior accuracies)") + raise ValueError( + "Novelty metric requires at least one previous_data_sources entry (for prior accuracies)" + ) scores_root_dir = benchmark_source_cfg.get("scores_root_dir") has_valid_score_dir = False checked: List[str] = [] for i, src in enumerate(sources): - src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict = _as_dict(src) scores_dir = src_dict.get("scores_dir") if not scores_dir: src_name = src_dict.get("name") @@ -232,7 +255,9 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: f"entry {i} (name={src_dict.get('name')}): no scores_dir and scores_root_dir not set" ) elif not src_name: - checked.append(f"entry {i}: no scores_dir and no name to derive from scores_root_dir") + checked.append( + f"entry {i}: no scores_dir and no name to derive from scores_root_dir" + ) continue if not scores_dir: continue @@ -240,11 +265,14 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: checked.append(f"{scores_dir!r} (does not exist)") continue model_dirs = [ - d for d in os.listdir(scores_dir) + d + for d in os.listdir(scores_dir) if os.path.isdir(os.path.join(scores_dir, d)) ] if not model_dirs: - checked.append(f"{scores_dir!r} (exists but has no model subdirectories)") + checked.append( + f"{scores_dir!r} (exists but has no model subdirectories)" + ) continue has_json = False for model_name in model_dirs: @@ -256,10 +284,16 
@@ def _validate_metric_requirements(cfg: DictConfig) -> None: if has_json: has_valid_score_dir = True break - checked.append(f"{scores_dir!r} (exists, has model subdirs but no .json score files)") + checked.append( + f"{scores_dir!r} (exists, has model subdirs but no .json score files)" + ) if not has_valid_score_dir: - detail = "; ".join(checked) if checked else "no scores_dir/name derived paths to check" + detail = ( + "; ".join(checked) + if checked + else "no scores_dir/name derived paths to check" + ) raise ValueError( "Novelty uses real/reference data via prior accuracies: model scores from evaluating " "models on those reference datasets (e.g. MATH-500, MATH-Hard). You must have run that " @@ -272,7 +306,11 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: def _collect_accuracies_from_inspect_eval_dir(directory: str) -> List[float]: """ - Collect all accuracy values from Inspect eval JSON files in a directory (recursively). + Collect accuracy values from Inspect eval JSON files. + + Recursively walks a directory and extracts accuracy values from Inspect eval + JSON files. + Single primitive: one dir -> list of accuracies. """ accuracies: List[float] = [] @@ -302,8 +340,12 @@ def _collect_accuracies_from_inspect_eval_dir(directory: str) -> List[float]: def _load_average_accuracy_per_model_from_scores_dir(base_dir: str) -> Dict[str, float]: """ - Load a scores directory with one subdir per model (each containing Inspect eval JSONs) - and return model name -> average accuracy. Used for prior (reference) score dirs (e.g. novelty). + Load a scores directory with one subdir per model. + + Each model subdir contains Inspect eval JSON files. + + And return model name -> average accuracy. Used for prior (reference) score dirs + (e.g. novelty). Returns empty dict if base_dir does not exist. 
""" model_to_accuracy: Dict[str, float] = {} @@ -471,22 +513,25 @@ def _load_capabilities_and_generate_embeddings( return embeddings_array, texts -def _load_benchmark_scores(cfg: DictConfig) -> Tuple[Dict[str, float], Dict[str, List[float]]]: - """Load model accuracies from the benchmark (evaluated) scores directory. Validation has already run.""" - run_id = get_run_id(cfg) - benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source +def _load_benchmark_scores( + cfg: DictConfig, +) -> Tuple[Dict[str, float], Dict[str, List[float]]]: + """Load benchmark model accuracies from the evaluated scores directory. + + Validation has already run. + """ + benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source scores_root_dir = benchmark_source_cfg.get("scores_root_dir") - scores_subdir = benchmark_source_cfg.get("scores_subdir", "scores") - if scores_root_dir: - base_scores_dir = scores_root_dir - else: - base_scores_dir = os.path.join( - constants.BASE_ARTIFACTS_DIR, - scores_subdir, - run_id, + if not scores_root_dir: + raise ValueError( + "scores_root_dir must be set in quality_eval_cfg.new_data_source " + "to load benchmark scores. It should point to a directory that " + "contains one subdirectory per subject model." ) + base_scores_dir = scores_root_dir + logger.info("Loading model accuracies from %s", base_scores_dir) model_to_accuracy: Dict[str, float] = {} model_to_generation_accuracies: Dict[str, List[float]] = {} @@ -579,11 +624,11 @@ def _compute_novelty_metrics( model_to_accuracy: Dict[str, float], metrics_to_compute: set, ) -> None: - """Load previous (prior) accuracies and compute novelty vs benchmark. 
Combined and/or per-dataset.""" + """Load prior accuracies and compute one novelty metric using all priors.""" if "novelty" not in metrics_to_compute: return - reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) prior_source_configs = ( cfg_container if isinstance(cfg_container, list) else [cfg_container] @@ -592,7 +637,7 @@ def _compute_novelty_metrics( scores_root_dir = benchmark_source_cfg.get("scores_root_dir") prior_score_dirs: List[str] = [] for src in prior_source_configs: - src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict = _as_dict(src) scores_dir = src_dict.get("scores_dir") if not scores_dir: src_name = src_dict.get("name") @@ -603,7 +648,6 @@ def _compute_novelty_metrics( logger.info("Loading prior (previous) accuracies for novelty computation...") prior_datasets_accuracies: List[Dict[str, float]] = [] - prior_labels: List[str] = [] for prior_dir in prior_score_dirs: prior_acc = _load_average_accuracy_per_model_from_scores_dir(prior_dir) if not prior_acc: @@ -612,24 +656,17 @@ def _compute_novelty_metrics( "despite validation passing. This may indicate a race condition or file system issue." 
) prior_datasets_accuracies.append(prior_acc) - prior_labels.append(os.path.basename(os.path.normpath(prior_dir))) logger.info( "Loaded prior dataset from %s: %d models", prior_dir, len(prior_acc), ) - novelty_mode = str(cfg.quality_eval_cfg.get("novelty_mode", "combined")).lower() - if novelty_mode in ("combined", "both"): - novelty = compute_benchmark_novelty( - model_to_accuracy, - cast(List[Mapping[str, float]], prior_datasets_accuracies), - ) - logger.info("Benchmark novelty (combined): %.4f", novelty) - if novelty_mode in ("per_dataset", "both"): - for label, prior_acc in zip(prior_labels, prior_datasets_accuracies): - n_per = compute_benchmark_novelty(model_to_accuracy, [prior_acc]) - logger.info("Novelty[%s]: %.4f", label, n_per) + novelty = compute_benchmark_novelty( + model_to_accuracy, + cast(List[Mapping[str, float]], prior_datasets_accuracies), + ) + logger.info("Benchmark novelty: %.4f", novelty) def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) -> None: @@ -643,7 +680,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - if not needs_embeddings: return - benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source capabilities_dir = benchmark_source_cfg.get("capabilities_dir") embedding_model = cfg.quality_eval_cfg.embedding_model embedding_backend = cfg.quality_eval_cfg.embedding_backend @@ -670,17 +707,12 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - reference_names: List[str] = [] if comparison_metrics.intersection(metrics_to_compute): - reference_comparison_mode = str( - cfg.quality_eval_cfg.get("real_comparison_mode", "pooled") - ).lower() - reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) - raw_list = ( - 
cfg_container if isinstance(cfg_container, list) else [cfg_container] - ) + raw_list = cfg_container if isinstance(cfg_container, list) else [cfg_container] reference_source_configs: List[Dict[str, Any]] = [] for i, src in enumerate(raw_list): - src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict = _as_dict(src) src_dict.setdefault("name", f"reference_{i}") reference_source_configs.append(src_dict) @@ -688,15 +720,17 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - name = src.get("name", "reference") reference_data_path = src.get("path") reference_dataloader_cfg = src.get("dataloader") - if reference_dataloader_cfg is not None and not isinstance(reference_dataloader_cfg, dict): - reference_dataloader_cfg = dict( - OmegaConf.to_container(reference_dataloader_cfg, resolve=True) - ) + if reference_dataloader_cfg is not None and not isinstance( + reference_dataloader_cfg, dict + ): + reference_dataloader_cfg = _as_dict(reference_dataloader_cfg) if reference_dataloader_cfg is None: reference_dataloader_cfg = {} if reference_data_path: - logger.info("Loading reference data embeddings from %s", reference_data_path) + logger.info( + "Loading reference data embeddings from %s", reference_data_path + ) else: logger.info( "Loading reference data embeddings for %s using dataloader config (no local path)", @@ -722,48 +756,27 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - reference_embeddings = np.vstack(reference_embeddings_list) if "pad" in metrics_to_compute: - if reference_comparison_mode == "per_dataset" and len(reference_embeddings_list) > 1: - for name, ref_emb in zip(reference_names, reference_embeddings_list): - pad_score = compute_pad( - benchmark_embeddings, - ref_emb, - classifier_name=cfg.quality_eval_cfg.pad_classifier, - ) - logger.info("PAD[%s]: %.4f", name, pad_score) - else: + for name, ref_emb in zip(reference_names, 
reference_embeddings_list): pad_score = compute_pad( benchmark_embeddings, - reference_embeddings, + ref_emb, classifier_name=cfg.quality_eval_cfg.pad_classifier, ) - logger.info("PAD (pooled reference): %.4f", pad_score) + logger.info("PAD[%s]: %.4f", name, pad_score) if "mmd" in metrics_to_compute: mmd_kernel = cfg.quality_eval_cfg.mmd_kernel mmd_degree = cfg.quality_eval_cfg.mmd_degree - if reference_comparison_mode == "per_dataset" and len(reference_embeddings_list) > 1: - for name, ref_emb in zip(reference_names, reference_embeddings_list): - mmd_score = compute_mmd( - benchmark_embeddings, - ref_emb, - kernel=mmd_kernel, - degree=mmd_degree, - ) - logger.info( - "MMD[%s] (%s kernel): %.4f", - name, - mmd_kernel, - mmd_score, - ) - else: + for name, ref_emb in zip(reference_names, reference_embeddings_list): mmd_score = compute_mmd( benchmark_embeddings, - reference_embeddings, + ref_emb, kernel=mmd_kernel, degree=mmd_degree, ) logger.info( - "MMD (pooled reference, %s kernel): %.4f", + "MMD[%s] (%s kernel): %.4f", + name, mmd_kernel, mmd_score, ) @@ -782,6 +795,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - if need_umap: embeddings_to_reduce = [benchmark_embeddings] if has_reference: + assert reference_embeddings is not None embeddings_to_reduce.append(reference_embeddings) reduced_list = fit_umap( embeddings_to_reduce, @@ -795,19 +809,18 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - if "kl_divergence" in metrics_to_compute: kl_k = cfg.quality_eval_cfg.kl_k - kl_benchmark = ( - benchmark_reduced if reference_reduced is not None else benchmark_embeddings - ) - kl_reference = ( - reference_reduced if reference_reduced is not None else reference_embeddings - ) + if reference_reduced is not None: + assert benchmark_reduced is not None + kl_benchmark = benchmark_reduced + kl_reference = reference_reduced + else: + kl_benchmark = benchmark_embeddings + assert reference_embeddings is not 
None + kl_reference = reference_embeddings + assert kl_reference is not None kl_score = compute_kl_divergence(kl_benchmark, kl_reference, k=kl_k) - umap_info = ( - f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - ) - logger.info( - "KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score - ) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) if "mdm" in metrics_to_compute: mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters @@ -830,9 +843,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - benchmark_reduced if benchmark_reduced is not None else benchmark_embeddings ) entropy_score = compute_differential_entropy(entropy_emb, k=entropy_k) - umap_info = ( - f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - ) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" logger.info( "Differential entropy score (k=%d)%s: %.4f", entropy_k, @@ -849,7 +860,7 @@ def main(cfg: DictConfig) -> None: _validate_metric_requirements(cfg) metrics_to_compute = set(cfg.quality_eval_cfg.metrics_to_compute) - benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source model_to_accuracy, model_to_generation_accuracies = _load_benchmark_scores(cfg) _compute_benchmark_metrics( @@ -857,7 +868,9 @@ def main(cfg: DictConfig) -> None: model_to_generation_accuracies, metrics_to_compute, ) - _compute_novelty_metrics(cfg, benchmark_source_cfg, model_to_accuracy, metrics_to_compute) + _compute_novelty_metrics( + cfg, benchmark_source_cfg, model_to_accuracy, metrics_to_compute + ) _compute_embedding_based_metrics(cfg, metrics_to_compute) From 3b818bae7fb24f045c1949fbe3c0d6fdef87bbd6 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 13 Feb 2026 23:15:36 -0500 Subject: [PATCH 12/14] Chnaged names --- src/cfg/run_quality_evaluation_cfg.yaml | 151 
++++++++++++------------ src/run_quality_evaluation.py | 62 +++++----- 2 files changed, 106 insertions(+), 107 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 9682bfd..0a00388 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -1,94 +1,93 @@ prompt_cfg: sys_msg: Compute benchmark quality metrics from existing scores. -quality_eval_cfg: - new_data_source: - scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" - capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" +target_data: + scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" + capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" - # List of metrics to compute. Available metrics: - # - Benchmark metrics: "difficulty", "separability", "consistency" - # - Novelty: "novelty" - # - Internal diversity: "mdm", "entropy" - # - Comparison metrics: "pad", "mmd", "kl_divergence" - metrics_to_compute: - - "difficulty" - - "separability" - - "consistency" - - "novelty" - - "mdm" - - "entropy" - - "pad" - - "mmd" - - "kl_divergence" +# List of metrics to compute. Available metrics: +# - Benchmark metrics: "difficulty", "separability", "consistency" +# - Novelty: "novelty" +# - Internal diversity: "mdm", "entropy" +# - Comparison metrics: "pad", "mmd", "kl_divergence" +metrics_to_compute: + - "difficulty" + - "separability" + - "consistency" + - "novelty" + - "mdm" + - "entropy" + - "pad" + - "mmd" + - "kl_divergence" - # Source(s) of previous data used for comparison metrics (PAD, MMD, KL) and novelty. - # previous_data_sources can be: - # - a single mapping {path, dataloader, name, scores_dir}, OR - # - a list of such mappings when you have multiple real datasets. - # PAD and MMD are always reported per previous-data source. 
- # UMAP (for entropy/KL) is fit on all new + all previous data combined. +# Source(s) of previous data used for comparison metrics (PAD, MMD, KL) and novelty. +# reference_datasets can be: +# - a single mapping {path, dataloader, name, scores_dir}, OR +# - a list of such mappings when you have multiple real datasets. +# PAD and MMD are always reported per previous-data source. +# UMAP (for entropy/KL) is fit on all new + all previous data combined. - # Example: multiple real datasets (HuggingFace math benchmarks). - # Novelty uses score dirs from each source: set scores_dir explicitly, or - # we use scores_root_dir/ when name is set. - previous_data_sources: - - name: "MATH-500" - path: null - # Optional: explicit scores directory for novelty; otherwise uses - # scores_root_dir/name - scores_dir: null - dataloader: - type: "huggingface" - dataset_name: "HuggingFaceH4/MATH-500" - split: "test" - subset: null - text_field: "problem" +# Example: multiple real datasets (HuggingFace math benchmarks). +# Novelty uses score dirs from each source: set scores_dir explicitly, or +# we use scores_root_dir/ when name is set. 
+reference_datasets: + - name: "MATH-500" + path: null + # Optional: explicit scores directory for novelty; otherwise uses + # scores_root_dir/name + scores_dir: null + dataloader: + type: "huggingface" + dataset_name: "HuggingFaceH4/MATH-500" + split: "test" + subset: null + text_field: "problem" - - name: "MATH-Hard" - path: null - scores_dir: null - dataloader: - type: "huggingface" - dataset_name: "lighteval/MATH-Hard" - split: "test" - subset: null - text_field: "problem" + - name: "MATH-Hard" + path: null + scores_dir: null + dataloader: + type: "huggingface" + dataset_name: "lighteval/MATH-Hard" + split: "test" + subset: null + text_field: "problem" - # embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers - embedding_backend: "openai" - embedding_model: "text-embedding-3-large" - # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) - embedding_dimensions: 3072 +# embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers +embedding_backend: "openai" +embedding_model: "text-embedding-3-large" +# embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) +embedding_dimensions: 3072 - pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" +pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" - mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" - mmd_degree: 3 +mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" +mmd_degree: 3 - mdm_n_clusters: 5 - mdm_metric: "euclidean" +mdm_n_clusters: 5 +mdm_metric: "euclidean" - entropy_k: 4 # Number of nearest neighbors for differential entropy computation +entropy_k: 4 # Number of nearest neighbors for differential entropy computation - kl_k: 4 # Number of nearest neighbors for KL divergence computation +kl_k: 4 # Number of nearest 
neighbors for KL divergence computation - # Optional UMAP dimensionality reduction (like InfoSynth) - umap_n_components: 10 # Set to null to disable and use original embeddings - umap_n_neighbors: 15 # Number of neighbors for UMAP - umap_min_dist: 0.1 # Minimum distance for UMAP - umap_metric: "cosine" # Distance metric for UMAP +# Optional UMAP dimensionality reduction (like InfoSynth) +umap_n_components: 10 # Set to null to disable and use original embeddings +umap_n_neighbors: 15 # Number of neighbors for UMAP +umap_min_dist: 0.1 # Minimum distance for UMAP +umap_metric: "cosine" # Distance metric for UMAP - # Evaluation settings to use if we need to (re-)evaluate prior or real datasets. - # These mirror the subject_llm settings in src/cfg/run_cfg.yaml. - evaluation_cfg: - subject_llm: - name: "o1-mini" - provider: "openai" - generation_cfg: - temperature: 0.7 - max_tokens: 2048 - seed: 42 +# Evaluation settings to use if we need to (re-)evaluate prior or real datasets. +# These mirror the subject_llm settings in src/cfg/run_cfg.yaml. +evaluation_cfg: + subject_llm: + name: "o1-mini" + provider: "openai" + generation_cfg: + temperature: 0.7 + max_tokens: 2048 + seed: 42 exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 957b945..75ff8d3 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -56,7 +56,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: Raises ValueError if any required data is missing. """ - metrics_to_compute = cfg.quality_eval_cfg.get("metrics_to_compute", []) + metrics_to_compute = cfg.get("metrics_to_compute", []) if not metrics_to_compute: raise ValueError( "metrics_to_compute must be specified in config. 
" @@ -64,8 +64,8 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: "mdm, entropy, pad, mmd, kl_divergence" ) - benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source - reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") + benchmark_source_cfg = cfg.target_data + reference_data_source_cfg = cfg.get("reference_datasets") # Benchmark metrics (difficulty, separability, consistency) need scores benchmark_metrics = {"difficulty", "separability", "consistency"} @@ -77,7 +77,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: "Benchmark metrics " f"({benchmark_metrics.intersection(metrics_to_compute)}) " "require 'scores_root_dir' to be set in " - "quality_eval_cfg.new_data_source. " + "target_data. " "Please provide the path to the directory containing one " "subdirectory per subject model." ) @@ -188,7 +188,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if reference_data_source_cfg is None: raise ValueError( f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " - "require previous_data_sources to be configured" + "require reference_datasets to be configured" ) # Validate each reference source has either path or dataloader @@ -202,7 +202,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not sources: raise ValueError( f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " - "require at least one previous_data_sources entry" + "require at least one reference_datasets entry" ) for i, src in enumerate(sources): @@ -216,15 +216,15 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not (has_path or has_dataloader): raise ValueError( - f"previous_data_sources[{i}] ({name}) must have either a valid 'path' " + f"reference_datasets[{i}] ({name}) must have either a valid 'path' " "(existing file/directory) or 'dataloader' with type='huggingface'" ) - # Novelty needs previous_data_sources with score directories (prior accuracies) 
+ # Novelty needs reference_datasets with score directories (prior accuracies) if "novelty" in metrics_to_compute: if reference_data_source_cfg is None: raise ValueError( - "Novelty metric requires previous_data_sources (prior accuracies) to be configured" + "Novelty metric requires reference_datasets (prior accuracies) to be configured" ) cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) @@ -236,7 +236,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not sources: raise ValueError( - "Novelty metric requires at least one previous_data_sources entry (for prior accuracies)" + "Novelty metric requires at least one reference_datasets entry (for prior accuracies)" ) scores_root_dir = benchmark_source_cfg.get("scores_root_dir") @@ -520,12 +520,12 @@ def _load_benchmark_scores( Validation has already run. """ - benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source + benchmark_source_cfg = cfg.target_data scores_root_dir = benchmark_source_cfg.get("scores_root_dir") if not scores_root_dir: raise ValueError( - "scores_root_dir must be set in quality_eval_cfg.new_data_source " + "scores_root_dir must be set in target_data " "to load benchmark scores. It should point to a directory that " "contains one subdirectory per subject model." 
) @@ -628,7 +628,7 @@ def _compute_novelty_metrics( if "novelty" not in metrics_to_compute: return - reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") + reference_data_source_cfg = cfg.get("reference_datasets") cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) prior_source_configs = ( cfg_container if isinstance(cfg_container, list) else [cfg_container] @@ -680,11 +680,11 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - if not needs_embeddings: return - benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source + benchmark_source_cfg = cfg.target_data capabilities_dir = benchmark_source_cfg.get("capabilities_dir") - embedding_model = cfg.quality_eval_cfg.embedding_model - embedding_backend = cfg.quality_eval_cfg.embedding_backend - embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions + embedding_model = cfg.embedding_model + embedding_backend = cfg.embedding_backend + embed_dimensions = cfg.embedding_dimensions logger.info( "Computing embedding-based metrics for capabilities in %s", capabilities_dir @@ -707,7 +707,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - reference_names: List[str] = [] if comparison_metrics.intersection(metrics_to_compute): - reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") + reference_data_source_cfg = cfg.get("reference_datasets") cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) raw_list = cfg_container if isinstance(cfg_container, list) else [cfg_container] reference_source_configs: List[Dict[str, Any]] = [] @@ -760,13 +760,13 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - pad_score = compute_pad( benchmark_embeddings, ref_emb, - classifier_name=cfg.quality_eval_cfg.pad_classifier, + classifier_name=cfg.pad_classifier, ) logger.info("PAD[%s]: %.4f", name, pad_score) if "mmd" in metrics_to_compute: - 
mmd_kernel = cfg.quality_eval_cfg.mmd_kernel - mmd_degree = cfg.quality_eval_cfg.mmd_degree + mmd_kernel = cfg.mmd_kernel + mmd_degree = cfg.mmd_degree for name, ref_emb in zip(reference_names, reference_embeddings_list): mmd_score = compute_mmd( benchmark_embeddings, @@ -782,10 +782,10 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - ) has_reference = reference_embeddings is not None and len(reference_embeddings) > 0 - umap_n_components = cfg.quality_eval_cfg.umap_n_components - umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors - umap_min_dist = cfg.quality_eval_cfg.umap_min_dist - umap_metric = cfg.quality_eval_cfg.umap_metric + umap_n_components = cfg.umap_n_components + umap_n_neighbors = cfg.umap_n_neighbors + umap_min_dist = cfg.umap_min_dist + umap_metric = cfg.umap_metric need_umap = umap_n_components is not None and ( "entropy" in metrics_to_compute or ("kl_divergence" in metrics_to_compute and has_reference) @@ -808,7 +808,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - reference_reduced = reduced_list[1] if len(reduced_list) > 1 else None if "kl_divergence" in metrics_to_compute: - kl_k = cfg.quality_eval_cfg.kl_k + kl_k = cfg.kl_k if reference_reduced is not None: assert benchmark_reduced is not None kl_benchmark = benchmark_reduced @@ -823,8 +823,8 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) if "mdm" in metrics_to_compute: - mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters - mdm_metric = cfg.quality_eval_cfg.mdm_metric + mdm_n_clusters = cfg.mdm_n_clusters + mdm_metric = cfg.mdm_metric mdm_score = compute_mdm( benchmark_embeddings, n_clusters=mdm_n_clusters, @@ -838,7 +838,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - ) if "entropy" in metrics_to_compute: - entropy_k = cfg.quality_eval_cfg.entropy_k + entropy_k = 
cfg.entropy_k entropy_emb = ( benchmark_reduced if benchmark_reduced is not None else benchmark_embeddings ) @@ -859,8 +859,8 @@ def main(cfg: DictConfig) -> None: """Compute benchmark-level quality metrics from saved capability scores.""" _validate_metric_requirements(cfg) - metrics_to_compute = set(cfg.quality_eval_cfg.metrics_to_compute) - benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source + metrics_to_compute = set(cfg.metrics_to_compute) + benchmark_source_cfg = cfg.target_data model_to_accuracy, model_to_generation_accuracies = _load_benchmark_scores(cfg) _compute_benchmark_metrics( From 1eb6448ea3fb0f78032d1918a4508ce696760332 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Tue, 24 Feb 2026 00:09:58 -0500 Subject: [PATCH 13/14] Added pytest --- tests/src/test_differential_entropy.py | 182 +++++++++++++++++++++++++ tests/src/test_kl_divergence.py | 162 ++++++++++++++++++++++ 2 files changed, 344 insertions(+) create mode 100644 tests/src/test_differential_entropy.py create mode 100644 tests/src/test_kl_divergence.py diff --git a/tests/src/test_differential_entropy.py b/tests/src/test_differential_entropy.py new file mode 100644 index 0000000..43af4ab --- /dev/null +++ b/tests/src/test_differential_entropy.py @@ -0,0 +1,182 @@ +import numpy as np +import pytest +from scipy.special import digamma, gammaln + +from src.utils import compute_differential_entropy + + +def _rng(seed=0): + return np.random.default_rng(seed) + + +def test_returns_float_and_finite(): + rng = _rng(0) + x = rng.normal(size=(300, 16)) + h = compute_differential_entropy(x, k=4) + assert isinstance(h, float) + assert np.isfinite(h) + + +def test_permutation_invariance(): + rng = _rng(1) + x = rng.normal(size=(250, 8)) + h1 = compute_differential_entropy(x, k=4) + + x_perm = x[rng.permutation(x.shape[0])] + h2 = compute_differential_entropy(x_perm, k=4) + + assert np.isfinite(h1) and np.isfinite(h2) + assert abs(h1 - h2) < 1e-10 + + +def test_translation_invariance(): + """ + 
# NOTE(review): this chunk is a whitespace-collapsed git patch covering four pytest
# modules; it has been reconstructed into properly formatted Python. File boundaries
# are marked with banners. Test logic, literals, and assertion tolerances are
# unchanged from the patch content.

# --- tests/src/test_differential_entropy.py (tail) ---
# The module header (imports, _rng helper) and the opening of the first test lie
# before this chunk; the def line below is reconstructed from its visible
# docstring/body — TODO confirm the original function name against the full file.


def test_translation_invariance():
    """
    Differential entropy is translation-invariant; kNN estimators based on distances should be too.
    """
    rng = _rng(2)
    x = rng.normal(size=(400, 10))
    shift = rng.normal(size=(1, 10)) * 100.0

    h1 = compute_differential_entropy(x, k=4)
    h2 = compute_differential_entropy(x + shift, k=4)

    assert np.isfinite(h1) and np.isfinite(h2)
    assert abs(h1 - h2) < 1e-6


def test_scaling_increases_entropy():
    """
    Scaling embeddings by a>1 should increase entropy by about d*log(a).
    We don't require exact equality, just the direction and rough magnitude.
    """
    rng = _rng(3)
    n, d = 1200, 6
    x = rng.normal(size=(n, d))
    a = 3.0

    h1 = compute_differential_entropy(x, k=4)
    h2 = compute_differential_entropy(x * a, k=4)

    assert np.isfinite(h1) and np.isfinite(h2)
    assert h2 > h1

    expected_shift = d * np.log(a)
    assert (h2 - h1) == pytest.approx(expected_shift, abs=0.5)


def test_more_spread_more_entropy():
    """
    A distribution with larger variance should have higher differential entropy.
    """
    rng = _rng(4)
    x_small = rng.normal(size=(800, 12)) * 0.5
    x_large = rng.normal(size=(800, 12)) * 2.0

    h_small = compute_differential_entropy(x_small, k=4)
    h_large = compute_differential_entropy(x_large, k=4)

    assert np.isfinite(h_small) and np.isfinite(h_large)
    assert h_large > h_small


def test_k_affects_estimate_but_is_finite():
    """
    Different k values should produce finite results and usually slightly different estimates.
    """
    rng = _rng(5)
    x = rng.normal(size=(600, 9))
    h4 = compute_differential_entropy(x, k=4)
    h8 = compute_differential_entropy(x, k=8)

    assert np.isfinite(h4) and np.isfinite(h8)
    assert abs(h4 - h8) > 1e-6


def test_rejects_non_2d_input():
    rng = _rng(6)
    x = rng.normal(size=(100, 5, 1))
    with pytest.raises((ValueError, AssertionError)):
        compute_differential_entropy(x, k=4)


def test_rejects_empty_input():
    x = np.empty((0, 10))
    with pytest.raises((ValueError, AssertionError)):
        compute_differential_entropy(x, k=4)


def test_rejects_k_too_large():
    rng = _rng(7)
    x = rng.normal(size=(10, 3))
    with pytest.raises((ValueError, AssertionError)):
        compute_differential_entropy(x, k=10)


def test_duplicate_points_does_not_nan():
    """
    Duplicate points can cause zero kNN distances -> log(0).
    Depending on your implementation, this might:
      - raise, or
      - return -inf, or
      - remain finite if distances are clipped.
    We only enforce: it should not be NaN (silent failure).
    """
    rng = _rng(8)
    base = rng.normal(size=(80, 4))
    x = np.vstack([base, base[:20]])

    try:
        h = compute_differential_entropy(x, k=4)
    except (ValueError, AssertionError):
        # Rejecting duplicates outright is an acceptable behavior.
        return

    assert not np.isnan(h)


def test_differential_entropy_1d():
    """
    Hand-computed Kozachenko–Leonenko (kNN) differential entropy test.

    We assume the implementation matches the formula:
        H = psi(n) - psi(k) + log(V_d) + d * mean(log(eps))
    where:
      - eps_i is the distance to the (k+1)-th nearest neighbor in X when using
        NearestNeighbors(n_neighbors=k+1) on X and then taking distances[:, k]
        (i.e., self-distance at index 0, first *other* neighbor at index 1 for k=1).
      - V_d is the volume of the unit ball in R^d.

    Choose a tiny 1D dataset with uniform spacing:
        x = [0, 1, 2], n=3, d=1, k=1

    Step 1) kNN distances (k=1):
        For each point, the nearest *other* neighbor is at distance 1:
            eps = [1, 1, 1]
        Therefore:
            mean(log(eps)) = mean(log(1)) = 0

    Step 2) Unit ball volume term in 1D:
        The "unit ball" in 1D is the interval [-1, 1], so:
            V_1 = 2
            log(V_1) = log(2)

    Step 3) Digamma simplification for integers:
        For integer n:
            psi(n) = -gamma + H_{n-1}
        where H_{m} is the m-th harmonic number.

        psi(3) = -gamma + (1 + 1/2) = -gamma + 3/2
        psi(1) = -gamma
        So:
            psi(3) - psi(1) = 3/2

    Step 4) Combine terms:
        H = (psi(3) - psi(1)) + log(2) + 1 * 0
          = 3/2 + log(2)
          ≈ 1.5 + 0.6931471805599453
          = 2.1931471805599454
    """
    x = np.array([[0.0], [1.0], [2.0]], dtype=np.float64)
    expected = 2.1931471805599454  # 1.5 + ln(2)

    actual = compute_differential_entropy(x, k=1)
    assert actual == pytest.approx(expected, rel=1e-12, abs=1e-12)


# --- tests/src/test_kl_divergence.py ---

import numpy as np
import pytest
from sklearn.neighbors import NearestNeighbors  # NOTE(review): unused in this module — consider removing

from src.utils import compute_kl_divergence as kl_divergence


def _rng(seed=0):
    # Deterministic generator so every test is reproducible.
    return np.random.default_rng(seed)


def test_returns_python_float():
    rng = _rng(0)
    p = rng.normal(size=(100, 16))
    q = rng.normal(size=(120, 16))
    val = kl_divergence(p, q, k=4, eps=1e-10)
    assert isinstance(val, float)
    assert np.isfinite(val)


def test_identical_distributions_near_zero():
    """
    If p and q are the same sample set, KL(P||Q) should be ~0.
    For kNN estimators it won't be exactly 0, so allow a tolerance.
    """
    rng = _rng(1)
    p = rng.normal(size=(300, 8))
    val = kl_divergence(p, p.copy(), k=4, eps=1e-10)
    assert np.isfinite(val)
    assert abs(val) < 0.5


def test_same_distribution_independent_samples_near_zero():
    """
    Two independent draws from the same distribution -> KL should be small.
    """
    rng = _rng(2)
    p = rng.normal(size=(400, 12))
    q = rng.normal(size=(450, 12))
    val = kl_divergence(p, q, k=4, eps=1e-10)
    assert np.isfinite(val)
    assert abs(val) < 0.3


def test_shifted_distribution_positive():
    """
    If q is a shifted version, KL(P||Q) should be > 0 (usually noticeably).
    """
    rng = _rng(3)
    p = rng.normal(size=(500, 10))
    q = rng.normal(size=(500, 10)) + 1.5
    val = kl_divergence(p, q, k=4, eps=1e-10)
    assert np.isfinite(val)
    assert val > 0.2


def test_not_symmetric_in_general():
    """
    KL is not symmetric: KL(P||Q) != KL(Q||P) generally.
    """
    rng = _rng(4)
    p = rng.normal(size=(400, 6))
    q = rng.normal(size=(400, 6)) + 0.8
    pq = kl_divergence(p, q, k=4, eps=1e-10)
    qp = kl_divergence(q, p, k=4, eps=1e-10)
    assert np.isfinite(pq) and np.isfinite(qp)
    assert abs(pq - qp) > 1e-3


def test_permutation_invariance():
    """
    Reordering rows should not change the result.
    """
    rng = _rng(5)
    p = rng.normal(size=(300, 7))
    q = rng.normal(size=(320, 7))
    val1 = kl_divergence(p, q, k=4, eps=1e-10)

    p_perm = p[rng.permutation(p.shape[0])]
    q_perm = q[rng.permutation(q.shape[0])]
    val2 = kl_divergence(p_perm, q_perm, k=4, eps=1e-10)

    assert np.isfinite(val1) and np.isfinite(val2)
    assert abs(val1 - val2) < 1e-10


def test_translation_invariance_if_distance_based():
    """
    If your estimator depends only on pairwise distances (typical for kNN),
    adding the same offset to both sets should not change KL.
    """
    rng = _rng(6)
    p = rng.normal(size=(350, 9))
    q = rng.normal(size=(360, 9))
    offset = rng.normal(size=(1, 9)) * 10.0

    val1 = kl_divergence(p, q, k=4, eps=1e-10)
    val2 = kl_divergence(p + offset, q + offset, k=4, eps=1e-10)

    assert np.isfinite(val1) and np.isfinite(val2)
    assert abs(val1 - val2) < 1e-6


def test_rejects_bad_shapes():
    rng = _rng(7)
    p = rng.normal(size=(100, 8))
    q = rng.normal(size=(100, 9))
    with pytest.raises((ValueError, AssertionError)):
        kl_divergence(p, q, k=4, eps=1e-10)


def test_rejects_non_2d_inputs():
    rng = _rng(8)
    p = rng.normal(size=(100, 8, 1))
    q = rng.normal(size=(120, 8, 1))
    with pytest.raises((ValueError, AssertionError)):
        kl_divergence(p, q, k=4, eps=1e-10)


def test_k_too_large_raises():
    rng = _rng(9)
    p = rng.normal(size=(10, 4))
    q = rng.normal(size=(12, 4))
    with pytest.raises((ValueError, AssertionError)):
        kl_divergence(p, q, k=50, eps=1e-10)


def test_eps_prevents_nan_with_duplicates():
    """
    Duplicate points can create zero distances; eps should prevent log(0)/div-by-0.
    """
    rng = _rng(10)
    base = rng.normal(size=(50, 5))
    p = np.vstack([base, base[:10]])  # duplicates
    q = rng.normal(size=(70, 5))
    val = kl_divergence(p, q, k=4, eps=1e-10)
    assert np.isfinite(val)


def test_kl_value():
    """
    Manual KL computation for:

        P = [[0], [1], [3]]
        Q = [[10], [11], [13]]
        k = 1, d = 1

    rho = [1, 1, 2]
    nu  = [10, 9, 7]

    KL = (1/3) * (ln(10) + ln(9) + ln(3.5)) + ln(3/2)
       ≈ 2.322989
    """
    p = np.array([[0.0], [1.0], [3.0]])
    q = np.array([[10.0], [11.0], [13.0]])

    expected = 2.322989

    actual = kl_divergence(p, q, k=1, eps=1e-12)

    assert actual == pytest.approx(expected, rel=1e-6, abs=1e-6)


# --- tests/src/test_benchmark_novelty.py ---

import pytest

from src.utils import compute_benchmark_novelty


def test_benchmark_novelty_zero_when_current_matches_a_prior_dataset():
    """
    Hand-computed benchmark novelty test.

    Novelty(D_c, D_prev, M) = 1 - SpearmanCorr(v_c, v_hat_c)

    Construct a case where v_c is exactly equal to one of the prior benchmark
    accuracy vectors. Then linear regression can predict perfectly:

        Let V_prev be a single column vector equal to v_c.
        A linear model v_hat = theta * v_prev + b can fit with theta=1, b=0,
        so v_hat = v_c exactly.

    Therefore:
        SpearmanCorr(v_c, v_hat) = 1 (identical values -> identical ranks)
        Novelty = 1 - 1 = 0
    """
    current = {"modelA": 0.90, "modelB": 0.70, "modelC": 0.80}
    prior1 = {"modelA": 0.90, "modelB": 0.70, "modelC": 0.80}  # exactly the same vector

    expected = 0.0
    actual = compute_benchmark_novelty(current, [prior1])

    assert actual == pytest.approx(expected, rel=1e-12, abs=1e-12)


def test_benchmark_novelty_nonzero_hand_computed_case():
    """
    Hand-computed novelty case with non-zero novelty.

    Models: A, B, C, D

    Prior vectors (as "accuracies", just numeric features):
        prior1 x1 = [-1, +1, -1, +1]
        prior2 x2 = [-1, -1, +1, +1]

    Current:
        v_c = [0.1, 0.2, 0.6, 0.3]

    Least-squares with intercept:
        b = mean(v_c) = 0.3
        theta1 = (x1^T v_c) / (x1^T x1) = (-0.2)/4 = -0.05
        theta2 = (x2^T v_c) / (x2^T x2) = (0.6)/4 = 0.15

    Predicted:
        v_hat = b + theta1*x1 + theta2*x2
              = [0.2, 0.1, 0.5, 0.4]

    Spearman ranks:
        rank(v_c)   = [1, 2, 4, 3]
        rank(v_hat) = [2, 1, 4, 3]
        sum d^2 = 2
        rho = 1 - 6*2/(4*(16-1)) = 0.8

    Novelty = 1 - rho = 0.2
    """
    current = {"A": 0.1, "B": 0.2, "C": 0.6, "D": 0.3}

    prior1 = {"A": -1.0, "B": 1.0, "C": -1.0, "D": 1.0}
    prior2 = {"A": -1.0, "B": -1.0, "C": 1.0, "D": 1.0}

    expected = 0.2
    actual = compute_benchmark_novelty(current, [prior1, prior2])

    assert actual == pytest.approx(expected, rel=1e-12, abs=1e-12)


# --- tests/src/test_mmd.py ---

import numpy as np
import pytest

from src.utils import compute_mmd


def test_mmd_linear_hand_computed():
    """
    Hand-computed MMD test for the linear kernel.

    For k(a,b) = a^T b, we have:
        MMD^2 = || mean(x) - mean(y) ||^2

    Choose 1D samples:
        x = [0, 2] -> mean(x) = (0 + 2)/2 = 1
        y = [1, 3] -> mean(y) = (1 + 3)/2 = 2

    Difference in means:
        mean(x) - mean(y) = 1 - 2 = -1

    Therefore:
        MMD^2 = (-1)^2 = 1

    The implementation returns MMD^2 (mean of kernel Gram matrices formula),
    so expected = 1.0.
    """
    x = np.array([[0.0], [2.0]], dtype=np.float64)
    y = np.array([[1.0], [3.0]], dtype=np.float64)

    expected = 1.0

    actual = compute_mmd(x, y, kernel="linear")
    assert actual == pytest.approx(expected, rel=1e-12, abs=1e-12)


# --- tests/src/test_pad.py ---

import numpy as np
import pytest

from src.utils import compute_pad


def test_pad():
    """
    Hand-computed PAD test.

    PAD = 2(1 - 2ε)

    Choose perfectly linearly separable 1D embeddings:

        Synthetic: [-10, -9, -8]
        Real:      [  8,  9, 10]

    These are separable by threshold at 0.

    Classification error ε = 0

    Therefore:
        PAD = 2(1 - 2*0) = 2

    The implementation uses a train/validation split, so the returned value
    may be slightly below 2; we assert it is close to 2 (high separability).
    """
    x_syn = np.array([[-10.0], [-9.0], [-8.0]])
    x_real = np.array([[8.0], [9.0], [10.0]])

    expected = 2.0

    actual = compute_pad(
        x_syn,
        x_real,
        classifier_name="LogisticRegression",
    )

    assert actual == pytest.approx(expected, rel=0.05, abs=0.1)