From 563c8dc45e8599c91861d59437dbf5ab744d61ce Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Mon, 29 Dec 2025 20:28:34 -0500 Subject: [PATCH 01/14] Added quality evaluation metrics --- src/cfg/run_quality_evaluation_cfg.yaml | 15 ++++ src/run_quality_evaluation.py | 115 ++++++++++++++++++++++++ src/utils/__init__.py | 6 ++ src/utils/quality_evaluation_utils.py | 79 ++++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 src/cfg/run_quality_evaluation_cfg.yaml create mode 100644 src/run_quality_evaluation.py create mode 100644 src/utils/quality_evaluation_utils.py diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml new file mode 100644 index 0000000..a00624c --- /dev/null +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -0,0 +1,15 @@ +prompt_cfg: + sys_msg: Compute benchmark quality metrics from existing scores. + +quality_eval_cfg: + # Absolute path to the directory that directly contains per-model score folders. + scores_root_dir: "/projects/aieng/public/ace/artifacts/negin_ace/scores" + scores_subdir: "scores" + +exp_cfg: + exp_id: "quality_evaluation" + +defaults: + - _self_ + + diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py new file mode 100644 index 0000000..a7d13e2 --- /dev/null +++ b/src/run_quality_evaluation.py @@ -0,0 +1,115 @@ +"""Script to compute quality metrics (e.g., benchmark difficulty) from existing scores.""" + +import json +import logging +import os +from typing import Dict, List + +import hydra +from omegaconf import DictConfig + +from src.utils import ( + compute_benchmark_difficulty, + compute_benchmark_separability, +) +from src.utils import constants +from src.utils.data_utils import get_run_id + + +logger = logging.getLogger(__name__) + + +def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: + """Extract the accuracy metric from a single Inspect eval JSON file.""" + try: + with open(json_path, "r", encoding="utf-8") as f: + data = 
json.load(f) + except Exception as exc: # noqa: BLE001 + logger.warning("Failed to read %s: %s", json_path, exc) + return None + + try: + scores = data["results"]["scores"] + if not scores: + return None + metrics = scores[0]["metrics"] + acc = metrics["accuracy"]["value"] + return float(acc) + except (KeyError, TypeError, ValueError) as exc: + logger.warning("Failed to extract accuracy from %s: %s", json_path, exc) + return None + + +@hydra.main(version_base=None, config_path="cfg", config_name="run_quality_cfg") +def main(cfg: DictConfig) -> None: + """ + Compute benchmark-level quality metrics from saved capability scores. + """ + run_id = get_run_id(cfg) + + scores_root_dir = getattr(cfg.quality_eval_cfg, "scores_root_dir", None) + if scores_root_dir: + base_scores_dir = scores_root_dir + else: + base_scores_dir = os.path.join( + constants.BASE_ARTIFACTS_DIR, + cfg.quality_eval_cfg.scores_subdir, + run_id, + ) + logger.info("Using fallback scores directory: %s", base_scores_dir) + + if not os.path.isdir(base_scores_dir): + logger.error( + "Scores directory '%s' does not exist. " + "Please ensure scores are generated for run_id '%s'.", + base_scores_dir, + run_id, + ) + return + + logger.info("Loading model accuracies from %s", base_scores_dir) + + # For each model directory, walk all JSON files and average their accuracies. 
+ model_to_accuracy: Dict[str, float] = {} + for model_name in os.listdir(base_scores_dir): + model_dir = os.path.join(base_scores_dir, model_name) + if not os.path.isdir(model_dir): + continue + + accuracies: List[float] = [] + for root, _dirs, files in os.walk(model_dir): + for fname in files: + if not fname.endswith(".json"): + continue + json_path = os.path.join(root, fname) + acc = _extract_accuracy_from_inspect_json(json_path) + if acc is not None: + accuracies.append(acc) + + if not accuracies: + logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) + continue + + avg_acc = sum(accuracies) / len(accuracies) + model_to_accuracy[model_name] = avg_acc + logger.info( + "Model '%s' mean accuracy over %d JSON files: %.4f", + model_name, + len(accuracies), + avg_acc, + ) + + if not model_to_accuracy: + logger.error("No valid model accuracies found in %s", base_scores_dir) + return + + difficulty = compute_benchmark_difficulty(model_to_accuracy) + separability = compute_benchmark_separability(model_to_accuracy) + logger.info("Benchmark difficulty: %.4f", difficulty) + logger.info("Benchmark separability: %.4f", separability) + + +if __name__ == "__main__": + main() + + diff --git a/src/utils/__init__.py b/src/utils/__init__.py index f313105..00d0f19 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -6,3 +6,9 @@ """ from .data_utils import load_data +from .evaluation_utils import ( + compute_benchmark_difficulty_from_accuracies, + compute_benchmark_difficulty_from_model_scores, + compute_benchmark_separability_from_accuracies, + compute_benchmark_separability_from_model_scores, +) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py new file mode 100644 index 0000000..4d0c6d6 --- /dev/null +++ b/src/utils/quality_evaluation_utils.py @@ -0,0 +1,79 @@ +"""Utility functions for evaluating benchmark-level metrics.""" + +from __future__ import annotations + +from typing import Iterable, 
Mapping, Union + + +def compute_benchmark_difficulty( + accuracies: Union[Iterable[float], Mapping[str, float]], +) -> float: + """ + Compute benchmark difficulty given per-model accuracies. + + The difficulty of a benchmark is defined as: + + DIFFICULTY(D_c, M) = 1 - max_{m in M} acc(LM_m, D_c) + + i.e., one minus the highest accuracy achieved by any model on the benchmark. + + Args: + accuracies: Either an iterable of accuracy values in [0.0, 1.0] for each model, + or a mapping from model name to accuracy in [0.0, 1.0]. + + Returns: + A float in [0.0, 1.0] representing the benchmark difficulty. + + Raises: + ValueError: If no accuracies are provided. + """ + # Handle Mapping by extracting values, otherwise treat as iterable + if isinstance(accuracies, Mapping): + accuracies = accuracies.values() + + accuracies = list(accuracies) + if not accuracies: + raise ValueError("Cannot compute difficulty: no accuracies provided.") + + best_acc = max(accuracies) + # Clamp to [0, 1] in case of tiny numerical issues. + best_acc = max(0.0, min(1.0, best_acc)) + return 1.0 - best_acc + + +def compute_benchmark_separability( + accuracies: Union[Iterable[float], Mapping[str, float]], +) -> float: + """ + Compute benchmark separability given per-model accuracies. + + Separability is defined as the mean absolute deviation of model accuracies + around their mean: + + SEP(D_c, M) = mean(|v_c - mean(v_c)|) + + where ``v_c`` are the accuracies of different models on the same dataset. + + Args: + accuracies: Either an iterable of accuracy values in [0.0, 1.0] for each model, + or a mapping from model name to accuracy in [0.0, 1.0]. + + Returns: + A non-negative float representing separability. + + Raises: + ValueError: If no accuracies are provided. 
+ """ + # Handle Mapping by extracting values, otherwise treat as iterable + if isinstance(accuracies, Mapping): + accuracies = accuracies.values() + + accuracies = list(accuracies) + if not accuracies: + raise ValueError("Cannot compute separability: no accuracies provided.") + + mean_acc = sum(accuracies) / len(accuracies) + abs_devs = [abs(a - mean_acc) for a in accuracies] + return sum(abs_devs) / len(abs_devs) + + From 8361b0eda14f99eea62e05ebc13d2573d0790e0f Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Mon, 5 Jan 2026 10:01:21 -0500 Subject: [PATCH 02/14] Addd consistency & novelty --- src/cfg/run_quality_evaluation_cfg.yaml | 7 +- src/run_quality_evaluation.py | 181 +++++++++++++++++--- src/run_quality_evaluation_README.md | 185 +++++++++++++++++++++ src/utils/__init__.py | 10 +- src/utils/quality_evaluation_utils.py | 211 +++++++++++++++++++++++- 5 files changed, 564 insertions(+), 30 deletions(-) create mode 100644 src/run_quality_evaluation_README.md diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index a00624c..5e92360 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -3,8 +3,13 @@ prompt_cfg: quality_eval_cfg: # Absolute path to the directory that directly contains per-model score folders. - scores_root_dir: "/projects/aieng/public/ace/artifacts/negin_ace/scores" + scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" scores_subdir: "scores" + # List of absolute paths to prior datasets for novelty computation. + # Each path should point to a directory containing per-model score folders (same structure as scores_root_dir). + # Models must be consistent across all datasets. 
+ prior_datasets: + - "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample/math-500" exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index a7d13e2..be4c28c 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -9,7 +9,9 @@ from omegaconf import DictConfig from src.utils import ( + compute_benchmark_consistency, compute_benchmark_difficulty, + compute_benchmark_novelty, compute_benchmark_separability, ) from src.utils import constants @@ -19,6 +21,57 @@ logger = logging.getLogger(__name__) +def _collect_accuracies_from_dir(directory: str) -> List[float]: + """ + Collect all accuracy values from JSON files in a directory (recursively). + + Args: + directory: Directory to walk recursively for JSON files. + + Returns: + List of accuracy values found in the directory. + """ + accuracies: List[float] = [] + for root, _dirs, files in os.walk(directory): + for fname in files: + if not fname.endswith(".json"): + continue + json_path = os.path.join(root, fname) + acc = _extract_accuracy_from_inspect_json(json_path) + if acc is not None: + accuracies.append(acc) + return accuracies + + +def _load_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: + """ + Load model accuracies from a directory structure. + + Args: + base_dir: Directory containing per-model subdirectories with JSON files. + + Returns: + Dictionary mapping model name to average accuracy. 
+ """ + model_to_accuracy: Dict[str, float] = {} + + if not os.path.isdir(base_dir): + logger.warning("Directory does not exist: %s", base_dir) + return model_to_accuracy + + for model_name in os.listdir(base_dir): + model_dir = os.path.join(base_dir, model_name) + if not os.path.isdir(model_dir): + continue + + accuracies = _collect_accuracies_from_dir(model_dir) + if accuracies: + avg_acc = sum(accuracies) / len(accuracies) + model_to_accuracy[model_name] = avg_acc + + return model_to_accuracy + + def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: """Extract the accuracy metric from a single Inspect eval JSON file.""" try: @@ -29,6 +82,11 @@ def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: return None try: + # Check if file has results (successful evaluation) or error (failed evaluation) + if "error" in data or "results" not in data: + # File has error or no results, skip it + return None + scores = data["results"]["scores"] if not scores: return None @@ -40,7 +98,7 @@ def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: return None -@hydra.main(version_base=None, config_path="cfg", config_name="run_quality_cfg") +@hydra.main(version_base=None, config_path="cfg", config_name="run_quality_evaluation_cfg") def main(cfg: DictConfig) -> None: """ Compute benchmark-level quality metrics from saved capability scores. @@ -71,33 +129,75 @@ def main(cfg: DictConfig) -> None: # For each model directory, walk all JSON files and average their accuracies. 
model_to_accuracy: Dict[str, float] = {} + # For consistency: map model to list of accuracies per generation + model_to_generation_accuracies: Dict[str, List[float]] = {} + + # Get prior dataset names to exclude them from current dataset + prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) + prior_dataset_names = set() + for prior_path in prior_datasets: + # Extract the directory name from the path + prior_name = os.path.basename(os.path.normpath(prior_path)) + prior_dataset_names.add(prior_name) + for model_name in os.listdir(base_scores_dir): + # Skip if this is a prior dataset directory + if model_name in prior_dataset_names: + logger.debug("Skipping prior dataset directory: %s", model_name) + continue model_dir = os.path.join(base_scores_dir, model_name) if not os.path.isdir(model_dir): continue - accuracies: List[float] = [] - for root, _dirs, files in os.walk(model_dir): - for fname in files: - if not fname.endswith(".json"): - continue - json_path = os.path.join(root, fname) - acc = _extract_accuracy_from_inspect_json(json_path) - if acc is not None: - accuracies.append(acc) - - if not accuracies: - logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) - continue - - avg_acc = sum(accuracies) / len(accuracies) - model_to_accuracy[model_name] = avg_acc - logger.info( - "Model '%s' mean accuracy over %d JSON files: %.4f", - model_name, - len(accuracies), - avg_acc, - ) + # Check if model_dir contains subdirectories (generations/runs) + subdirs = [ + d for d in os.listdir(model_dir) + if os.path.isdir(os.path.join(model_dir, d)) + ] + + if subdirs: + # Structure: model_dir/generation_dir/...json files + # Each subdirectory represents a different dataset generation + generation_accuracies: List[float] = [] + for gen_dir_name in sorted(subdirs): + gen_dir = os.path.join(model_dir, gen_dir_name) + gen_accuracies = _collect_accuracies_from_dir(gen_dir) + + if gen_accuracies: + avg_gen_acc = sum(gen_accuracies) / 
len(gen_accuracies) + generation_accuracies.append(avg_gen_acc) + logger.debug( + "Model '%s' generation '%s': %.4f (from %d JSON files)", + model_name, gen_dir_name, avg_gen_acc, len(gen_accuracies) + ) + + if generation_accuracies: + model_to_generation_accuracies[model_name] = generation_accuracies + # Overall average across all generations + avg_acc = sum(generation_accuracies) / len(generation_accuracies) + model_to_accuracy[model_name] = avg_acc + logger.info( + "Model '%s' mean accuracy over %d generations: %.4f", + model_name, + len(generation_accuracies), + avg_acc, + ) + else: + # Structure: model_dir/...json files (no generation subdirectories) + accuracies = _collect_accuracies_from_dir(model_dir) + + if not accuracies: + logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) + continue + + avg_acc = sum(accuracies) / len(accuracies) + model_to_accuracy[model_name] = avg_acc + logger.info( + "Model '%s' mean accuracy over %d JSON files: %.4f", + model_name, + len(accuracies), + avg_acc, + ) if not model_to_accuracy: logger.error("No valid model accuracies found in %s", base_scores_dir) @@ -107,6 +207,41 @@ def main(cfg: DictConfig) -> None: separability = compute_benchmark_separability(model_to_accuracy) logger.info("Benchmark difficulty: %.4f", difficulty) logger.info("Benchmark separability: %.4f", separability) + + # Compute consistency if we have multiple generations per model + if model_to_generation_accuracies: + try: + consistency = compute_benchmark_consistency(model_to_generation_accuracies) + logger.info("Benchmark consistency: %.4f", consistency) + except ValueError as e: + logger.warning("Could not compute consistency: %s", e) + + # Compute novelty if prior datasets are provided + prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) + if prior_datasets: + try: + logger.info("Loading prior datasets for novelty computation...") + prior_datasets_accuracies: List[Dict[str, float]] = [] + for prior_dir 
in prior_datasets: + prior_acc = _load_model_accuracies_from_dir(prior_dir) + if prior_acc: + prior_datasets_accuracies.append(prior_acc) + logger.info( + "Loaded prior dataset from %s: %d models", + prior_dir, len(prior_acc) + ) + else: + logger.warning("No accuracies found in prior dataset: %s", prior_dir) + + if prior_datasets_accuracies: + novelty = compute_benchmark_novelty(model_to_accuracy, prior_datasets_accuracies) + logger.info("Benchmark novelty: %.4f", novelty) + else: + logger.warning("No valid prior datasets found, skipping novelty computation.") + except ValueError as e: + logger.warning("Could not compute novelty: %s", e) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing novelty: %s", e) if __name__ == "__main__": diff --git a/src/run_quality_evaluation_README.md b/src/run_quality_evaluation_README.md new file mode 100644 index 0000000..6292531 --- /dev/null +++ b/src/run_quality_evaluation_README.md @@ -0,0 +1,185 @@ +# Quality Evaluation Script + +`run_quality_evaluation.py` computes benchmark-level quality metrics from existing evaluation scores. + +## Overview + +This script analyzes model performance scores to compute several quality metrics: + +- **Difficulty**: Measures how hard the benchmark is (`1 - max(accuracy)`) +- **Separability**: Measures how well the benchmark distinguishes between models (mean absolute deviation of accuracies) +- **Consistency**: Measures stability of model performance across different dataset generations (`1 - mean(std(performance across generations))`) +- **Novelty**: Measures how much new information the dataset reveals compared to prior benchmarks (`1 - rank_correlation(predicted, actual)`) + +## Usage + +```bash +python src/run_quality_evaluation.py +``` + +The script uses Hydra for configuration management. Configuration is specified in `src/cfg/run_quality_evaluation_cfg.yaml`. 
+ +## Configuration + +Edit `src/cfg/run_quality_evaluation_cfg.yaml`: + +```yaml +quality_eval_cfg: + # Absolute path to directory containing per-model score folders + scores_root_dir: "/path/to/scores" + + # Fallback: if scores_root_dir not set, uses: + # {BASE_ARTIFACTS_DIR}/{scores_subdir}/{run_id} + scores_subdir: "scores" + + # Optional: List of prior datasets for novelty computation + prior_datasets: + - "/path/to/prior_dataset1" + - "/path/to/prior_dataset2" +``` + +## Data Structure + +The script expects a root directory containing **per-model subdirectories**. Two structures are supported: + +### Structure 1: With Multiple Generations (for Consistency) + +``` +scores_root_dir/ +├── model1/ +│ ├── generation1/ # First dataset generation +│ │ └── .../*.json files (recursively) +│ ├── generation2/ # Second dataset generation +│ │ └── .../*.json files +│ └── generation3/ +│ └── .../*.json files +├── model2/ +│ └── ... (same structure) +``` + +**Behavior:** +- Computes average accuracy **per generation** for each model +- **Consistency** is computed from generation-to-generation variation +- **Difficulty** and **Separability** use the **average across all generations** + +### Structure 2: Without Generations (Single Dataset) + +``` +scores_root_dir/ +├── model1/ +│ └── .../*.json files (recursively, any nesting allowed) +├── model2/ +│ └── .../*.json files +``` + +**Behavior:** +- Walks all JSON files recursively under each model directory +- Computes average accuracy per model +- **Consistency** is NOT computed (no generations available) +- **Difficulty** and **Separability** are computed from average accuracies + +## JSON File Format + +Each `.json` file must follow the Inspect AI evaluation format: + +```json +{ + "results": { + "scores": [ + { + "metrics": { + "accuracy": { + "value": 0.75 + } + } + } + ] + } +} +``` + +## Metrics + +### Difficulty + +Measures how difficult the benchmark is for models: + +``` +difficulty = 1 - max(accuracy across all 
models) +``` + +- Range: [0, 1] +- Higher values = harder benchmark + +### Separability + +Measures how well the benchmark distinguishes between models: + +``` +separability = mean(|accuracy_i - mean(accuracies)|) +``` + +- Range: [0, 1] +- Higher values = better model discrimination + +### Consistency + +Measures stability of model performance across dataset generations: + +``` +consistency = 1 - (1/n) * Σ std(performance(m_i) across generations) +``` + +- Range: [0, 1] +- Higher values = more stable/consistent performance +- **Only computed** when multiple generations are detected + +### Novelty + +Measures how much new information the dataset reveals compared to prior benchmarks: + +``` +1. Predict current accuracies from prior datasets using linear regression +2. Compute rank correlation between predicted and actual rankings +3. novelty = 1 - rank_correlation +``` + +- Range: [0, 1] +- Higher values = more novel/unpredictable performance patterns +- **Only computed** when `prior_datasets` are specified in config + +## Prior Datasets (for Novelty) + +Prior datasets should have the **same structure** as the main dataset. + +**Important:** Prior dataset directories should be **separate** from the main `scores_root_dir` to avoid being treated as models. 
+ +Example: +``` +data/ +├── scores_sample/ # Main dataset +│ ├── model1/ +│ └── model2/ +└── scores_sample/ + └── math-500/ # Prior dataset (separate directory) + ├── model1/ + └── model2/ +``` + +**Requirements:** +- All prior datasets must have the same set of models as the current dataset +- Models must be consistent across all datasets for novelty computation + +## Output + +The script logs all computed metrics: + +``` +[INFO] Model 'model1' mean accuracy over 3 generations: 0.7500 +[INFO] Model 'model2' mean accuracy over 3 generations: 0.6500 +[INFO] Benchmark difficulty: 0.2500 +[INFO] Benchmark separability: 0.0500 +[INFO] Benchmark consistency: 0.9200 +[INFO] Benchmark novelty: 0.5000 +``` + diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 00d0f19..b7be76a 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -6,9 +6,9 @@ """ from .data_utils import load_data -from .evaluation_utils import ( - compute_benchmark_difficulty_from_accuracies, - compute_benchmark_difficulty_from_model_scores, - compute_benchmark_separability_from_accuracies, - compute_benchmark_separability_from_model_scores, +from .quality_evaluation_utils import ( + compute_benchmark_consistency, + compute_benchmark_difficulty, + compute_benchmark_novelty, + compute_benchmark_separability, ) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index 4d0c6d6..0f5c5e0 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -2,7 +2,11 @@ from __future__ import annotations -from typing import Iterable, Mapping, Union +import statistics +from typing import Iterable, List, Mapping, Union + +import numpy as np +from scipy.stats import spearmanr def compute_benchmark_difficulty( @@ -77,3 +81,208 @@ def compute_benchmark_separability( return sum(abs_devs) / len(abs_devs) +def compute_benchmark_consistency( + model_to_generation_accuracies: Mapping[str, Iterable[float]], +) -> float: + """ + 
Compute benchmark consistency given per-model accuracies across multiple dataset generations. + + Consistency measures how stable model performance is across different dataset generations. + The consistency of a benchmark is defined as: + + CONSISTENCY(D_gen, M) = 1 - 1/n * Σ_{i=1}^n std({performance(m_i) | D_gen,j}_{j=1}^k) + + where: + - n is the number of models + - k is the number of dataset generations + - For each model m_i, we compute the standard deviation of its performance + across k dataset generations + - We average these standard deviations across all models + - We subtract from 1 to get a consistency score (higher is better) + + Args: + model_to_generation_accuracies: A mapping from model name to an iterable of + accuracy values, where each accuracy corresponds to the model's performance + on a different dataset generation. Each model should have the same number + of generations (k). + + Returns: + A float in [0.0, 1.0] representing the benchmark consistency. + Higher values indicate more consistent performance across generations. + + Raises: + ValueError: If no models are provided, or if models have inconsistent + numbers of generations, or if any model has fewer than 2 generations + (std requires at least 2 values). + + Example: + >>> model_to_accs = { + ... "model1": [0.8, 0.82, 0.79], + ... "model2": [0.7, 0.71, 0.69], + ... } + >>> consistency = compute_benchmark_consistency(model_to_accs) + """ + if not model_to_generation_accuracies: + raise ValueError("Cannot compute consistency: no models provided.") + + # Convert to lists and validate + model_accuracies = { + model: list(accuracies) + for model, accuracies in model_to_generation_accuracies.items() + } + + # Check that all models have the same number of generations + num_generations = len(next(iter(model_accuracies.values()))) + if num_generations < 2: + raise ValueError( + f"Cannot compute consistency: need at least 2 generations per model, " + f"but found {num_generations}." 
+ ) + + for model, accuracies in model_accuracies.items(): + if len(accuracies) != num_generations: + raise ValueError( + f"Inconsistent number of generations: model '{model}' has " + f"{len(accuracies)} generations, but expected {num_generations}." + ) + + # Compute standard deviation for each model across generations + model_stds = [] + for model, accuracies in model_accuracies.items(): + if len(accuracies) < 2: + raise ValueError( + f"Model '{model}' has fewer than 2 generations, cannot compute std." + ) + std_dev = statistics.stdev(accuracies) + model_stds.append(std_dev) + + # Average the standard deviations across all models + mean_std = sum(model_stds) / len(model_stds) + + # Consistency = 1 - mean_std + # Clamp to [0, 1] in case of numerical issues + consistency = max(0.0, min(1.0, 1.0 - mean_std)) + return consistency + + +def compute_benchmark_novelty( + current_accuracies: Mapping[str, float], + prior_datasets_accuracies: List[Mapping[str, float]], +) -> float: + """ + Compute benchmark novelty by comparing current dataset performance to prior datasets. + + Novelty measures how much new information a dataset reveals about existing models + over existing benchmarks. The formula is: + + NOVELTY(D_c, D_prev, M) = 1 - RANKCORR(v̂_c, v_c) + + where: + - v_c is the current dataset's accuracy vector (M×1) + - V_prev is the prior datasets' accuracy matrix (M×N) + - v̂_c = V_prev * θ* + b* (predicted from linear regression) + - RANKCORR is the rank correlation (Spearman correlation) + + If the new accuracy vector v_c is spanned by existing accuracy vectors, + RANKCORR(v_c, v̂_c) will be close to 1, resulting in low novelty. + If v_c discovers new patterns in model performance, RANKCORR(v_c, v̂_c) + will be low, resulting in high novelty. + + Args: + current_accuracies: A mapping from model name to accuracy on the current + dataset. This is v_c. 
+ prior_datasets_accuracies: A list of mappings, where each mapping contains + model name to accuracy for a prior dataset. This represents V_prev. + All mappings must contain the same set of models, and these models + must match the models in current_accuracies. + + Returns: + A float in [0.0, 1.0] representing the benchmark novelty. + Higher values indicate more novel/unique performance patterns. + + Raises: + ValueError: If no prior datasets provided, models don't match, or + regression fails (e.g., singular matrix). + + Example: + >>> current = {"model1": 0.8, "model2": 0.6, "model3": 0.7} + >>> prior1 = {"model1": 0.75, "model2": 0.65, "model3": 0.72} + >>> prior2 = {"model1": 0.78, "model2": 0.62, "model3": 0.68} + >>> novelty = compute_benchmark_novelty(current, [prior1, prior2]) + """ + if not prior_datasets_accuracies: + raise ValueError("Cannot compute novelty: no prior datasets provided.") + + # Get sorted model names to ensure consistent ordering + current_models = sorted(current_accuracies.keys()) + if not current_models: + raise ValueError("Cannot compute novelty: current_accuracies is empty.") + + # Validate that all prior datasets have the same models + for i, prior_acc in enumerate(prior_datasets_accuracies): + prior_models = sorted(prior_acc.keys()) + if set(prior_models) != set(current_models): + missing = set(current_models) - set(prior_models) + extra = set(prior_models) - set(current_models) + raise ValueError( + f"Prior dataset {i} has mismatched models. 
" + f"Missing: {missing}, Extra: {extra}" + ) + + # Build matrices: V_prev (M×N) and v_c (M×1) + # M = number of models, N = number of prior datasets + num_models = len(current_models) + num_prior = len(prior_datasets_accuracies) + + # V_prev: each column is a prior dataset's accuracies + V_prev = np.zeros((num_models, num_prior)) + for i, prior_acc in enumerate(prior_datasets_accuracies): + for j, model in enumerate(current_models): + V_prev[j, i] = prior_acc[model] + + # v_c: current dataset's accuracies + v_c = np.array([current_accuracies[model] for model in current_models]) + + # Perform linear regression: v_c = V_prev * θ + b + # We solve: min ||V_prev * θ + b - v_c||² + # To use np.linalg.lstsq, we reformulate as: [V_prev, 1] * [θ; b] = v_c + # where 1 is a column vector of ones (for the intercept b) + + # Augment design matrix with column of ones for intercept + ones = np.ones((num_models, 1)) + X = np.hstack([V_prev, ones]) + + try: + # Solve using least squares: X * params = v_c + # params = [θ; b] + params, residuals, rank, s = np.linalg.lstsq(X, v_c, rcond=None) + except np.linalg.LinAlgError as e: + raise ValueError( + f"Linear regression failed (singular matrix): {e}. " + "This may happen if prior datasets are linearly dependent." 
+ ) from e + + # Extract θ and b + theta = params[:-1] # First N elements + b = params[-1] # Last element (intercept) + + # Compute predicted values: v̂_c = V_prev * θ + b + v_pred = V_prev @ theta + b + + # Compute rank correlation (Spearman correlation) using scipy + try: + rank_corr, _p_value = spearmanr(v_c, v_pred) + except Exception as e: + raise ValueError(f"Rank correlation computation failed: {e}") from e + + # Handle edge cases: if correlation is NaN or invalid, novelty is 1.0 + # (NaN occurs when either array has no variation, meaning we can't predict) + if np.isnan(rank_corr) or not np.isfinite(rank_corr): + return 1.0 + + # Novelty = 1 - rank_correlation + # Clamp to [0, 1] in case of numerical issues (e.g., negative correlation) + novelty = max(0.0, min(1.0, 1.0 - rank_corr)) + return novelty + + From 391f3064d25683bf72a0a6f2410eee9d8a195d2c Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 8 Jan 2026 13:07:09 -0500 Subject: [PATCH 03/14] Added SYNQUE diversity metrics --- src/cfg/run_quality_evaluation_cfg.yaml | 34 +- src/run_quality_evaluation.py | 266 ++++++++++++++- src/utils/__init__.py | 3 + src/utils/diversity_metrics_dataloaders.py | 364 +++++++++++++++++++++ src/utils/quality_evaluation_utils.py | 165 ++++++++++ 5 files changed, 827 insertions(+), 5 deletions(-) create mode 100644 src/utils/diversity_metrics_dataloaders.py diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 5e92360..932b102 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -2,14 +2,40 @@ prompt_cfg: sys_msg: Compute benchmark quality metrics from existing scores. quality_eval_cfg: - # Absolute path to the directory that directly contains per-model score folders. scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" scores_subdir: "scores" - # List of absolute paths to prior datasets for novelty computation. 
- # Each path should point to a directory containing per-model score folders (same structure as scores_root_dir). - # Models must be consistent across all datasets. prior_datasets: - "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample/math-500" + + capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" + + real_data_dir: null + + real_dataloader_config: + type: "huggingface" + dataset_name: "HuggingFaceH4/MATH-500" + split: "test" + subset: null + text_field: "problem" + + # embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers + embedding_backend: "openai" + embedding_model: "text-embedding-3-large" + # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) + embedding_dimensions: 3072 + + diversity_metrics: + - "pad" + - "mmd" + - "mdm" + + pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" + + mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" + mmd_degree: 3 + + mdm_n_clusters: 5 + mdm_metric: "euclidean" exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index be4c28c..da23ad5 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -3,19 +3,33 @@ import json import logging import os -from typing import Dict, List +from typing import Any, Dict, List, Optional import hydra +import numpy as np +import torch from omegaconf import DictConfig +from src.generate_embeddings import EmbeddingGenerator, EmbeddingModelName from src.utils import ( compute_benchmark_consistency, compute_benchmark_difficulty, compute_benchmark_novelty, compute_benchmark_separability, + compute_mdm, + compute_mmd, + compute_pad, ) from src.utils import constants from src.utils.data_utils import get_run_id +from src.utils.diversity_metrics_dataloaders import ( + CapabilityDataloader, + 
HuggingFaceDatasetDataloader, + JSONLDataloader, + CSVDataloader, + DatasetDataloader, + load_texts_from_dataloader, +) logger = logging.getLogger(__name__) @@ -72,6 +86,150 @@ def _load_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: return model_to_accuracy +def _create_dataloader_from_config( + data_path: str, + dataloader_config: Dict[str, Any], +) -> DatasetDataloader: + """Create a dataloader from configuration. + + Args: + data_path: Path to the data + dataloader_config: Configuration dict with 'type' and other fields + + Returns: + DatasetDataloader instance + """ + dataloader_type = dataloader_config.get("type", "capability") + + if dataloader_type == "capability": + return CapabilityDataloader(data_path) + + elif dataloader_type == "huggingface": + from datasets import load_dataset + dataset_name = dataloader_config.get("dataset_name") + split = dataloader_config.get("split", "train") + subset = dataloader_config.get("subset", None) + dataset = load_dataset(dataset_name, name=subset, split=split) + + return HuggingFaceDatasetDataloader( + dataset=dataset, + text_field=dataloader_config.get("text_field", "problem"), + ) + + elif dataloader_type == "jsonl": + return JSONLDataloader( + jsonl_path=data_path, + name_field=dataloader_config.get("name_field", "name"), + description_field=dataloader_config.get("description_field", "description"), + area_field=dataloader_config.get("area_field"), + instructions_field=dataloader_config.get("instructions_field"), + task_field=dataloader_config.get("task_field", "problem"), + ) + + elif dataloader_type == "csv": + return CSVDataloader( + csv_path=data_path, + name_field=dataloader_config.get("name_field", "name"), + description_field=dataloader_config.get("description_field", "description"), + area_field=dataloader_config.get("area_field"), + instructions_field=dataloader_config.get("instructions_field"), + task_field=dataloader_config.get("task_field", "problem"), + ) + + else: + raise 
ValueError(f"Unknown dataloader type: {dataloader_type}")
+
+
+def _load_capabilities_and_generate_embeddings(
+    capabilities_dir: str,
+    embedding_model_name: str,
+    embed_dimensions: int,
+    dataloader_config: Optional[Dict[str, Any]] = None,
+    embedding_backend: str = "openai",
+) -> tuple[np.ndarray, List[Any]]:
+    """
+    Load capabilities from directory and generate embeddings.
+
+    Supports both capability format (default) and custom dataloaders.
+    Always uses the dataloader system for consistency.
+
+    Args:
+        capabilities_dir: Directory containing capability subdirectories OR path to data file
+        embedding_model_name: Name of embedding model to use
+        embed_dimensions: Number of embedding dimensions
+        dataloader_config: Optional configuration for custom dataloader.
+            If None, defaults to capability format.
+
+    Returns:
+        Tuple of (embeddings array, item list -- currently always an empty list)
+    """
+    # Use dataloader system: default to capability format if no config provided
+    if dataloader_config:
+        logger.info("Using custom dataloader: %s", dataloader_config.get("type", "unknown"))
+        dataloader = _create_dataloader_from_config(capabilities_dir, dataloader_config)
+    else:
+        # Default: use capability format dataloader
+        if not os.path.isdir(capabilities_dir):
+            logger.error("capabilities_dir must be a directory when using default capability format: %s", capabilities_dir)
+            return np.array([]), []
+        logger.info("Using capability format dataloader for %s", capabilities_dir)
+        dataloader = CapabilityDataloader(capabilities_dir)
+
+    # Extract texts using the dataloader
+    texts = load_texts_from_dataloader(dataloader)
+
+    if not texts:
+        logger.warning("No texts extracted from %s", capabilities_dir)
+        return np.array([]), []
+
+    logger.info("Extracted %d texts for embedding", len(texts))
+
+    # Generate embeddings
+    logger.info(
+        "Generating embeddings using %s (backend=%s)",
+        embedding_model_name,
+        embedding_backend,
+    )
+    if embedding_backend.lower() == "openai":
+        # Use existing 
OpenAI-based EmbeddingGenerator + embedding_generator = EmbeddingGenerator( + model_name=EmbeddingModelName(embedding_model_name), + embed_dimensions=embed_dimensions, + ) + embeddings = embedding_generator.generate_embeddings(texts) + embeddings_array = np.array([emb.numpy() for emb in embeddings]) + elif embedding_backend.lower() == "huggingface": + # Use HuggingFace encoder models such as gte-Qwen + try: + from sentence_transformers import SentenceTransformer # type: ignore[import] + except Exception as exc: # noqa: BLE001 + logger.error( + "Failed to import sentence_transformers for HuggingFace embeddings: %s", + exc, + ) + return np.array([]), [] + + hf_model = SentenceTransformer(embedding_model_name) + embeddings_array = hf_model.encode( + texts, + show_progress_bar=True, + convert_to_numpy=True, + ) + # Optionally warn if requested dim does not match actual dim + if embed_dimensions and embeddings_array.shape[1] != embed_dimensions: + logger.warning( + "Requested embed_dimensions=%d but HuggingFace model produced %d dims; " + "using model's native dimension.", + embed_dimensions, + embeddings_array.shape[1], + ) + else: + logger.error("Unknown embedding_backend: %s", embedding_backend) + return np.array([]), [] + + return embeddings_array, [] + + def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: """Extract the accuracy metric from a single Inspect eval JSON file.""" try: @@ -182,6 +340,8 @@ def main(cfg: DictConfig) -> None: len(generation_accuracies), avg_acc, ) + # Continue to next model if we processed subdirs + continue else: # Structure: model_dir/...json files (no generation subdirectories) accuracies = _collect_accuracies_from_dir(model_dir) @@ -242,6 +402,110 @@ def main(cfg: DictConfig) -> None: logger.warning("Could not compute novelty: %s", e) except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) + + # Compute diversity metrics if capabilities directory is provided + capabilities_dir = 
getattr(cfg.quality_eval_cfg, "capabilities_dir", None) + if capabilities_dir: + metrics_to_compute = getattr(cfg.quality_eval_cfg, "diversity_metrics", ["pad", "mmd", "mdm"]) + embedding_model = getattr(cfg.quality_eval_cfg, "embedding_model", "text-embedding-3-large") + embedding_backend = getattr(cfg.quality_eval_cfg, "embedding_backend", "openai") + embed_dimensions = getattr(cfg.quality_eval_cfg, "embedding_dimensions", 3072) + + # Get dataloader config if provided + synth_dataloader_config = getattr(cfg.quality_eval_cfg, "synthetic_dataloader_config", None) + if synth_dataloader_config: + synth_dataloader_config = dict(synth_dataloader_config) + + logger.info("Computing diversity metrics for capabilities in %s", capabilities_dir) + + # Load capabilities and generate embeddings + synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( + capabilities_dir=capabilities_dir, + embedding_model_name=embedding_model, + embed_dimensions=embed_dimensions, + dataloader_config=synth_dataloader_config, + embedding_backend=embedding_backend, + ) + + if len(synth_embeddings) == 0: + logger.warning("No embeddings generated, skipping diversity metrics") + else: + # Check if real data directory/file is provided for comparison + real_data_dir = getattr(cfg.quality_eval_cfg, "real_data_dir", None) + real_dataloader_config = getattr(cfg.quality_eval_cfg, "real_dataloader_config", None) + + # Check if we have real data: either a valid path OR a dataloader config (for HuggingFace, etc.) 
+ has_real_data = False + # Case 1: local path (capability/JSONL/CSV formats) + if real_data_dir and (os.path.isdir(real_data_dir) or os.path.isfile(real_data_dir)): + has_real_data = True + # Case 2: HuggingFace dataset via dataloader (real_data_dir may be None) + elif real_dataloader_config and real_dataloader_config.get("type") == "huggingface": + has_real_data = True + + if has_real_data: + # Get real data dataloader config if provided + if real_dataloader_config: + real_dataloader_config = dict(real_dataloader_config) + + if real_data_dir: + logger.info("Loading real data embeddings from %s", real_data_dir) + else: + logger.info("Loading real data embeddings using dataloader config (no local path)") + real_embeddings, _ = _load_capabilities_and_generate_embeddings( + # For HuggingFace, the capabilities_dir is unused; fallback to empty string + capabilities_dir=real_data_dir or "", + embedding_model_name=embedding_model, + embed_dimensions=embed_dimensions, + dataloader_config=real_dataloader_config, + embedding_backend=embedding_backend, + ) + + if len(real_embeddings) > 0: + # Compute metrics that require both synthetic and real data + if "pad" in metrics_to_compute: + try: + pad_score = compute_pad( + synth_embeddings, + real_embeddings, + classifier_name=getattr(cfg.quality_eval_cfg, "pad_classifier", "LogisticRegression"), + ) + logger.info("PAD score: %.4f", pad_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing PAD: %s", e) + + if "mmd" in metrics_to_compute: + try: + mmd_kernel = getattr(cfg.quality_eval_cfg, "mmd_kernel", "polynomial") + mmd_degree = getattr(cfg.quality_eval_cfg, "mmd_degree", 3) + mmd_score = compute_mmd( + synth_embeddings, + real_embeddings, + kernel=mmd_kernel, + degree=mmd_degree, + ) + logger.info("MMD score (%s kernel): %.4f", mmd_kernel, mmd_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing MMD: %s", e) + else: + logger.warning("No real data embeddings generated, 
skipping comparison metrics") + else: + logger.info("No real_data_dir provided, skipping PAD and MMD (require real data)") + + # Compute MDM (can be computed without real data - measures internal diversity) + if "mdm" in metrics_to_compute: + try: + mdm_n_clusters = getattr(cfg.quality_eval_cfg, "mdm_n_clusters", 5) + mdm_metric = getattr(cfg.quality_eval_cfg, "mdm_metric", "euclidean") + mdm_score = compute_mdm( + synth_embeddings, + dummy_placeholder=None, + n_clusters=mdm_n_clusters, + metric=mdm_metric, + ) + logger.info("MDM score (%d clusters, %s metric): %.4f", mdm_n_clusters, mdm_metric, mdm_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing MDM: %s", e) if __name__ == "__main__": diff --git a/src/utils/__init__.py b/src/utils/__init__.py index b7be76a..7889911 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -11,4 +11,7 @@ compute_benchmark_difficulty, compute_benchmark_novelty, compute_benchmark_separability, + compute_mdm, + compute_mmd, + compute_pad, ) diff --git a/src/utils/diversity_metrics_dataloaders.py b/src/utils/diversity_metrics_dataloaders.py new file mode 100644 index 0000000..5720203 --- /dev/null +++ b/src/utils/diversity_metrics_dataloaders.py @@ -0,0 +1,364 @@ +"""Dataloaders for extracting text from different dataset formats for diversity metrics. + +This module provides a flexible interface for loading data from different formats +and extracting the text needed for embedding generation. 
+""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional +import json +import os +import logging + +logger = logging.getLogger(__name__) + + +class DatasetDataloader(ABC): + """Abstract base class for dataloaders that extract text from datasets.""" + + @abstractmethod + def get_name(self, item: Any) -> str: + """Extract the name/title from a dataset item.""" + pass + + @abstractmethod + def get_description(self, item: Any) -> str: + """Extract the description from a dataset item.""" + pass + + def get_area(self, item: Any) -> Optional[str]: + """Extract the area/category from a dataset item (optional).""" + return None + + def get_instructions(self, item: Any) -> Optional[str]: + """Extract instructions from a dataset item (optional).""" + return None + + def get_sample_tasks(self, item: Any, max_samples: int = 5) -> List[str]: + """Extract sample tasks/problems from a dataset item (optional). + + Args: + item: The dataset item + max_samples: Maximum number of sample tasks to return + + Returns: + List of task/problem strings + """ + return [] + + def extract_text(self, item: Any, max_task_samples: int = 5) -> str: + """Extract full text representation from a dataset item. 
+ + Args: + item: The dataset item + max_task_samples: Maximum number of sample tasks to include + + Returns: + Text string suitable for embedding generation + """ + text_parts = [ + f"Name: {self.get_name(item)}", + f"Description: {self.get_description(item)}", + ] + + area = self.get_area(item) + if area: + text_parts.append(f"Area: {area}") + + instructions = self.get_instructions(item) + if instructions: + text_parts.append(f"Instructions: {instructions}") + + tasks = self.get_sample_tasks(item, max_samples=max_task_samples) + if tasks: + task_texts = [f"Task: {task}" for task in tasks] + text_parts.append("Tasks: " + " | ".join(task_texts)) + + return " | ".join(text_parts) + + +class CapabilityDataloader(DatasetDataloader): + """Dataloader for capability format (capability.json structure). + + Can handle either: + - A single capability directory (contains capability.json) + - A parent directory containing multiple capability subdirectories + """ + + def __init__(self, capability_dir: str): + """Initialize with a capability directory. + + Args: + capability_dir: Path to capability directory or parent directory with capability subdirectories + """ + self.capability_dir = capability_dir + self.capabilities = self._load_capabilities() + + def _load_capabilities(self) -> List[Dict[str, Any]]: + """Load capabilities from directory. 
+ + Returns: + List of capability data dictionaries + """ + capabilities = [] + + # Check if this is a single capability directory (has capability.json) + single_cap_json = os.path.join(self.capability_dir, "capability.json") + if os.path.exists(single_cap_json): + with open(single_cap_json, 'r') as f: + capabilities.append(json.load(f)) + return capabilities + + # Otherwise, treat as parent directory with multiple capability subdirectories + if not os.path.isdir(self.capability_dir): + raise FileNotFoundError(f"Capability directory does not exist: {self.capability_dir}") + + for item_name in os.listdir(self.capability_dir): + item_path = os.path.join(self.capability_dir, item_name) + if not os.path.isdir(item_path): + continue + + cap_json = os.path.join(item_path, "capability.json") + if os.path.exists(cap_json): + with open(cap_json, 'r') as f: + capabilities.append(json.load(f)) + + if not capabilities: + raise FileNotFoundError(f"No capabilities found in {self.capability_dir}") + + return capabilities + + def get_name(self, item: Dict[str, Any]) -> str: + return item.get("capability_name", "") + + def get_description(self, item: Dict[str, Any]) -> str: + return item.get("capability_description", "") + + def get_area(self, item: Dict[str, Any]) -> Optional[str]: + return item.get("capability_area") + + def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: + return item.get("capability_instructions") + + def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: + tasks = item.get("capability_data", []) + problems = [] + for task in tasks[:max_samples]: + if isinstance(task, dict): + problem = task.get('problem', '') + if problem: + problems.append(problem) + return problems + + +class HuggingFaceDatasetDataloader(DatasetDataloader): + """Dataloader for HuggingFace datasets. + + Simply extracts text from a specified field in each dataset item. 
+ """ + + def __init__(self, dataset, text_field: str = "problem"): + """Initialize with a HuggingFace dataset. + + Args: + dataset: HuggingFace dataset or iterable of dicts + text_field: Field name containing the text to embed (e.g., "problem", "text", "content") + """ + self.dataset = dataset + self.text_field = text_field + + def get_name(self, item: Dict[str, Any]) -> str: + return "" # Not used in simplified version + + def get_description(self, item: Dict[str, Any]) -> str: + return str(item.get(self.text_field, "")) + + def get_area(self, item: Dict[str, Any]) -> Optional[str]: + return None # Not used in simplified version + + def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: + return None # Not used in simplified version + + def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: + return [] # Not used in simplified version + + def extract_text(self, item: Any, max_task_samples: int = 5) -> str: + """Extract text from the specified field. + + Args: + item: The dataset item + max_task_samples: Ignored (kept for interface compatibility) + + Returns: + Text string from the specified field + """ + if isinstance(item, dict): + return str(item.get(self.text_field, "")) + return str(item) + + +class JSONLDataloader(DatasetDataloader): + """Dataloader for JSONL files (one JSON object per line). + + Flexible loader that can handle various JSONL formats by specifying field mappings. + """ + + def __init__(self, jsonl_path: str, name_field: str = "name", + description_field: str = "description", + area_field: Optional[str] = None, + instructions_field: Optional[str] = None, + task_field: Optional[str] = "problem"): + """Initialize with a JSONL file path. 
+ + Args: + jsonl_path: Path to JSONL file + name_field: Field name for name/title + description_field: Field name for description + area_field: Field name for area/category (optional) + instructions_field: Field name for instructions (optional) + task_field: Field name for tasks/problems (optional) + """ + self.jsonl_path = jsonl_path + self.name_field = name_field + self.description_field = description_field + self.area_field = area_field + self.instructions_field = instructions_field + self.task_field = task_field + + def get_name(self, item: Dict[str, Any]) -> str: + return str(item.get(self.name_field, "")) + + def get_description(self, item: Dict[str, Any]) -> str: + return str(item.get(self.description_field, "")) + + def get_area(self, item: Dict[str, Any]) -> Optional[str]: + if self.area_field: + return item.get(self.area_field) + return None + + def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: + if self.instructions_field: + return item.get(self.instructions_field) + return None + + def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: + if self.task_field and self.task_field in item: + task_value = item[self.task_field] + if isinstance(task_value, str): + return [task_value] + elif isinstance(task_value, list): + return [str(t) for t in task_value[:max_samples] if t] + return [] + + def load_items(self) -> List[Dict[str, Any]]: + """Load all items from the JSONL file.""" + items = [] + with open(self.jsonl_path, 'r') as f: + for line in f: + if line.strip(): + items.append(json.loads(line)) + return items + + +class CSVDataloader(DatasetDataloader): + """Dataloader for CSV files.""" + + def __init__(self, csv_path: str, name_field: str = "name", + description_field: str = "description", + area_field: Optional[str] = None, + instructions_field: Optional[str] = None, + task_field: Optional[str] = "problem"): + """Initialize with a CSV file path. 
+ + Args: + csv_path: Path to CSV file + name_field: Column name for name/title + description_field: Column name for description + area_field: Column name for area/category (optional) + instructions_field: Column name for instructions (optional) + task_field: Column name for tasks/problems (optional) + """ + import pandas as pd + self.df = pd.read_csv(csv_path) + self.name_field = name_field + self.description_field = description_field + self.area_field = area_field + self.instructions_field = instructions_field + self.task_field = task_field + + def get_name(self, item: Dict[str, Any]) -> str: + return str(item.get(self.name_field, "")) + + def get_description(self, item: Dict[str, Any]) -> str: + return str(item.get(self.description_field, "")) + + def get_area(self, item: Dict[str, Any]) -> Optional[str]: + if self.area_field and self.area_field in item: + return item.get(self.area_field) + return None + + def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: + if self.instructions_field and self.instructions_field in item: + return item.get(self.instructions_field) + return None + + def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: + if self.task_field and self.task_field in item: + task_value = item[self.task_field] + if isinstance(task_value, str): + return [task_value] + return [] + + def load_items(self) -> List[Dict[str, Any]]: + """Load all items from the CSV file.""" + return self.df.to_dict('records') + + +def load_texts_from_dataloader(dataloader: DatasetDataloader) -> List[str]: + """Extract texts from a dataloader for embedding generation. 
+ + Args: + dataloader: A DatasetDataloader instance + + Returns: + List of text strings ready for embedding + """ + texts = [] + + if isinstance(dataloader, CapabilityDataloader): + # Capability format: iterate over all capabilities + for capability_data in dataloader.capabilities: + texts.append(dataloader.extract_text(capability_data)) + elif isinstance(dataloader, HuggingFaceDatasetDataloader): + # HuggingFace dataset: iterate over items + for item in dataloader.dataset: + texts.append(dataloader.extract_text(item)) + elif isinstance(dataloader, JSONLDataloader): + # JSONL: load all items + items = dataloader.load_items() + for item in items: + texts.append(dataloader.extract_text(item)) + elif isinstance(dataloader, CSVDataloader): + # CSV: load all items + items = dataloader.load_items() + for item in items: + texts.append(dataloader.extract_text(item)) + else: + # Generic: try to iterate + try: + if hasattr(dataloader, 'dataset'): + for item in dataloader.dataset: + texts.append(dataloader.extract_text(item)) + elif hasattr(dataloader, 'load_items'): + items = dataloader.load_items() + for item in items: + texts.append(dataloader.extract_text(item)) + else: + logger.error("Dataloader does not have dataset or load_items method") + raise ValueError("Dataloader must have dataset attribute or load_items method") + except Exception as e: + logger.error(f"Could not extract texts from dataloader: {e}") + raise + + return texts + diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index 0f5c5e0..7419f7e 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -7,6 +7,19 @@ import numpy as np from scipy.stats import spearmanr +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.model_selection import train_test_split +from sklearn.metrics.pairwise import ( + 
polynomial_kernel, + rbf_kernel, + laplacian_kernel, + linear_kernel, + sigmoid_kernel, +) +import kmedoids +from sklearn.metrics import pairwise_distances def compute_benchmark_difficulty( @@ -286,3 +299,155 @@ def compute_benchmark_novelty( return novelty +# =========================== +# ---- Diversity Metrics (PAD, MMD, MDM) +# =========================== + +def compute_pad( + x_syn_emb: np.ndarray, + x_real_emb: np.ndarray, + classifier_name: str = "LogisticRegression", +) -> float: + """ + Compute the Proxy-A-Distance (PAD) between two sets of embeddings. + + PAD measures the distance between synthetic and real data distributions + by training a classifier to distinguish between them. Lower values indicate + more similar distributions. + + Args: + x_syn_emb: Embeddings of synthetic data, shape (n_samples, n_features) + x_real_emb: Embeddings of real data, shape (n_samples, n_features) + classifier_name: Classifier to use ("LogisticRegression", "RandomForest", "MLP") + + Returns: + float: PAD value (typically in range [0, 2], lower is better) + """ + y_syn_train = np.zeros(len(x_syn_emb)) + y_real_train = np.ones(len(x_real_emb)) + x_train = np.concatenate([x_syn_emb, x_real_emb], axis=0) + y_train = np.concatenate([y_syn_train, y_real_train], axis=0) + + # Split into train/validation + x_train, x_val, y_train, y_val = train_test_split( + x_train, y_train, test_size=0.2, random_state=42 + ) + + # Classifier + if classifier_name == "LogisticRegression": + classifier = LogisticRegression(random_state=42, max_iter=1000) + elif classifier_name == "RandomForest": + classifier = RandomForestClassifier(random_state=42) + elif classifier_name == "MLP": + classifier = MLPClassifier( + hidden_layer_sizes=(128, 64), + activation='relu', + max_iter=200, + random_state=42 + ) + else: + raise ValueError(f"Unknown classifier: {classifier_name}") + + classifier.fit(x_train, y_train) + y_pred_proba = classifier.predict_proba(x_val)[:, 1] + average_loss = 
np.mean(np.abs(y_pred_proba - y_val)) + return 2 * (1 - 2 * average_loss) + + +def compute_mmd( + X: np.ndarray, + Y: np.ndarray, + kernel: str = "polynomial", + degree: int = 3, + gamma: float | None = None, + coef0: float = 1, +) -> float: + """ + Compute the Maximum Mean Discrepancy (MMD) between two samples: X and Y. + + MMD measures the distance between two distributions in a reproducing kernel + Hilbert space. Lower values indicate more similar distributions. + + Args: + X: First sample, shape (n_samples_X, n_features) + Y: Second sample, shape (n_samples_Y, n_features) + kernel: Kernel name ("polynomial", "rbf", "laplacian", "linear", "sigmoid") + degree: Degree for polynomial kernel (default: 3) + gamma: Gamma parameter for kernels (default: None, auto) + coef0: Coef0 for polynomial/sigmoid kernel + + Returns: + float: MMD value (non-negative, lower is better) + """ + kernel = kernel.lower() if isinstance(kernel, str) else kernel + if kernel == "polynomial": + kfunc = polynomial_kernel + XX = kfunc(X, X, degree=degree, gamma=gamma, coef0=coef0) + YY = kfunc(Y, Y, degree=degree, gamma=gamma, coef0=coef0) + XY = kfunc(X, Y, degree=degree, gamma=gamma, coef0=coef0) + elif kernel == "rbf": + kfunc = rbf_kernel + XX = kfunc(X, X, gamma=gamma) + YY = kfunc(Y, Y, gamma=gamma) + XY = kfunc(X, Y, gamma=gamma) + elif kernel == "laplacian": + kfunc = laplacian_kernel + XX = kfunc(X, X, gamma=gamma) + YY = kfunc(Y, Y, gamma=gamma) + XY = kfunc(X, Y, gamma=gamma) + elif kernel == "linear": + kfunc = linear_kernel + XX = kfunc(X, X) + YY = kfunc(Y, Y) + XY = kfunc(X, Y) + elif kernel == "sigmoid": + kfunc = sigmoid_kernel + XX = kfunc(X, X, gamma=gamma, coef0=coef0) + YY = kfunc(Y, Y, gamma=gamma, coef0=coef0) + XY = kfunc(X, Y, gamma=gamma, coef0=coef0) + else: + raise ValueError(f"Unknown kernel: {kernel}") + return np.mean(XX) + np.mean(YY) - 2 * np.mean(XY) + + +def compute_mdm( + embeddings: np.ndarray, + dummy_placeholder: any = None, # noqa: ANN001 + n_clusters: 
int = 5, + metric: str = "euclidean", +) -> float: + """ + Compute the mean distance of points in each cluster to its medoid, then average across clusters. + + MDM measures the internal diversity/coherence of a set of embeddings by clustering + them and computing the average distance to cluster medoids. Lower values indicate + more coherent/diverse clusters. + + Args: + embeddings: Embedding matrix of shape (n_samples, n_features) + dummy_placeholder: Dummy placeholder to match the signature (unused) + n_clusters: Number of clusters/medoids to use + metric: Distance metric for KMedoids ('euclidean', 'cosine', etc.) + + Returns: + float: Mean distance to medoid (averaged over all clusters) + """ + n_samples = len(embeddings) + if n_samples < n_clusters: + n_clusters = max(1, n_samples) + + diss = pairwise_distances(embeddings, metric=metric) + pam_result = kmedoids.fasterpam(diss, n_clusters, random_state=42) + labels = pam_result.labels + medoid_indices = pam_result.medoids + + total_dist = 0.0 + for i, medoid_idx in enumerate(medoid_indices): + cluster_points_idx = np.where(labels == i)[0] + if len(cluster_points_idx) == 0: + continue + dists = diss[cluster_points_idx, medoid_idx] + total_dist += np.mean(dists) + return total_dist / n_clusters + + From 3727c44caa175c140a052c8d3fe829d111d322a6 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 8 Jan 2026 13:39:33 -0500 Subject: [PATCH 04/14] Cleaned code --- src/run_quality_evaluation.py | 1 - src/run_quality_evaluation_README.md | 185 --------------------- src/utils/diversity_metrics_dataloaders.py | 30 +--- src/utils/quality_evaluation_utils.py | 2 - 4 files changed, 7 insertions(+), 211 deletions(-) delete mode 100644 src/run_quality_evaluation_README.md diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index da23ad5..5cc3bc9 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -499,7 +499,6 @@ def main(cfg: DictConfig) -> None: mdm_metric = 
getattr(cfg.quality_eval_cfg, "mdm_metric", "euclidean") mdm_score = compute_mdm( synth_embeddings, - dummy_placeholder=None, n_clusters=mdm_n_clusters, metric=mdm_metric, ) diff --git a/src/run_quality_evaluation_README.md b/src/run_quality_evaluation_README.md deleted file mode 100644 index 6292531..0000000 --- a/src/run_quality_evaluation_README.md +++ /dev/null @@ -1,185 +0,0 @@ -# Quality Evaluation Script - -`run_quality_evaluation.py` computes benchmark-level quality metrics from existing evaluation scores. - -## Overview - -This script analyzes model performance scores to compute several quality metrics: - -- **Difficulty**: Measures how hard the benchmark is (`1 - max(accuracy)`) -- **Separability**: Measures how well the benchmark distinguishes between models (mean absolute deviation of accuracies) -- **Consistency**: Measures stability of model performance across different dataset generations (`1 - mean(std(performance across generations))`) -- **Novelty**: Measures how much new information the dataset reveals compared to prior benchmarks (`1 - rank_correlation(predicted, actual)`) - -## Usage - -```bash -python src/run_quality_evaluation.py -``` - -The script uses Hydra for configuration management. Configuration is specified in `src/cfg/run_quality_evaluation_cfg.yaml`. - -## Configuration - -Edit `src/cfg/run_quality_evaluation_cfg.yaml`: - -```yaml -quality_eval_cfg: - # Absolute path to directory containing per-model score folders - scores_root_dir: "/path/to/scores" - - # Fallback: if scores_root_dir not set, uses: - # {BASE_ARTIFACTS_DIR}/{scores_subdir}/{run_id} - scores_subdir: "scores" - - # Optional: List of prior datasets for novelty computation - prior_datasets: - - "/path/to/prior_dataset1" - - "/path/to/prior_dataset2" -``` - -## Data Structure - -The script expects a root directory containing **per-model subdirectories**. 
Two structures are supported: - -### Structure 1: With Multiple Generations (for Consistency) - -``` -scores_root_dir/ -├── model1/ -│ ├── generation1/ # First dataset generation -│ │ └── .../*.json files (recursively) -│ ├── generation2/ # Second dataset generation -│ │ └── .../*.json files -│ └── generation3/ -│ └── .../*.json files -├── model2/ -│ └── ... (same structure) -``` - -**Behavior:** -- Computes average accuracy **per generation** for each model -- **Consistency** is computed from generation-to-generation variation -- **Difficulty** and **Separability** use the **average across all generations** - -### Structure 2: Without Generations (Single Dataset) - -``` -scores_root_dir/ -├── model1/ -│ └── .../*.json files (recursively, any nesting allowed) -├── model2/ -│ └── .../*.json files -``` - -**Behavior:** -- Walks all JSON files recursively under each model directory -- Computes average accuracy per model -- **Consistency** is NOT computed (no generations available) -- **Difficulty** and **Separability** are computed from average accuracies - -## JSON File Format - -Each `.json` file must follow the Inspect AI evaluation format: - -```json -{ - "results": { - "scores": [ - { - "metrics": { - "accuracy": { - "value": 0.75 - } - } - } - ] - } -} -``` - -## Metrics - -### Difficulty - -Measures how difficult the benchmark is for models: - -``` -difficulty = 1 - max(accuracy across all models) -``` - -- Range: [0, 1] -- Higher values = harder benchmark - -### Separability - -Measures how well the benchmark distinguishes between models: - -``` -separability = mean(|accuracy_i - mean(accuracies)|) -``` - -- Range: [0, 1] -- Higher values = better model discrimination - -### Consistency - -Measures stability of model performance across dataset generations: - -``` -consistency = 1 - (1/n) * Σ std(performance(m_i) across generations) -``` - -- Range: [0, 1] -- Higher values = more stable/consistent performance -- **Only computed** when multiple generations are 
detected - -### Novelty - -Measures how much new information the dataset reveals compared to prior benchmarks: - -``` -1. Predict current accuracies from prior datasets using linear regression -2. Compute rank correlation between predicted and actual rankings -3. novelty = 1 - rank_correlation -``` - -- Range: [0, 1] -- Higher values = more novel/unpredictable performance patterns -- **Only computed** when `prior_datasets` are specified in config - -## Prior Datasets (for Novelty) - -Prior datasets should have the **same structure** as the main dataset. - -**Important:** Prior dataset directories should be **separate** from the main `scores_root_dir` to avoid being treated as models. - -Example: -``` -data/ -├── scores_sample/ # Main dataset -│ ├── model1/ -│ └── model2/ -└── scores_sample/ - └── math-500/ # Prior dataset (separate directory) - ├── model1/ - └── model2/ -``` - -**Requirements:** -- All prior datasets must have the same set of models as the current dataset -- Models must be consistent across all datasets for novelty computation - -## Output - -The script logs all computed metrics: - -``` -[INFO] Model 'model1' mean accuracy over 3 generations: 0.7500 -[INFO] Model 'model2' mean accuracy over 3 generations: 0.6500 -[INFO] Benchmark difficulty: 0.2500 -[INFO] Benchmark separability: 0.0500 -[INFO] Benchmark consistency: 0.9200 -[INFO] Benchmark novelty: 0.5000 -``` - diff --git a/src/utils/diversity_metrics_dataloaders.py b/src/utils/diversity_metrics_dataloaders.py index 5720203..5affc62 100644 --- a/src/utils/diversity_metrics_dataloaders.py +++ b/src/utils/diversity_metrics_dataloaders.py @@ -78,38 +78,23 @@ def extract_text(self, item: Any, max_task_samples: int = 5) -> str: class CapabilityDataloader(DatasetDataloader): - """Dataloader for capability format (capability.json structure). 
- - Can handle either: - - A single capability directory (contains capability.json) - - A parent directory containing multiple capability subdirectories - """ + """Dataloader for capability format (capability.json structure).""" def __init__(self, capability_dir: str): - """Initialize with a capability directory. - - Args: - capability_dir: Path to capability directory or parent directory with capability subdirectories - """ + """Initialize with a capability directory or parent directory.""" self.capability_dir = capability_dir self.capabilities = self._load_capabilities() def _load_capabilities(self) -> List[Dict[str, Any]]: - """Load capabilities from directory. - - Returns: - List of capability data dictionaries - """ + """Load capabilities from directory.""" capabilities = [] - # Check if this is a single capability directory (has capability.json) single_cap_json = os.path.join(self.capability_dir, "capability.json") if os.path.exists(single_cap_json): with open(single_cap_json, 'r') as f: capabilities.append(json.load(f)) return capabilities - # Otherwise, treat as parent directory with multiple capability subdirectories if not os.path.isdir(self.capability_dir): raise FileNotFoundError(f"Capability directory does not exist: {self.capability_dir}") @@ -168,19 +153,19 @@ def __init__(self, dataset, text_field: str = "problem"): self.text_field = text_field def get_name(self, item: Dict[str, Any]) -> str: - return "" # Not used in simplified version + return "" def get_description(self, item: Dict[str, Any]) -> str: return str(item.get(self.text_field, "")) def get_area(self, item: Dict[str, Any]) -> Optional[str]: - return None # Not used in simplified version + return None def get_instructions(self, item: Dict[str, Any]) -> Optional[str]: - return None # Not used in simplified version + return None def get_sample_tasks(self, item: Dict[str, Any], max_samples: int = 5) -> List[str]: - return [] # Not used in simplified version + return [] def extract_text(self, 
item: Any, max_task_samples: int = 5) -> str: """Extract text from the specified field. @@ -326,7 +311,6 @@ def load_texts_from_dataloader(dataloader: DatasetDataloader) -> List[str]: texts = [] if isinstance(dataloader, CapabilityDataloader): - # Capability format: iterate over all capabilities for capability_data in dataloader.capabilities: texts.append(dataloader.extract_text(capability_data)) elif isinstance(dataloader, HuggingFaceDatasetDataloader): diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index 7419f7e..ea7e75c 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -412,7 +412,6 @@ def compute_mmd( def compute_mdm( embeddings: np.ndarray, - dummy_placeholder: any = None, # noqa: ANN001 n_clusters: int = 5, metric: str = "euclidean", ) -> float: @@ -425,7 +424,6 @@ def compute_mdm( Args: embeddings: Embedding matrix of shape (n_samples, n_features) - dummy_placeholder: Dummy placeholder to match the signature (unused) n_clusters: Number of clusters/medoids to use metric: Distance metric for KMedoids ('euclidean', 'cosine', etc.) 
From 5859941f5fe34741e8655d4293500b6acc2b7dba Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 16 Jan 2026 12:58:32 -0500 Subject: [PATCH 05/14] Added InfoSynth metrics --- src/cfg/run_quality_evaluation_cfg.yaml | 18 ++++- src/run_quality_evaluation.py | 66 ++++++++++----- src/utils/__init__.py | 2 + src/utils/quality_evaluation_utils.py | 103 ++++++++++++++++++++++++ 4 files changed, 165 insertions(+), 24 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 932b102..144ca74 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -24,10 +24,16 @@ quality_eval_cfg: # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) embedding_dimensions: 3072 - diversity_metrics: - - "pad" - - "mmd" - - "mdm" + # Internal diversity metrics (only need synthetic data) + internal_diversity_metrics: + - "mdm" # Mean Distance to Medoid - measures internal coherence + - "entropy" # Differential Entropy - measures diversity/uncertainty + + # Comparison metrics (need both synthetic and real data) + comparison_metrics: + - "pad" # Proxy-A-Distance - measures distribution similarity + - "mmd" # Maximum Mean Discrepancy - measures distribution distance + - "kl_divergence" # KL Divergence - measures novelty (how different from real) pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" @@ -36,6 +42,10 @@ quality_eval_cfg: mdm_n_clusters: 5 mdm_metric: "euclidean" + + entropy_k: 4 # Number of nearest neighbors for differential entropy computation + + kl_k: 4 # Number of nearest neighbors for KL divergence computation exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 5cc3bc9..8bab276 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -16,6 +16,8 @@ compute_benchmark_difficulty, compute_benchmark_novelty, 
compute_benchmark_separability, + compute_differential_entropy, + compute_kl_divergence, compute_mdm, compute_mmd, compute_pad, @@ -346,18 +348,18 @@ def main(cfg: DictConfig) -> None: # Structure: model_dir/...json files (no generation subdirectories) accuracies = _collect_accuracies_from_dir(model_dir) - if not accuracies: - logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) - continue + if not accuracies: + logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) + continue - avg_acc = sum(accuracies) / len(accuracies) - model_to_accuracy[model_name] = avg_acc - logger.info( - "Model '%s' mean accuracy over %d JSON files: %.4f", - model_name, - len(accuracies), - avg_acc, - ) + avg_acc = sum(accuracies) / len(accuracies) + model_to_accuracy[model_name] = avg_acc + logger.info( + "Model '%s' mean accuracy over %d JSON files: %.4f", + model_name, + len(accuracies), + avg_acc, + ) if not model_to_accuracy: logger.error("No valid model accuracies found in %s", base_scores_dir) @@ -403,10 +405,11 @@ def main(cfg: DictConfig) -> None: except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) - # Compute diversity metrics if capabilities directory is provided + # Compute embedding-based metrics if capabilities directory is provided capabilities_dir = getattr(cfg.quality_eval_cfg, "capabilities_dir", None) if capabilities_dir: - metrics_to_compute = getattr(cfg.quality_eval_cfg, "diversity_metrics", ["pad", "mmd", "mdm"]) + internal_diversity_metrics = getattr(cfg.quality_eval_cfg, "internal_diversity_metrics", ["mdm", "entropy"]) + comparison_metrics = getattr(cfg.quality_eval_cfg, "comparison_metrics", ["pad", "mmd", "kl_divergence"]) embedding_model = getattr(cfg.quality_eval_cfg, "embedding_model", "text-embedding-3-large") embedding_backend = getattr(cfg.quality_eval_cfg, "embedding_backend", "openai") embed_dimensions = getattr(cfg.quality_eval_cfg, "embedding_dimensions", 3072) 
@@ -416,7 +419,7 @@ def main(cfg: DictConfig) -> None: if synth_dataloader_config: synth_dataloader_config = dict(synth_dataloader_config) - logger.info("Computing diversity metrics for capabilities in %s", capabilities_dir) + logger.info("Computing embedding-based metrics for capabilities in %s", capabilities_dir) # Load capabilities and generate embeddings synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( @@ -462,8 +465,8 @@ def main(cfg: DictConfig) -> None: ) if len(real_embeddings) > 0: - # Compute metrics that require both synthetic and real data - if "pad" in metrics_to_compute: + # Compute comparison metrics that require both synthetic and real data + if "pad" in comparison_metrics: try: pad_score = compute_pad( synth_embeddings, @@ -474,7 +477,7 @@ def main(cfg: DictConfig) -> None: except Exception as e: # noqa: BLE001 logger.warning("Error computing PAD: %s", e) - if "mmd" in metrics_to_compute: + if "mmd" in comparison_metrics: try: mmd_kernel = getattr(cfg.quality_eval_cfg, "mmd_kernel", "polynomial") mmd_degree = getattr(cfg.quality_eval_cfg, "mmd_degree", 3) @@ -487,13 +490,25 @@ def main(cfg: DictConfig) -> None: logger.info("MMD score (%s kernel): %.4f", mmd_kernel, mmd_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing MMD: %s", e) + + if "kl_divergence" in comparison_metrics: + try: + kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) + kl_score = compute_kl_divergence( + synth_embeddings, + real_embeddings, + k=kl_k, + ) + logger.info("KL divergence score (k=%d): %.4f", kl_k, kl_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing KL divergence: %s", e) else: logger.warning("No real data embeddings generated, skipping comparison metrics") else: - logger.info("No real_data_dir provided, skipping PAD and MMD (require real data)") + logger.info("No real_data_dir provided, skipping comparison metrics (require real data)") - # Compute MDM (can be computed without real data 
- measures internal diversity) - if "mdm" in metrics_to_compute: + # Compute internal diversity metrics (only need synthetic data) + if "mdm" in internal_diversity_metrics: try: mdm_n_clusters = getattr(cfg.quality_eval_cfg, "mdm_n_clusters", 5) mdm_metric = getattr(cfg.quality_eval_cfg, "mdm_metric", "euclidean") @@ -505,6 +520,17 @@ def main(cfg: DictConfig) -> None: logger.info("MDM score (%d clusters, %s metric): %.4f", mdm_n_clusters, mdm_metric, mdm_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing MDM: %s", e) + + if "entropy" in internal_diversity_metrics: + try: + entropy_k = getattr(cfg.quality_eval_cfg, "entropy_k", 4) + entropy_score = compute_differential_entropy( + synth_embeddings, + k=entropy_k, + ) + logger.info("Differential entropy score (k=%d): %.4f", entropy_k, entropy_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing differential entropy: %s", e) if __name__ == "__main__": diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 7889911..3c562f3 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -11,6 +11,8 @@ compute_benchmark_difficulty, compute_benchmark_novelty, compute_benchmark_separability, + compute_differential_entropy, + compute_kl_divergence, compute_mdm, compute_mmd, compute_pad, diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index ea7e75c..623e6e2 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -7,6 +7,7 @@ import numpy as np from scipy.stats import spearmanr +from scipy.special import digamma, gammaln from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier @@ -18,6 +19,7 @@ linear_kernel, sigmoid_kernel, ) +from sklearn.neighbors import NearestNeighbors import kmedoids from sklearn.metrics import pairwise_distances @@ -449,3 +451,104 @@ def compute_mdm( return 
total_dist / n_clusters +# =========================== +# ---- Information-Theoretic Metrics (Entropy, KL-Divergence) +# =========================== + +def compute_differential_entropy( + embeddings: np.ndarray, + k: int = 4, +) -> float: + """ + Compute the differential entropy of a set of embeddings using k-nearest neighbors. + + Differential entropy measures the diversity/uncertainty in the embedding distribution. + Higher values indicate more diverse data. + + This implementation uses the k-NN estimator for differential entropy: + H(X) ≈ digamma(N) - digamma(k) + log(volume) + d * mean(log(eps)) + + where: + - N is the number of samples + - d is the embedding dimension + - k is the number of neighbors + - eps is the distance to the k-th nearest neighbor + + Args: + embeddings: Embedding matrix of shape (n_samples, n_features) + k: Number of nearest neighbors to use (default: 4) + + Returns: + float: Differential entropy value (higher is more diverse) + """ + N, d = embeddings.shape + if N < k + 1: + raise ValueError( + f"Cannot compute entropy: need at least {k + 1} samples, but got {N}." + ) + + nbrs = NearestNeighbors(n_neighbors=k + 1).fit(embeddings) + distances, _ = nbrs.kneighbors(embeddings) + eps = distances[:, -1] + eps[eps == 0] = np.nextafter(0, 1) + + log_vol = (d / 2) * np.log(np.pi) - gammaln(d / 2 + 1) + entropy = digamma(N) - digamma(k) + log_vol + d * np.mean(np.log(eps)) + return float(entropy) + + +def compute_kl_divergence( + p_embeddings: np.ndarray, + q_embeddings: np.ndarray, + k: int = 4, + eps: float = 1e-10, +) -> float: + """ + Compute the KL divergence between two sets of embeddings using k-nearest neighbors. + + KL divergence measures how different distribution P is from distribution Q. + Higher values indicate more novelty (P is more different from Q). 
+ + This implementation uses the k-NN estimator for KL divergence: + KL(P||Q) ≈ (d/n) * sum(log(nu/rho)) + log(m/(n-1)) + + where: + - P is the distribution of p_embeddings (n samples) + - Q is the distribution of q_embeddings (m samples) + - d is the embedding dimension + - rho is the distance to the k-th nearest neighbor in P + - nu is the distance to the k-th nearest neighbor in Q + + Args: + p_embeddings: Embeddings of distribution P, shape (n_samples_p, n_features) + q_embeddings: Embeddings of distribution Q, shape (n_samples_q, n_features) + k: Number of nearest neighbors to use (default: 4) + eps: Small epsilon to avoid division by zero (default: 1e-10) + + Returns: + float: KL divergence value (higher is more novel/different) + """ + n, d = p_embeddings.shape + m, _ = q_embeddings.shape + + if n < k + 1: + raise ValueError( + f"Cannot compute KL divergence: P needs at least {k + 1} samples, but got {n}." + ) + if m < k: + raise ValueError( + f"Cannot compute KL divergence: Q needs at least {k} samples, but got {m}." 
+ ) + + # Find k-th nearest neighbor in P for each point in P + nbrs_p = NearestNeighbors(n_neighbors=k + 1).fit(p_embeddings) + rho = np.maximum(nbrs_p.kneighbors(p_embeddings)[0][:, k], eps) + + # Find k-th nearest neighbor in Q for each point in P + nbrs_q = NearestNeighbors(n_neighbors=k).fit(q_embeddings) + nu = np.maximum(nbrs_q.kneighbors(p_embeddings)[0][:, k - 1], eps) + + kl_div = (d / n) * np.sum(np.log(nu / rho)) + np.log(m / (n - 1)) + return float(kl_div) + + From 24c45f81cad9371ff11d992b970074c32254bb5e Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Wed, 28 Jan 2026 14:34:09 -0500 Subject: [PATCH 06/14] To the PR comments --- src/run_quality_evaluation.py | 26 +++++- src/utils/quality_evaluation_utils.py | 113 +++++++++++++++++++++++++- 2 files changed, 134 insertions(+), 5 deletions(-) diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 8bab276..233c86e 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -59,7 +59,7 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: return accuracies -def _load_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: +def _load_avg_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: """ Load model accuracies from a directory structure. 
@@ -385,7 +385,7 @@ def main(cfg: DictConfig) -> None: logger.info("Loading prior datasets for novelty computation...") prior_datasets_accuracies: List[Dict[str, float]] = [] for prior_dir in prior_datasets: - prior_acc = _load_model_accuracies_from_dir(prior_dir) + prior_acc = _load_avg_model_accuracies_from_dir(prior_dir) if prior_acc: prior_datasets_accuracies.append(prior_acc) logger.info( @@ -494,12 +494,21 @@ def main(cfg: DictConfig) -> None: if "kl_divergence" in comparison_metrics: try: kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) + umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) + umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) + umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) + umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") kl_score = compute_kl_divergence( synth_embeddings, real_embeddings, k=kl_k, + umap_n_components=umap_n_components, + umap_n_neighbors=umap_n_neighbors, + umap_min_dist=umap_min_dist, + umap_metric=umap_metric, ) - logger.info("KL divergence score (k=%d): %.4f", kl_k, kl_score) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing KL divergence: %s", e) else: @@ -524,11 +533,20 @@ def main(cfg: DictConfig) -> None: if "entropy" in internal_diversity_metrics: try: entropy_k = getattr(cfg.quality_eval_cfg, "entropy_k", 4) + umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) + umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) + umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) + umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") entropy_score = compute_differential_entropy( synth_embeddings, k=entropy_k, + umap_n_components=umap_n_components, + umap_n_neighbors=umap_n_neighbors, + 
umap_min_dist=umap_min_dist, + umap_metric=umap_metric, ) - logger.info("Differential entropy score (k=%d): %.4f", entropy_k, entropy_score) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + logger.info("Differential entropy score (k=%d)%s: %.4f", entropy_k, umap_info, entropy_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing differential entropy: %s", e) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index 623e6e2..dcebafb 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -3,7 +3,8 @@ from __future__ import annotations import statistics -from typing import Iterable, List, Mapping, Union +import warnings +from typing import Iterable, List, Mapping, Optional, Union import numpy as np from scipy.stats import spearmanr @@ -23,7 +24,16 @@ import kmedoids from sklearn.metrics import pairwise_distances +# Optional UMAP import +try: + from umap import UMAP + UMAP_AVAILABLE = True +except ImportError: + UMAP_AVAILABLE = False + UMAP = None + +# Source paper: AutoBencher - https://arxiv.org/abs/2407.08351 def compute_benchmark_difficulty( accuracies: Union[Iterable[float], Mapping[str, float]], ) -> float: @@ -60,6 +70,7 @@ def compute_benchmark_difficulty( return 1.0 - best_acc +# Source paper: AutoBencher - https://arxiv.org/abs/2407.08351 def compute_benchmark_separability( accuracies: Union[Iterable[float], Mapping[str, float]], ) -> float: @@ -96,6 +107,7 @@ def compute_benchmark_separability( return sum(abs_devs) / len(abs_devs) +# Source paper: Data Swarms - https://arxiv.org/abs/2506.00741 def compute_benchmark_consistency( model_to_generation_accuracies: Mapping[str, Iterable[float]], ) -> float: @@ -180,6 +192,7 @@ def compute_benchmark_consistency( return consistency +# Source paper: AutoBencher - https://arxiv.org/abs/2407.08351 def compute_benchmark_novelty( current_accuracies: Mapping[str, float], 
prior_datasets_accuracies: List[Mapping[str, float]], @@ -305,6 +318,7 @@ def compute_benchmark_novelty( # ---- Diversity Metrics (PAD, MMD, MDM) # =========================== +# Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_pad( x_syn_emb: np.ndarray, x_real_emb: np.ndarray, @@ -356,6 +370,7 @@ def compute_pad( return 2 * (1 - 2 * average_loss) +# Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_mmd( X: np.ndarray, Y: np.ndarray, @@ -412,6 +427,7 @@ def compute_mmd( return np.mean(XX) + np.mean(YY) - 2 * np.mean(XY) +# Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_mdm( embeddings: np.ndarray, n_clusters: int = 5, @@ -455,9 +471,66 @@ def compute_mdm( # ---- Information-Theoretic Metrics (Entropy, KL-Divergence) # =========================== +def _apply_umap_reduction( + embeddings: np.ndarray, + n_components: Optional[int] = None, + n_neighbors: int = 15, + min_dist: float = 0.1, + metric: str = "cosine", +) -> np.ndarray: + """ + Optionally apply UMAP dimensionality reduction to embeddings. + + Args: + embeddings: Embedding matrix of shape (n_samples, n_features) + n_components: Target dimension. If None, returns original embeddings. + n_neighbors: Number of neighbors for UMAP (default: 15) + min_dist: Minimum distance for UMAP (default: 0.1) + metric: Distance metric for UMAP (default: "cosine") + + Returns: + Reduced embeddings if n_components is provided, otherwise original embeddings + """ + if n_components is None: + return embeddings + + if not UMAP_AVAILABLE: + raise ImportError( + "UMAP is required for dimensionality reduction. 
" + "Install it with: pip install umap-learn" + ) + + if embeddings.shape[1] <= n_components: + # Already at or below target dimension + return embeddings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + umap_model = UMAP( + n_neighbors=n_neighbors, + min_dist=min_dist, + n_components=n_components, + metric=metric, + random_state=42, + ) + reduced_embeddings = umap_model.fit_transform(embeddings) + + # Renormalize (like InfoSynth does) + norms = np.linalg.norm(reduced_embeddings, axis=1, keepdims=True) + eps = 1e-12 + reduced_embeddings = reduced_embeddings / (norms + eps) + + return reduced_embeddings + + +# Source paper: InfoSyth - https://arxiv.org/abs/2601.00575 def compute_differential_entropy( embeddings: np.ndarray, k: int = 4, + umap_n_components: Optional[int] = None, + umap_n_neighbors: int = 15, + umap_min_dist: float = 0.1, + umap_metric: str = "cosine", ) -> float: """ Compute the differential entropy of a set of embeddings using k-nearest neighbors. @@ -477,10 +550,24 @@ def compute_differential_entropy( Args: embeddings: Embedding matrix of shape (n_samples, n_features) k: Number of nearest neighbors to use (default: 4) + umap_n_components: Optional UMAP target dimension. If None, uses original embeddings. 
+ umap_n_neighbors: Number of neighbors for UMAP (default: 15) + umap_min_dist: Minimum distance for UMAP (default: 0.1) + umap_metric: Distance metric for UMAP (default: "cosine") Returns: float: Differential entropy value (higher is more diverse) """ + # Apply UMAP reduction if requested + if umap_n_components is not None: + embeddings = _apply_umap_reduction( + embeddings, + n_components=umap_n_components, + n_neighbors=umap_n_neighbors, + min_dist=umap_min_dist, + metric=umap_metric, + ) + N, d = embeddings.shape if N < k + 1: raise ValueError( @@ -497,11 +584,16 @@ def compute_differential_entropy( return float(entropy) +# Source paper: InfoSynth - https://arxiv.org/abs/2601.00575 def compute_kl_divergence( p_embeddings: np.ndarray, q_embeddings: np.ndarray, k: int = 4, eps: float = 1e-10, + umap_n_components: Optional[int] = None, + umap_n_neighbors: int = 15, + umap_min_dist: float = 0.1, + umap_metric: str = "cosine", ) -> float: """ Compute the KL divergence between two sets of embeddings using k-nearest neighbors. @@ -524,10 +616,29 @@ def compute_kl_divergence( q_embeddings: Embeddings of distribution Q, shape (n_samples_q, n_features) k: Number of nearest neighbors to use (default: 4) eps: Small epsilon to avoid division by zero (default: 1e-10) + umap_n_components: Optional UMAP target dimension. If None, uses original embeddings. 
+ umap_n_neighbors: Number of neighbors for UMAP (default: 15) + umap_min_dist: Minimum distance for UMAP (default: 0.1) + umap_metric: Distance metric for UMAP (default: "cosine") Returns: float: KL divergence value (higher is more novel/different) """ + # Apply UMAP reduction if requested (apply to both embeddings together for consistency) + if umap_n_components is not None: + # Stack embeddings, apply UMAP, then split back + # This ensures both distributions are reduced in the same space + combined_embeddings = np.vstack([p_embeddings, q_embeddings]) + reduced_combined = _apply_umap_reduction( + combined_embeddings, + n_components=umap_n_components, + n_neighbors=umap_n_neighbors, + min_dist=umap_min_dist, + metric=umap_metric, + ) + p_embeddings = reduced_combined[:len(p_embeddings)] + q_embeddings = reduced_combined[len(p_embeddings):] + n, d = p_embeddings.shape m, _ = q_embeddings.shape From 9cec4f88c1aa099574d18a4be2976c7c9eeffb4b Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 30 Jan 2026 08:40:55 -0500 Subject: [PATCH 07/14] Updated UMAP --- src/run_quality_evaluation.py | 184 ++++++++++++++------------ src/utils/__init__.py | 1 + src/utils/quality_evaluation_utils.py | 130 ++++++------------ 3 files changed, 146 insertions(+), 169 deletions(-) diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 233c86e..3be6a25 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -21,6 +21,7 @@ compute_mdm, compute_mmd, compute_pad, + fit_umap_shared, ) from src.utils import constants from src.utils.data_utils import get_run_id @@ -40,10 +41,10 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: """ Collect all accuracy values from JSON files in a directory (recursively). - + Args: directory: Directory to walk recursively for JSON files. - + Returns: List of accuracy values found in the directory. 
""" @@ -62,29 +63,29 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: def _load_avg_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: """ Load model accuracies from a directory structure. - + Args: base_dir: Directory containing per-model subdirectories with JSON files. - + Returns: Dictionary mapping model name to average accuracy. """ model_to_accuracy: Dict[str, float] = {} - + if not os.path.isdir(base_dir): logger.warning("Directory does not exist: %s", base_dir) return model_to_accuracy - + for model_name in os.listdir(base_dir): model_dir = os.path.join(base_dir, model_name) if not os.path.isdir(model_dir): continue - + accuracies = _collect_accuracies_from_dir(model_dir) if accuracies: avg_acc = sum(accuracies) / len(accuracies) model_to_accuracy[model_name] = avg_acc - + return model_to_accuracy @@ -93,31 +94,31 @@ def _create_dataloader_from_config( dataloader_config: Dict[str, Any], ) -> DatasetDataloader: """Create a dataloader from configuration. 
- + Args: data_path: Path to the data dataloader_config: Configuration dict with 'type' and other fields - + Returns: DatasetDataloader instance """ dataloader_type = dataloader_config.get("type", "capability") - + if dataloader_type == "capability": return CapabilityDataloader(data_path) - + elif dataloader_type == "huggingface": from datasets import load_dataset dataset_name = dataloader_config.get("dataset_name") split = dataloader_config.get("split", "train") subset = dataloader_config.get("subset", None) dataset = load_dataset(dataset_name, name=subset, split=split) - + return HuggingFaceDatasetDataloader( dataset=dataset, text_field=dataloader_config.get("text_field", "problem"), ) - + elif dataloader_type == "jsonl": return JSONLDataloader( jsonl_path=data_path, @@ -127,7 +128,7 @@ def _create_dataloader_from_config( instructions_field=dataloader_config.get("instructions_field"), task_field=dataloader_config.get("task_field", "problem"), ) - + elif dataloader_type == "csv": return CSVDataloader( csv_path=data_path, @@ -137,7 +138,7 @@ def _create_dataloader_from_config( instructions_field=dataloader_config.get("instructions_field"), task_field=dataloader_config.get("task_field", "problem"), ) - + else: raise ValueError(f"Unknown dataloader type: {dataloader_type}") @@ -148,22 +149,22 @@ def _load_capabilities_and_generate_embeddings( embed_dimensions: int, dataloader_config: Optional[Dict[str, Any]] = None, embedding_backend: str = "openai", -) -> tuple[np.ndarray, List[Any]]: +) -> tuple[np.ndarray, List[str]]: """ Load capabilities from directory and generate embeddings. - + Supports both capability format (default) and custom dataloaders. Always uses the dataloader system for consistency. - + Args: capabilities_dir: Directory containing capability subdirectories OR path to data file embedding_model_name: Name of embedding model to use embed_dimensions: Number of embedding dimensions dataloader_config: Optional configuration for custom dataloader. 
If None, defaults to capability format. - + Returns: - Tuple of (embeddings array, list of items/capabilities) + Tuple of (embeddings array, list of extracted texts) """ # Use dataloader system: default to capability format if no config provided if dataloader_config: @@ -176,16 +177,16 @@ def _load_capabilities_and_generate_embeddings( return np.array([]), [] logger.info("Using capability format dataloader for %s", capabilities_dir) dataloader = CapabilityDataloader(capabilities_dir) - + # Extract texts using the dataloader texts = load_texts_from_dataloader(dataloader) - + if not texts: logger.warning("No texts extracted from %s", capabilities_dir) return np.array([]), [] - + logger.info("Extracted %d texts for embedding", len(texts)) - + # Generate embeddings logger.info( "Generating embeddings using %s (backend=%s)", @@ -210,7 +211,7 @@ def _load_capabilities_and_generate_embeddings( exc, ) return np.array([]), [] - + hf_model = SentenceTransformer(embedding_model_name) embeddings_array = hf_model.encode( texts, @@ -228,8 +229,8 @@ def _load_capabilities_and_generate_embeddings( else: logger.error("Unknown embedding_backend: %s", embedding_backend) return np.array([]), [] - - return embeddings_array, [] + + return embeddings_array, texts def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: @@ -246,7 +247,7 @@ def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: if "error" in data or "results" not in data: # File has error or no results, skip it return None - + scores = data["results"]["scores"] if not scores: return None @@ -291,7 +292,7 @@ def main(cfg: DictConfig) -> None: model_to_accuracy: Dict[str, float] = {} # For consistency: map model to list of accuracies per generation model_to_generation_accuracies: Dict[str, List[float]] = {} - + # Get prior dataset names to exclude them from current dataset prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) prior_dataset_names = set() @@ -299,7 +300,7 @@ def 
main(cfg: DictConfig) -> None: # Extract the directory name from the path prior_name = os.path.basename(os.path.normpath(prior_path)) prior_dataset_names.add(prior_name) - + for model_name in os.listdir(base_scores_dir): # Skip if this is a prior dataset directory if model_name in prior_dataset_names: @@ -314,7 +315,7 @@ def main(cfg: DictConfig) -> None: d for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) ] - + if subdirs: # Structure: model_dir/generation_dir/...json files # Each subdirectory represents a different dataset generation @@ -322,7 +323,7 @@ def main(cfg: DictConfig) -> None: for gen_dir_name in sorted(subdirs): gen_dir = os.path.join(model_dir, gen_dir_name) gen_accuracies = _collect_accuracies_from_dir(gen_dir) - + if gen_accuracies: avg_gen_acc = sum(gen_accuracies) / len(gen_accuracies) generation_accuracies.append(avg_gen_acc) @@ -330,7 +331,7 @@ def main(cfg: DictConfig) -> None: "Model '%s' generation '%s': %.4f (from %d JSON files)", model_name, gen_dir_name, avg_gen_acc, len(gen_accuracies) ) - + if generation_accuracies: model_to_generation_accuracies[model_name] = generation_accuracies # Overall average across all generations @@ -369,7 +370,7 @@ def main(cfg: DictConfig) -> None: separability = compute_benchmark_separability(model_to_accuracy) logger.info("Benchmark difficulty: %.4f", difficulty) logger.info("Benchmark separability: %.4f", separability) - + # Compute consistency if we have multiple generations per model if model_to_generation_accuracies: try: @@ -377,7 +378,7 @@ def main(cfg: DictConfig) -> None: logger.info("Benchmark consistency: %.4f", consistency) except ValueError as e: logger.warning("Could not compute consistency: %s", e) - + # Compute novelty if prior datasets are provided prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) if prior_datasets: @@ -394,7 +395,7 @@ def main(cfg: DictConfig) -> None: ) else: logger.warning("No accuracies found in prior dataset: %s", prior_dir) 
- + if prior_datasets_accuracies: novelty = compute_benchmark_novelty(model_to_accuracy, prior_datasets_accuracies) logger.info("Benchmark novelty: %.4f", novelty) @@ -404,7 +405,7 @@ def main(cfg: DictConfig) -> None: logger.warning("Could not compute novelty: %s", e) except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) - + # Compute embedding-based metrics if capabilities directory is provided capabilities_dir = getattr(cfg.quality_eval_cfg, "capabilities_dir", None) if capabilities_dir: @@ -413,14 +414,14 @@ def main(cfg: DictConfig) -> None: embedding_model = getattr(cfg.quality_eval_cfg, "embedding_model", "text-embedding-3-large") embedding_backend = getattr(cfg.quality_eval_cfg, "embedding_backend", "openai") embed_dimensions = getattr(cfg.quality_eval_cfg, "embedding_dimensions", 3072) - + # Get dataloader config if provided synth_dataloader_config = getattr(cfg.quality_eval_cfg, "synthetic_dataloader_config", None) if synth_dataloader_config: synth_dataloader_config = dict(synth_dataloader_config) - + logger.info("Computing embedding-based metrics for capabilities in %s", capabilities_dir) - + # Load capabilities and generate embeddings synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( capabilities_dir=capabilities_dir, @@ -429,14 +430,15 @@ def main(cfg: DictConfig) -> None: dataloader_config=synth_dataloader_config, embedding_backend=embedding_backend, ) - + if len(synth_embeddings) == 0: logger.warning("No embeddings generated, skipping diversity metrics") else: + real_embeddings = None # Check if real data directory/file is provided for comparison real_data_dir = getattr(cfg.quality_eval_cfg, "real_data_dir", None) real_dataloader_config = getattr(cfg.quality_eval_cfg, "real_dataloader_config", None) - + # Check if we have real data: either a valid path OR a dataloader config (for HuggingFace, etc.) 
has_real_data = False # Case 1: local path (capability/JSONL/CSV formats) @@ -445,12 +447,12 @@ def main(cfg: DictConfig) -> None: # Case 2: HuggingFace dataset via dataloader (real_data_dir may be None) elif real_dataloader_config and real_dataloader_config.get("type") == "huggingface": has_real_data = True - + if has_real_data: # Get real data dataloader config if provided if real_dataloader_config: real_dataloader_config = dict(real_dataloader_config) - + if real_data_dir: logger.info("Loading real data embeddings from %s", real_data_dir) else: @@ -463,7 +465,7 @@ def main(cfg: DictConfig) -> None: dataloader_config=real_dataloader_config, embedding_backend=embedding_backend, ) - + if len(real_embeddings) > 0: # Compute comparison metrics that require both synthetic and real data if "pad" in comparison_metrics: @@ -476,7 +478,7 @@ def main(cfg: DictConfig) -> None: logger.info("PAD score: %.4f", pad_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing PAD: %s", e) - + if "mmd" in comparison_metrics: try: mmd_kernel = getattr(cfg.quality_eval_cfg, "mmd_kernel", "polynomial") @@ -490,32 +492,60 @@ def main(cfg: DictConfig) -> None: logger.info("MMD score (%s kernel): %.4f", mmd_kernel, mmd_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing MMD: %s", e) - - if "kl_divergence" in comparison_metrics: - try: - kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) - umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) - umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) - umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) - umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") - kl_score = compute_kl_divergence( - synth_embeddings, - real_embeddings, - k=kl_k, - umap_n_components=umap_n_components, - umap_n_neighbors=umap_n_neighbors, - umap_min_dist=umap_min_dist, - umap_metric=umap_metric, - ) - umap_info = f" (UMAP: {umap_n_components}D)" if 
umap_n_components else "" - logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing KL divergence: %s", e) else: logger.warning("No real data embeddings generated, skipping comparison metrics") else: logger.info("No real_data_dir provided, skipping comparison metrics (require real data)") - + + # Joint UMAP (InfoSynth-style): fit on all datasets so entropy/KL are in a shared space + umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) + umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) + umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) + umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") + need_joint_umap = ( + umap_n_components is not None + and ( + "entropy" in internal_diversity_metrics + or ( + "kl_divergence" in comparison_metrics + and real_embeddings is not None + and len(real_embeddings) > 0 + ) + ) + ) + synth_reduced = None + real_reduced = None + if need_joint_umap: + all_emb = [synth_embeddings] + if real_embeddings is not None and len(real_embeddings) > 0: + all_emb.append(real_embeddings) + reduced_list = fit_umap_shared( + all_emb, + umap_n_components, + n_neighbors=umap_n_neighbors, + min_dist=umap_min_dist, + metric=umap_metric, + ) + synth_reduced = reduced_list[0] + if len(reduced_list) > 1: + real_reduced = reduced_list[1] + + # KL divergence (uses joint UMAP when need_joint_umap so synth and real share a space) + if ( + "kl_divergence" in comparison_metrics + and real_embeddings is not None + and len(real_embeddings) > 0 + ): + try: + kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) + kl_synth = synth_reduced if real_reduced is not None else synth_embeddings + kl_real = real_reduced if real_reduced is not None else real_embeddings + kl_score = compute_kl_divergence(kl_synth, kl_real, k=kl_k) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + 
logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing KL divergence: %s", e) + # Compute internal diversity metrics (only need synthetic data) if "mdm" in internal_diversity_metrics: try: @@ -529,22 +559,12 @@ def main(cfg: DictConfig) -> None: logger.info("MDM score (%d clusters, %s metric): %.4f", mdm_n_clusters, mdm_metric, mdm_score) except Exception as e: # noqa: BLE001 logger.warning("Error computing MDM: %s", e) - + if "entropy" in internal_diversity_metrics: try: entropy_k = getattr(cfg.quality_eval_cfg, "entropy_k", 4) - umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) - umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) - umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) - umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") - entropy_score = compute_differential_entropy( - synth_embeddings, - k=entropy_k, - umap_n_components=umap_n_components, - umap_n_neighbors=umap_n_neighbors, - umap_min_dist=umap_min_dist, - umap_metric=umap_metric, - ) + entropy_emb = synth_reduced if synth_reduced is not None else synth_embeddings + entropy_score = compute_differential_entropy(entropy_emb, k=entropy_k) umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" logger.info("Differential entropy score (k=%d)%s: %.4f", entropy_k, umap_info, entropy_score) except Exception as e: # noqa: BLE001 diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 3c562f3..02d2cef 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -16,4 +16,5 @@ compute_mdm, compute_mmd, compute_pad, + fit_umap_shared, ) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index dcebafb..e31456a 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -471,39 +471,40 @@ def compute_mdm( # ---- 
Information-Theoretic Metrics (Entropy, KL-Divergence) # =========================== -def _apply_umap_reduction( - embeddings: np.ndarray, - n_components: Optional[int] = None, +def fit_umap_shared( + embeddings_list: List[np.ndarray], + n_components: int, n_neighbors: int = 15, min_dist: float = 0.1, metric: str = "cosine", -) -> np.ndarray: +) -> List[np.ndarray]: """ - Optionally apply UMAP dimensionality reduction to embeddings. - + Fit UMAP on the concatenation of all embedding arrays, then split back (InfoSynth-style). + + This ensures entropy and KL divergence are comparable across datasets by using + a single shared low-dimensional space. + Args: - embeddings: Embedding matrix of shape (n_samples, n_features) - n_components: Target dimension. If None, returns original embeddings. - n_neighbors: Number of neighbors for UMAP (default: 15) - min_dist: Minimum distance for UMAP (default: 0.1) - metric: Distance metric for UMAP (default: "cosine") - + embeddings_list: List of embedding matrices, each shape (n_i, n_features). + n_components: UMAP target dimension. + n_neighbors: Number of neighbors for UMAP (default: 15). + min_dist: Minimum distance for UMAP (default: 0.1). + metric: Distance metric for UMAP (default: "cosine"). + Returns: - Reduced embeddings if n_components is provided, otherwise original embeddings + List of reduced embedding arrays in the same order as embeddings_list. """ - if n_components is None: - return embeddings - if not UMAP_AVAILABLE: raise ImportError( - "UMAP is required for dimensionality reduction. " - "Install it with: pip install umap-learn" + "UMAP is required. 
Install it with: pip install umap-learn" ) - - if embeddings.shape[1] <= n_components: - # Already at or below target dimension - return embeddings - + if not embeddings_list: + return [] + counts = [emb.shape[0] for emb in embeddings_list] + split_indices = np.cumsum(counts)[:-1] + combined = np.vstack(embeddings_list) + if combined.shape[1] <= n_components: + return [emb.copy() for emb in embeddings_list] with warnings.catch_warnings(): warnings.simplefilter("ignore") umap_model = UMAP( @@ -513,61 +514,38 @@ def _apply_umap_reduction( metric=metric, random_state=42, ) - reduced_embeddings = umap_model.fit_transform(embeddings) - - # Renormalize (like InfoSynth does) - norms = np.linalg.norm(reduced_embeddings, axis=1, keepdims=True) + reduced = umap_model.fit_transform(combined) + norms = np.linalg.norm(reduced, axis=1, keepdims=True) eps = 1e-12 - reduced_embeddings = reduced_embeddings / (norms + eps) - - return reduced_embeddings + reduced = reduced / (norms + eps) + return np.split(reduced, split_indices, axis=0) # Source paper: InfoSyth - https://arxiv.org/abs/2601.00575 -def compute_differential_entropy( - embeddings: np.ndarray, - k: int = 4, - umap_n_components: Optional[int] = None, - umap_n_neighbors: int = 15, - umap_min_dist: float = 0.1, - umap_metric: str = "cosine", -) -> float: +def compute_differential_entropy(embeddings: np.ndarray, k: int = 4) -> float: """ Compute the differential entropy of a set of embeddings using k-nearest neighbors. - + Differential entropy measures the diversity/uncertainty in the embedding distribution. - Higher values indicate more diverse data. - + Higher values indicate more diverse data. For a shared space across datasets, apply + UMAP (e.g. fit_umap_shared) to embeddings before calling this function. 
+ This implementation uses the k-NN estimator for differential entropy: H(X) ≈ digamma(N) - digamma(k) + log(volume) + d * mean(log(eps)) - + where: - N is the number of samples - d is the embedding dimension - k is the number of neighbors - eps is the distance to the k-th nearest neighbor - + Args: embeddings: Embedding matrix of shape (n_samples, n_features) k: Number of nearest neighbors to use (default: 4) - umap_n_components: Optional UMAP target dimension. If None, uses original embeddings. - umap_n_neighbors: Number of neighbors for UMAP (default: 15) - umap_min_dist: Minimum distance for UMAP (default: 0.1) - umap_metric: Distance metric for UMAP (default: "cosine") - + Returns: float: Differential entropy value (higher is more diverse) """ - # Apply UMAP reduction if requested - if umap_n_components is not None: - embeddings = _apply_umap_reduction( - embeddings, - n_components=umap_n_components, - n_neighbors=umap_n_neighbors, - min_dist=umap_min_dist, - metric=umap_metric, - ) - N, d = embeddings.shape if N < k + 1: raise ValueError( @@ -590,55 +568,33 @@ def compute_kl_divergence( q_embeddings: np.ndarray, k: int = 4, eps: float = 1e-10, - umap_n_components: Optional[int] = None, - umap_n_neighbors: int = 15, - umap_min_dist: float = 0.1, - umap_metric: str = "cosine", ) -> float: """ Compute the KL divergence between two sets of embeddings using k-nearest neighbors. - + KL divergence measures how different distribution P is from distribution Q. - Higher values indicate more novelty (P is more different from Q). - + Higher values indicate more novelty (P is more different from Q). For a shared + space, apply UMAP (e.g. fit_umap_shared) to [P, Q] before calling this function. 
+ This implementation uses the k-NN estimator for KL divergence: KL(P||Q) ≈ (d/n) * sum(log(nu/rho)) + log(m/(n-1)) - + where: - P is the distribution of p_embeddings (n samples) - Q is the distribution of q_embeddings (m samples) - d is the embedding dimension - rho is the distance to the k-th nearest neighbor in P - nu is the distance to the k-th nearest neighbor in Q - + Args: p_embeddings: Embeddings of distribution P, shape (n_samples_p, n_features) q_embeddings: Embeddings of distribution Q, shape (n_samples_q, n_features) k: Number of nearest neighbors to use (default: 4) eps: Small epsilon to avoid division by zero (default: 1e-10) - umap_n_components: Optional UMAP target dimension. If None, uses original embeddings. - umap_n_neighbors: Number of neighbors for UMAP (default: 15) - umap_min_dist: Minimum distance for UMAP (default: 0.1) - umap_metric: Distance metric for UMAP (default: "cosine") - + Returns: float: KL divergence value (higher is more novel/different) """ - # Apply UMAP reduction if requested (apply to both embeddings together for consistency) - if umap_n_components is not None: - # Stack embeddings, apply UMAP, then split back - # This ensures both distributions are reduced in the same space - combined_embeddings = np.vstack([p_embeddings, q_embeddings]) - reduced_combined = _apply_umap_reduction( - combined_embeddings, - n_components=umap_n_components, - n_neighbors=umap_n_neighbors, - min_dist=umap_min_dist, - metric=umap_metric, - ) - p_embeddings = reduced_combined[:len(p_embeddings)] - q_embeddings = reduced_combined[len(p_embeddings):] - n, d = p_embeddings.shape m, _ = q_embeddings.shape From 5a2a4a42e551325abe21419baa866aae9819f475 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 30 Jan 2026 11:45:47 -0500 Subject: [PATCH 08/14] Removed default values --- src/cfg/run_quality_evaluation_cfg.yaml | 31 +-- src/run_quality_evaluation.py | 241 +++++++++++++++--------- src/utils/quality_evaluation_utils.py | 230 +++++++++++----------- 3 
files changed, 288 insertions(+), 214 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 144ca74..f92b0a9 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -6,51 +6,56 @@ quality_eval_cfg: scores_subdir: "scores" prior_datasets: - "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample/math-500" - + capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" - + synthetic_dataloader_config: null # Optional: custom dataloader for capabilities_dir (e.g. jsonl, csv, huggingface) + real_data_dir: null - + real_dataloader_config: type: "huggingface" dataset_name: "HuggingFaceH4/MATH-500" split: "test" subset: null text_field: "problem" - + # embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers embedding_backend: "openai" embedding_model: "text-embedding-3-large" # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) embedding_dimensions: 3072 - + # Internal diversity metrics (only need synthetic data) internal_diversity_metrics: - "mdm" # Mean Distance to Medoid - measures internal coherence - "entropy" # Differential Entropy - measures diversity/uncertainty - + # Comparison metrics (need both synthetic and real data) comparison_metrics: - "pad" # Proxy-A-Distance - measures distribution similarity - "mmd" # Maximum Mean Discrepancy - measures distribution distance - "kl_divergence" # KL Divergence - measures novelty (how different from real) - + pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" - + mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" mmd_degree: 3 - + mdm_n_clusters: 5 mdm_metric: "euclidean" - + entropy_k: 4 # Number of nearest neighbors for differential entropy computation - + kl_k: 4 # Number of nearest neighbors for KL divergence computation 
+ # Optional UMAP dimensionality reduction (like InfoSynth) + umap_n_components: 10 # Set to null to disable and use original embeddings + umap_n_neighbors: 15 # Number of neighbors for UMAP + umap_min_dist: 0.1 # Minimum distance for UMAP + umap_metric: "cosine" # Distance metric for UMAP + exp_cfg: exp_id: "quality_evaluation" defaults: - _self_ - - diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 3be6a25..82e8228 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -1,13 +1,12 @@ -"""Script to compute quality metrics (e.g., benchmark difficulty) from existing scores.""" +"""Compute quality metrics (e.g., benchmark difficulty) from existing scores.""" import json import logging import os -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Mapping, Optional, cast import hydra import numpy as np -import torch from omegaconf import DictConfig from src.generate_embeddings import EmbeddingGenerator, EmbeddingModelName @@ -21,16 +20,16 @@ compute_mdm, compute_mmd, compute_pad, + constants, fit_umap_shared, ) -from src.utils import constants from src.utils.data_utils import get_run_id from src.utils.diversity_metrics_dataloaders import ( CapabilityDataloader, - HuggingFaceDatasetDataloader, - JSONLDataloader, CSVDataloader, DatasetDataloader, + HuggingFaceDatasetDataloader, + JSONLDataloader, load_texts_from_dataloader, ) @@ -45,7 +44,8 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: Args: directory: Directory to walk recursively for JSON files. - Returns: + Returns + ------- List of accuracy values found in the directory. """ accuracies: List[float] = [] @@ -67,7 +67,8 @@ def _load_avg_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: Args: base_dir: Directory containing per-model subdirectories with JSON files. - Returns: + Returns + ------- Dictionary mapping model name to average accuracy. 
""" model_to_accuracy: Dict[str, float] = {} @@ -99,7 +100,8 @@ def _create_dataloader_from_config( data_path: Path to the data dataloader_config: Configuration dict with 'type' and other fields - Returns: + Returns + ------- DatasetDataloader instance """ dataloader_type = dataloader_config.get("type", "capability") @@ -107,11 +109,12 @@ def _create_dataloader_from_config( if dataloader_type == "capability": return CapabilityDataloader(data_path) - elif dataloader_type == "huggingface": + if dataloader_type == "huggingface": from datasets import load_dataset + dataset_name = dataloader_config.get("dataset_name") split = dataloader_config.get("split", "train") - subset = dataloader_config.get("subset", None) + subset = dataloader_config.get("subset") dataset = load_dataset(dataset_name, name=subset, split=split) return HuggingFaceDatasetDataloader( @@ -119,7 +122,7 @@ def _create_dataloader_from_config( text_field=dataloader_config.get("text_field", "problem"), ) - elif dataloader_type == "jsonl": + if dataloader_type == "jsonl": return JSONLDataloader( jsonl_path=data_path, name_field=dataloader_config.get("name_field", "name"), @@ -129,7 +132,7 @@ def _create_dataloader_from_config( task_field=dataloader_config.get("task_field", "problem"), ) - elif dataloader_type == "csv": + if dataloader_type == "csv": return CSVDataloader( csv_path=data_path, name_field=dataloader_config.get("name_field", "name"), @@ -139,8 +142,7 @@ def _create_dataloader_from_config( task_field=dataloader_config.get("task_field", "problem"), ) - else: - raise ValueError(f"Unknown dataloader type: {dataloader_type}") + raise ValueError(f"Unknown dataloader type: {dataloader_type}") def _load_capabilities_and_generate_embeddings( @@ -157,23 +159,29 @@ def _load_capabilities_and_generate_embeddings( Always uses the dataloader system for consistency. 
Args: - capabilities_dir: Directory containing capability subdirectories OR path to data file + capabilities_dir: Dir with capability subdirs or path to data file embedding_model_name: Name of embedding model to use embed_dimensions: Number of embedding dimensions dataloader_config: Optional configuration for custom dataloader. If None, defaults to capability format. - Returns: + Returns + ------- Tuple of (embeddings array, list of extracted texts) """ # Use dataloader system: default to capability format if no config provided if dataloader_config: - logger.info("Using custom dataloader: %s", dataloader_config.get("type", "unknown")) + logger.info( + "Using custom dataloader: %s", dataloader_config.get("type", "unknown") + ) dataloader = _create_dataloader_from_config(capabilities_dir, dataloader_config) else: # Default: use capability format dataloader if not os.path.isdir(capabilities_dir): - logger.error("capabilities_dir must be a directory when using default capability format: %s", capabilities_dir) + logger.error( + "capabilities_dir must be a directory when using default capability format: %s", + capabilities_dir, + ) return np.array([]), [] logger.info("Using capability format dataloader for %s", capabilities_dir) dataloader = CapabilityDataloader(capabilities_dir) @@ -204,7 +212,7 @@ def _load_capabilities_and_generate_embeddings( elif embedding_backend.lower() == "huggingface": # Use HuggingFace encoder models such as gte-Qwen try: - from sentence_transformers import SentenceTransformer # type: ignore[import] + from sentence_transformers import SentenceTransformer except Exception as exc: # noqa: BLE001 logger.error( "Failed to import sentence_transformers for HuggingFace embeddings: %s", @@ -259,14 +267,14 @@ def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: return None -@hydra.main(version_base=None, config_path="cfg", config_name="run_quality_evaluation_cfg") +@hydra.main( + version_base=None, config_path="cfg", 
config_name="run_quality_evaluation_cfg" +) def main(cfg: DictConfig) -> None: - """ - Compute benchmark-level quality metrics from saved capability scores. - """ + """Compute benchmark-level quality metrics from saved capability scores.""" run_id = get_run_id(cfg) - scores_root_dir = getattr(cfg.quality_eval_cfg, "scores_root_dir", None) + scores_root_dir = cfg.quality_eval_cfg.scores_root_dir if scores_root_dir: base_scores_dir = scores_root_dir else: @@ -294,7 +302,7 @@ def main(cfg: DictConfig) -> None: model_to_generation_accuracies: Dict[str, List[float]] = {} # Get prior dataset names to exclude them from current dataset - prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) + prior_datasets = cfg.quality_eval_cfg.prior_datasets prior_dataset_names = set() for prior_path in prior_datasets: # Extract the directory name from the path @@ -312,7 +320,8 @@ def main(cfg: DictConfig) -> None: # Check if model_dir contains subdirectories (generations/runs) subdirs = [ - d for d in os.listdir(model_dir) + d + for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) ] @@ -329,7 +338,10 @@ def main(cfg: DictConfig) -> None: generation_accuracies.append(avg_gen_acc) logger.debug( "Model '%s' generation '%s': %.4f (from %d JSON files)", - model_name, gen_dir_name, avg_gen_acc, len(gen_accuracies) + model_name, + gen_dir_name, + avg_gen_acc, + len(gen_accuracies), ) if generation_accuracies: @@ -345,12 +357,13 @@ def main(cfg: DictConfig) -> None: ) # Continue to next model if we processed subdirs continue - else: - # Structure: model_dir/...json files (no generation subdirectories) - accuracies = _collect_accuracies_from_dir(model_dir) + # Structure: model_dir/...json files (no generation subdirectories) + accuracies = _collect_accuracies_from_dir(model_dir) if not accuracies: - logger.warning("No accuracies found for model '%s' in %s", model_name, model_dir) + logger.warning( + "No accuracies found for model '%s' in %s", model_name, 
model_dir + ) continue avg_acc = sum(accuracies) / len(accuracies) @@ -380,7 +393,7 @@ def main(cfg: DictConfig) -> None: logger.warning("Could not compute consistency: %s", e) # Compute novelty if prior datasets are provided - prior_datasets = getattr(cfg.quality_eval_cfg, "prior_datasets", []) + prior_datasets = cfg.quality_eval_cfg.prior_datasets if prior_datasets: try: logger.info("Loading prior datasets for novelty computation...") @@ -391,36 +404,46 @@ def main(cfg: DictConfig) -> None: prior_datasets_accuracies.append(prior_acc) logger.info( "Loaded prior dataset from %s: %d models", - prior_dir, len(prior_acc) + prior_dir, + len(prior_acc), ) else: - logger.warning("No accuracies found in prior dataset: %s", prior_dir) + logger.warning( + "No accuracies found in prior dataset: %s", prior_dir + ) if prior_datasets_accuracies: - novelty = compute_benchmark_novelty(model_to_accuracy, prior_datasets_accuracies) + novelty = compute_benchmark_novelty( + model_to_accuracy, + cast(List[Mapping[str, float]], prior_datasets_accuracies), + ) logger.info("Benchmark novelty: %.4f", novelty) else: - logger.warning("No valid prior datasets found, skipping novelty computation.") + logger.warning( + "No valid prior datasets found, skipping novelty computation." 
+ ) except ValueError as e: logger.warning("Could not compute novelty: %s", e) except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) # Compute embedding-based metrics if capabilities directory is provided - capabilities_dir = getattr(cfg.quality_eval_cfg, "capabilities_dir", None) + capabilities_dir = cfg.quality_eval_cfg.capabilities_dir if capabilities_dir: - internal_diversity_metrics = getattr(cfg.quality_eval_cfg, "internal_diversity_metrics", ["mdm", "entropy"]) - comparison_metrics = getattr(cfg.quality_eval_cfg, "comparison_metrics", ["pad", "mmd", "kl_divergence"]) - embedding_model = getattr(cfg.quality_eval_cfg, "embedding_model", "text-embedding-3-large") - embedding_backend = getattr(cfg.quality_eval_cfg, "embedding_backend", "openai") - embed_dimensions = getattr(cfg.quality_eval_cfg, "embedding_dimensions", 3072) + internal_diversity_metrics = cfg.quality_eval_cfg.internal_diversity_metrics + comparison_metrics = cfg.quality_eval_cfg.comparison_metrics + embedding_model = cfg.quality_eval_cfg.embedding_model + embedding_backend = cfg.quality_eval_cfg.embedding_backend + embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions # Get dataloader config if provided - synth_dataloader_config = getattr(cfg.quality_eval_cfg, "synthetic_dataloader_config", None) + synth_dataloader_config = cfg.quality_eval_cfg.synthetic_dataloader_config if synth_dataloader_config: synth_dataloader_config = dict(synth_dataloader_config) - logger.info("Computing embedding-based metrics for capabilities in %s", capabilities_dir) + logger.info( + "Computing embedding-based metrics for capabilities in %s", capabilities_dir + ) # Load capabilities and generate embeddings synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( @@ -436,16 +459,18 @@ def main(cfg: DictConfig) -> None: else: real_embeddings = None # Check if real data directory/file is provided for comparison - real_data_dir = getattr(cfg.quality_eval_cfg, 
"real_data_dir", None) - real_dataloader_config = getattr(cfg.quality_eval_cfg, "real_dataloader_config", None) + real_data_dir = cfg.quality_eval_cfg.real_data_dir + real_dataloader_config = cfg.quality_eval_cfg.real_dataloader_config - # Check if we have real data: either a valid path OR a dataloader config (for HuggingFace, etc.) + # Real data: valid path or dataloader config (e.g. HuggingFace) has_real_data = False # Case 1: local path (capability/JSONL/CSV formats) - if real_data_dir and (os.path.isdir(real_data_dir) or os.path.isfile(real_data_dir)): - has_real_data = True - # Case 2: HuggingFace dataset via dataloader (real_data_dir may be None) - elif real_dataloader_config and real_dataloader_config.get("type") == "huggingface": + if ( + real_data_dir + and (os.path.isdir(real_data_dir) or os.path.isfile(real_data_dir)) + or real_dataloader_config + and real_dataloader_config.get("type") == "huggingface" + ): has_real_data = True if has_real_data: @@ -456,9 +481,11 @@ def main(cfg: DictConfig) -> None: if real_data_dir: logger.info("Loading real data embeddings from %s", real_data_dir) else: - logger.info("Loading real data embeddings using dataloader config (no local path)") + logger.info( + "Loading real data embeddings using dataloader config (no local path)" + ) real_embeddings, _ = _load_capabilities_and_generate_embeddings( - # For HuggingFace, the capabilities_dir is unused; fallback to empty string + # HuggingFace: capabilities_dir unused, pass empty string capabilities_dir=real_data_dir or "", embedding_model_name=embedding_model, embed_dimensions=embed_dimensions, @@ -467,13 +494,13 @@ def main(cfg: DictConfig) -> None: ) if len(real_embeddings) > 0: - # Compute comparison metrics that require both synthetic and real data + # Comparison metrics (need both synth and real) if "pad" in comparison_metrics: try: pad_score = compute_pad( synth_embeddings, real_embeddings, - classifier_name=getattr(cfg.quality_eval_cfg, "pad_classifier", 
"LogisticRegression"), + classifier_name=cfg.quality_eval_cfg.pad_classifier, ) logger.info("PAD score: %.4f", pad_score) except Exception as e: # noqa: BLE001 @@ -481,36 +508,39 @@ def main(cfg: DictConfig) -> None: if "mmd" in comparison_metrics: try: - mmd_kernel = getattr(cfg.quality_eval_cfg, "mmd_kernel", "polynomial") - mmd_degree = getattr(cfg.quality_eval_cfg, "mmd_degree", 3) + mmd_kernel = cfg.quality_eval_cfg.mmd_kernel + mmd_degree = cfg.quality_eval_cfg.mmd_degree mmd_score = compute_mmd( synth_embeddings, real_embeddings, kernel=mmd_kernel, degree=mmd_degree, ) - logger.info("MMD score (%s kernel): %.4f", mmd_kernel, mmd_score) + logger.info( + "MMD score (%s kernel): %.4f", mmd_kernel, mmd_score + ) except Exception as e: # noqa: BLE001 logger.warning("Error computing MMD: %s", e) else: - logger.warning("No real data embeddings generated, skipping comparison metrics") - else: - logger.info("No real_data_dir provided, skipping comparison metrics (require real data)") - - # Joint UMAP (InfoSynth-style): fit on all datasets so entropy/KL are in a shared space - umap_n_components = getattr(cfg.quality_eval_cfg, "umap_n_components", None) - umap_n_neighbors = getattr(cfg.quality_eval_cfg, "umap_n_neighbors", 15) - umap_min_dist = getattr(cfg.quality_eval_cfg, "umap_min_dist", 0.1) - umap_metric = getattr(cfg.quality_eval_cfg, "umap_metric", "cosine") - need_joint_umap = ( - umap_n_components is not None - and ( - "entropy" in internal_diversity_metrics - or ( - "kl_divergence" in comparison_metrics - and real_embeddings is not None - and len(real_embeddings) > 0 + logger.warning( + "No real data embeddings generated, skipping comparison metrics" ) + else: + logger.info( + "No real_data_dir provided, skipping comparison metrics (require real data)" + ) + + # Joint UMAP + umap_n_components = cfg.quality_eval_cfg.umap_n_components + umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors + umap_min_dist = cfg.quality_eval_cfg.umap_min_dist + umap_metric = 
cfg.quality_eval_cfg.umap_metric + need_joint_umap = umap_n_components is not None and ( + "entropy" in internal_diversity_metrics + or ( + "kl_divergence" in comparison_metrics + and real_embeddings is not None + and len(real_embeddings) > 0 ) ) synth_reduced = None @@ -530,48 +560,73 @@ def main(cfg: DictConfig) -> None: if len(reduced_list) > 1: real_reduced = reduced_list[1] - # KL divergence (uses joint UMAP when need_joint_umap so synth and real share a space) + # KL divergence (joint UMAP so synth and real share a space) if ( "kl_divergence" in comparison_metrics and real_embeddings is not None and len(real_embeddings) > 0 ): try: - kl_k = getattr(cfg.quality_eval_cfg, "kl_k", 4) - kl_synth = synth_reduced if real_reduced is not None else synth_embeddings - kl_real = real_reduced if real_reduced is not None else real_embeddings - kl_score = compute_kl_divergence(kl_synth, kl_real, k=kl_k) - umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) + kl_k = cfg.quality_eval_cfg.kl_k + kl_synth = ( + synth_reduced if real_reduced is not None else synth_embeddings + ) + kl_real = ( + real_reduced if real_reduced is not None else real_embeddings + ) + if kl_synth is not None and kl_real is not None: + kl_score = compute_kl_divergence(kl_synth, kl_real, k=kl_k) + else: + kl_score = 0.0 + umap_info = ( + f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + ) + logger.info( + "KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score + ) except Exception as e: # noqa: BLE001 logger.warning("Error computing KL divergence: %s", e) # Compute internal diversity metrics (only need synthetic data) if "mdm" in internal_diversity_metrics: try: - mdm_n_clusters = getattr(cfg.quality_eval_cfg, "mdm_n_clusters", 5) - mdm_metric = getattr(cfg.quality_eval_cfg, "mdm_metric", "euclidean") + mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters + mdm_metric = 
cfg.quality_eval_cfg.mdm_metric mdm_score = compute_mdm( synth_embeddings, n_clusters=mdm_n_clusters, metric=mdm_metric, ) - logger.info("MDM score (%d clusters, %s metric): %.4f", mdm_n_clusters, mdm_metric, mdm_score) + logger.info( + "MDM score (%d clusters, %s metric): %.4f", + mdm_n_clusters, + mdm_metric, + mdm_score, + ) except Exception as e: # noqa: BLE001 logger.warning("Error computing MDM: %s", e) if "entropy" in internal_diversity_metrics: try: - entropy_k = getattr(cfg.quality_eval_cfg, "entropy_k", 4) - entropy_emb = synth_reduced if synth_reduced is not None else synth_embeddings - entropy_score = compute_differential_entropy(entropy_emb, k=entropy_k) - umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - logger.info("Differential entropy score (k=%d)%s: %.4f", entropy_k, umap_info, entropy_score) + entropy_k = cfg.quality_eval_cfg.entropy_k + entropy_emb = ( + synth_reduced if synth_reduced is not None else synth_embeddings + ) + entropy_score = compute_differential_entropy( + entropy_emb, k=entropy_k + ) + umap_info = ( + f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + ) + logger.info( + "Differential entropy score (k=%d)%s: %.4f", + entropy_k, + umap_info, + entropy_score, + ) except Exception as e: # noqa: BLE001 logger.warning("Error computing differential entropy: %s", e) if __name__ == "__main__": main() - - diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index e31456a..f5a9154 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -4,29 +4,31 @@ import statistics import warnings -from typing import Iterable, List, Mapping, Optional, Union +from typing import Iterable, List, Mapping, Union +import kmedoids import numpy as np -from scipy.stats import spearmanr from scipy.special import digamma, gammaln -from sklearn.linear_model import LogisticRegression +from scipy.stats import spearmanr from sklearn.ensemble import 
RandomForestClassifier -from sklearn.neural_network import MLPClassifier -from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import pairwise_distances from sklearn.metrics.pairwise import ( - polynomial_kernel, - rbf_kernel, laplacian_kernel, linear_kernel, + polynomial_kernel, + rbf_kernel, sigmoid_kernel, ) +from sklearn.model_selection import train_test_split from sklearn.neighbors import NearestNeighbors -import kmedoids -from sklearn.metrics import pairwise_distances +from sklearn.neural_network import MLPClassifier + # Optional UMAP import try: from umap import UMAP + UMAP_AVAILABLE = True except ImportError: UMAP_AVAILABLE = False @@ -50,16 +52,18 @@ def compute_benchmark_difficulty( accuracies: Either an iterable of accuracy values in [0.0, 1.0] for each model, or a mapping from model name to accuracy in [0.0, 1.0]. - Returns: + Returns + ------- A float in [0.0, 1.0] representing the benchmark difficulty. - Raises: + Raises + ------ ValueError: If no accuracies are provided. """ # Handle Mapping by extracting values, otherwise treat as iterable if isinstance(accuracies, Mapping): accuracies = accuracies.values() - + accuracies = list(accuracies) if not accuracies: raise ValueError("Cannot compute difficulty: no accuracies provided.") @@ -88,16 +92,18 @@ def compute_benchmark_separability( accuracies: Either an iterable of accuracy values in [0.0, 1.0] for each model, or a mapping from model name to accuracy in [0.0, 1.0]. - Returns: + Returns + ------- A non-negative float representing separability. - Raises: + Raises + ------ ValueError: If no accuracies are provided. 
""" # Handle Mapping by extracting values, otherwise treat as iterable if isinstance(accuracies, Mapping): accuracies = accuracies.values() - + accuracies = list(accuracies) if not accuracies: raise ValueError("Cannot compute separability: no accuracies provided.") @@ -112,12 +118,14 @@ def compute_benchmark_consistency( model_to_generation_accuracies: Mapping[str, Iterable[float]], ) -> float: """ - Compute benchmark consistency given per-model accuracies across multiple dataset generations. + Compute benchmark consistency from per-model accuracies across generations. - Consistency measures how stable model performance is across different dataset generations. + Consistency measures how stable model performance is across + different dataset generations. The consistency of a benchmark is defined as: - CONSISTENCY(D_gen, M) = 1 - 1/n * Σ_{i=1}^n std({performance(m_i) | D_gen,j}_{j=1}^k) + CONSISTENCY(D_gen, M) = 1 - 1/n * Σ_i std(performance(m_i)) + over dataset generations j=1..k where: - n is the number of models @@ -133,11 +141,13 @@ def compute_benchmark_consistency( on a different dataset generation. Each model should have the same number of generations (k). - Returns: + Returns + ------- A float in [0.0, 1.0] representing the benchmark consistency. Higher values indicate more consistent performance across generations. - Raises: + Raises + ------ ValueError: If no models are provided, or if models have inconsistent numbers of generations, or if any model has fewer than 2 generations (std requires at least 2 values). 
@@ -186,10 +196,8 @@ def compute_benchmark_consistency( # Average the standard deviations across all models mean_std = sum(model_stds) / len(model_stds) - # Consistency = 1 - mean_std # Clamp to [0, 1] in case of numerical issues - consistency = max(0.0, min(1.0, 1.0 - mean_std)) - return consistency + return max(0.0, min(1.0, 1.0 - mean_std)) # Source paper: AutoBencher - https://arxiv.org/abs/2407.08351 @@ -198,9 +206,10 @@ def compute_benchmark_novelty( prior_datasets_accuracies: List[Mapping[str, float]], ) -> float: """ - Compute benchmark novelty by comparing current dataset performance to prior datasets. + Compute benchmark novelty by comparing current to prior dataset performance. - Novelty measures how much new information a dataset reveals about existing models + Novelty measures how much new information a dataset reveals + about existing models over existing benchmarks. The formula is: NOVELTY(D_c, D_prev, M) = 1 - RANKCORR(v̂_c, v_c) @@ -224,11 +233,13 @@ def compute_benchmark_novelty( All mappings must contain the same set of models, and these models must match the models in current_accuracies. - Returns: + Returns + ------- A float in [0.0, 1.0] representing the benchmark novelty. Higher values indicate more novel/unique performance patterns. - Raises: + Raises + ------ ValueError: If no prior datasets provided, models don't match, or regression fails (e.g., singular matrix). 
@@ -262,24 +273,24 @@ def compute_benchmark_novelty( num_models = len(current_models) num_prior = len(prior_datasets_accuracies) - # V_prev: each column is a prior dataset's accuracies - V_prev = np.zeros((num_models, num_prior)) + # v_prev: each column is a prior dataset's accuracies + v_prev = np.zeros((num_models, num_prior)) for i, prior_acc in enumerate(prior_datasets_accuracies): for j, model in enumerate(current_models): - V_prev[j, i] = prior_acc[model] + v_prev[j, i] = prior_acc[model] # v_c: current dataset's accuracies v_c = np.array([current_accuracies[model] for model in current_models]) - # Perform linear regression: v_c = V_prev * θ + b - # We solve: min ||V_prev * θ + b - v_c||² - # To use np.linalg.lstsq, we reformulate as: [V_prev, 1] * [θ; b] = v_c + # Perform linear regression: v_c = v_prev * θ + b + # We solve: min ||v_prev * θ + b - v_c||² + # To use np.linalg.lstsq, we reformulate as: [v_prev, 1] * [θ; b] = v_c # where 1 is a column vector of ones (for the intercept b) - + # Augment design matrix with column of ones for intercept ones = np.ones((num_models, 1)) - X = np.hstack([V_prev, ones]) - + X = np.hstack([v_prev, ones]) + try: # Solve using least squares: X * params = v_c # params = [θ; b] @@ -294,8 +305,8 @@ def compute_benchmark_novelty( theta = params[:-1] # First N elements b = params[-1] # Last element (intercept) - # Compute predicted values: v̂_c = V_prev * θ + b - v_pred = V_prev @ theta + b + # Compute predicted values: v̂_c = v_prev * θ + b + v_pred = v_prev @ theta + b # Compute rank correlation (Spearman correlation) using scipy try: @@ -308,16 +319,15 @@ def compute_benchmark_novelty( if np.isnan(rank_corr) or not np.isfinite(rank_corr): return 1.0 - # Novelty = 1 - rank_correlation # Clamp to [0, 1] in case of numerical issues (e.g., negative correlation) - novelty = max(0.0, min(1.0, 1.0 - rank_corr)) - return novelty + return float(max(0.0, min(1.0, 1.0 - rank_corr))) # =========================== # ---- Diversity 
Metrics (PAD, MMD, MDM) # =========================== + # Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_pad( x_syn_emb: np.ndarray, @@ -326,29 +336,30 @@ def compute_pad( ) -> float: """ Compute the Proxy-A-Distance (PAD) between two sets of embeddings. - + PAD measures the distance between synthetic and real data distributions by training a classifier to distinguish between them. Lower values indicate more similar distributions. - + Args: x_syn_emb: Embeddings of synthetic data, shape (n_samples, n_features) x_real_emb: Embeddings of real data, shape (n_samples, n_features) classifier_name: Classifier to use ("LogisticRegression", "RandomForest", "MLP") - - Returns: + + Returns + ------- float: PAD value (typically in range [0, 2], lower is better) """ y_syn_train = np.zeros(len(x_syn_emb)) y_real_train = np.ones(len(x_real_emb)) x_train = np.concatenate([x_syn_emb, x_real_emb], axis=0) y_train = np.concatenate([y_syn_train, y_real_train], axis=0) - + # Split into train/validation x_train, x_val, y_train, y_val = train_test_split( x_train, y_train, test_size=0.2, random_state=42 ) - + # Classifier if classifier_name == "LogisticRegression": classifier = LogisticRegression(random_state=42, max_iter=1000) @@ -357,74 +368,75 @@ def compute_pad( elif classifier_name == "MLP": classifier = MLPClassifier( hidden_layer_sizes=(128, 64), - activation='relu', + activation="relu", max_iter=200, - random_state=42 + random_state=42, ) else: raise ValueError(f"Unknown classifier: {classifier_name}") - + classifier.fit(x_train, y_train) y_pred_proba = classifier.predict_proba(x_val)[:, 1] average_loss = np.mean(np.abs(y_pred_proba - y_val)) - return 2 * (1 - 2 * average_loss) + return float(2 * (1 - 2 * average_loss)) # Source paper: SynQue - https://arxiv.org/abs/2511.03928 def compute_mmd( - X: np.ndarray, - Y: np.ndarray, + x: np.ndarray, + y: np.ndarray, kernel: str = "polynomial", degree: int = 3, gamma: float | None = None, coef0: float = 1, ) -> 
float: """ - Compute the Maximum Mean Discrepancy (MMD) between two samples: X and Y. - + Compute the Maximum Mean Discrepancy (MMD) between two samples: x and y. + MMD measures the distance between two distributions in a reproducing kernel Hilbert space. Lower values indicate more similar distributions. - + Args: - X: First sample, shape (n_samples_X, n_features) - Y: Second sample, shape (n_samples_Y, n_features) + x: First sample, shape (n_samples_x, n_features) + y: Second sample, shape (n_samples_y, n_features) kernel: Kernel name ("polynomial", "rbf", "laplacian", "linear", "sigmoid") degree: Degree for polynomial kernel (default: 3) gamma: Gamma parameter for kernels (default: None, auto) coef0: Coef0 for polynomial/sigmoid kernel - - Returns: + + Returns + ------- float: MMD value (non-negative, lower is better) """ kernel = kernel.lower() if isinstance(kernel, str) else kernel if kernel == "polynomial": kfunc = polynomial_kernel - XX = kfunc(X, X, degree=degree, gamma=gamma, coef0=coef0) - YY = kfunc(Y, Y, degree=degree, gamma=gamma, coef0=coef0) - XY = kfunc(X, Y, degree=degree, gamma=gamma, coef0=coef0) + xx = kfunc(x, x, degree=degree, gamma=gamma, coef0=coef0) + yy = kfunc(y, y, degree=degree, gamma=gamma, coef0=coef0) + xy = kfunc(x, y, degree=degree, gamma=gamma, coef0=coef0) elif kernel == "rbf": kfunc = rbf_kernel - XX = kfunc(X, X, gamma=gamma) - YY = kfunc(Y, Y, gamma=gamma) - XY = kfunc(X, Y, gamma=gamma) + xx = kfunc(x, x, gamma=gamma) + yy = kfunc(y, y, gamma=gamma) + xy = kfunc(x, y, gamma=gamma) elif kernel == "laplacian": kfunc = laplacian_kernel - XX = kfunc(X, X, gamma=gamma) - YY = kfunc(Y, Y, gamma=gamma) - XY = kfunc(X, Y, gamma=gamma) + xx = kfunc(x, x, gamma=gamma) + yy = kfunc(y, y, gamma=gamma) + xy = kfunc(x, y, gamma=gamma) elif kernel == "linear": kfunc = linear_kernel - XX = kfunc(X, X) - YY = kfunc(Y, Y) - XY = kfunc(X, Y) + xx = kfunc(x, x) + yy = kfunc(y, y) + xy = kfunc(x, y) elif kernel == "sigmoid": kfunc = sigmoid_kernel 
- XX = kfunc(X, X, gamma=gamma, coef0=coef0) - YY = kfunc(Y, Y, gamma=gamma, coef0=coef0) - XY = kfunc(X, Y, gamma=gamma, coef0=coef0) + xx = kfunc(x, x, gamma=gamma, coef0=coef0) + yy = kfunc(y, y, gamma=gamma, coef0=coef0) + xy = kfunc(x, y, gamma=gamma, coef0=coef0) else: raise ValueError(f"Unknown kernel: {kernel}") - return np.mean(XX) + np.mean(YY) - 2 * np.mean(XY) + return float(np.mean(xx) + np.mean(yy) - 2 * np.mean(xy)) # Source paper: SynQue - https://arxiv.org/abs/2511.03928 @@ -434,29 +446,31 @@ def compute_mdm( metric: str = "euclidean", ) -> float: """ - Compute the mean distance of points in each cluster to its medoid, then average across clusters. - - MDM measures the internal diversity/coherence of a set of embeddings by clustering + Compute mean distance to medoid per cluster, then average across clusters. + + MDM measures the internal diversity/coherence of a set of embeddings + by clustering them and computing the average distance to cluster medoids. Lower values indicate more coherent/diverse clusters. - + Args: embeddings: Embedding matrix of shape (n_samples, n_features) n_clusters: Number of clusters/medoids to use metric: Distance metric for KMedoids ('euclidean', 'cosine', etc.) 
- - Returns: + + Returns + ------- float: Mean distance to medoid (averaged over all clusters) """ n_samples = len(embeddings) if n_samples < n_clusters: n_clusters = max(1, n_samples) - + diss = pairwise_distances(embeddings, metric=metric) pam_result = kmedoids.fasterpam(diss, n_clusters, random_state=42) labels = pam_result.labels medoid_indices = pam_result.medoids - + total_dist = 0.0 for i, medoid_idx in enumerate(medoid_indices): cluster_points_idx = np.where(labels == i)[0] @@ -471,6 +485,7 @@ def compute_mdm( # ---- Information-Theoretic Metrics (Entropy, KL-Divergence) # =========================== + def fit_umap_shared( embeddings_list: List[np.ndarray], n_components: int, @@ -479,7 +494,7 @@ def fit_umap_shared( metric: str = "cosine", ) -> List[np.ndarray]: """ - Fit UMAP on the concatenation of all embedding arrays, then split back (InfoSynth-style). + Fit UMAP on the concatenation of all embedding arrays, then split back. This ensures entropy and KL divergence are comparable across datasets by using a single shared low-dimensional space. @@ -491,13 +506,12 @@ def fit_umap_shared( min_dist: Minimum distance for UMAP (default: 0.1). metric: Distance metric for UMAP (default: "cosine"). - Returns: + Returns + ------- List of reduced embedding arrays in the same order as embeddings_list. """ if not UMAP_AVAILABLE: - raise ImportError( - "UMAP is required. Install it with: pip install umap-learn" - ) + raise ImportError("UMAP is required. Install it with: pip install umap-learn") if not embeddings_list: return [] counts = [emb.shape[0] for emb in embeddings_list] @@ -526,9 +540,9 @@ def compute_differential_entropy(embeddings: np.ndarray, k: int = 4) -> float: """ Compute the differential entropy of a set of embeddings using k-nearest neighbors. - Differential entropy measures the diversity/uncertainty in the embedding distribution. - Higher values indicate more diverse data. For a shared space across datasets, apply - UMAP (e.g. 
fit_umap_shared) to embeddings before calling this function. + Differential entropy measures the diversity/uncertainty in the + embedding distribution. + Higher values indicate more diverse data. This implementation uses the k-NN estimator for differential entropy: H(X) ≈ digamma(N) - digamma(k) + log(volume) + d * mean(log(eps)) @@ -543,22 +557,24 @@ def compute_differential_entropy(embeddings: np.ndarray, k: int = 4) -> float: embeddings: Embedding matrix of shape (n_samples, n_features) k: Number of nearest neighbors to use (default: 4) - Returns: + Returns + ------- float: Differential entropy value (higher is more diverse) """ - N, d = embeddings.shape - if N < k + 1: + n_samples, d = embeddings.shape + if k + 1 > n_samples: raise ValueError( - f"Cannot compute entropy: need at least {k + 1} samples, but got {N}." + f"Cannot compute entropy: need at least {k + 1} samples, " + f"but got {n_samples}." ) - + nbrs = NearestNeighbors(n_neighbors=k + 1).fit(embeddings) distances, _ = nbrs.kneighbors(embeddings) eps = distances[:, -1] eps[eps == 0] = np.nextafter(0, 1) - + log_vol = (d / 2) * np.log(np.pi) - gammaln(d / 2 + 1) - entropy = digamma(N) - digamma(k) + log_vol + d * np.mean(np.log(eps)) + entropy = digamma(n_samples) - digamma(k) + log_vol + d * np.mean(np.log(eps)) return float(entropy) @@ -573,8 +589,7 @@ def compute_kl_divergence( Compute the KL divergence between two sets of embeddings using k-nearest neighbors. KL divergence measures how different distribution P is from distribution Q. - Higher values indicate more novelty (P is more different from Q). For a shared - space, apply UMAP (e.g. fit_umap_shared) to [P, Q] before calling this function. + Higher values indicate more novelty (P is more different from Q). 
This implementation uses the k-NN estimator for KL divergence: KL(P||Q) ≈ (d/n) * sum(log(nu/rho)) + log(m/(n-1)) @@ -592,12 +607,13 @@ def compute_kl_divergence( k: Number of nearest neighbors to use (default: 4) eps: Small epsilon to avoid division by zero (default: 1e-10) - Returns: + Returns + ------- float: KL divergence value (higher is more novel/different) """ n, d = p_embeddings.shape m, _ = q_embeddings.shape - + if n < k + 1: raise ValueError( f"Cannot compute KL divergence: P needs at least {k + 1} samples, but got {n}." @@ -606,16 +622,14 @@ def compute_kl_divergence( raise ValueError( f"Cannot compute KL divergence: Q needs at least {k} samples, but got {m}." ) - + # Find k-th nearest neighbor in P for each point in P nbrs_p = NearestNeighbors(n_neighbors=k + 1).fit(p_embeddings) rho = np.maximum(nbrs_p.kneighbors(p_embeddings)[0][:, k], eps) - + # Find k-th nearest neighbor in Q for each point in P nbrs_q = NearestNeighbors(n_neighbors=k).fit(q_embeddings) nu = np.maximum(nbrs_q.kneighbors(p_embeddings)[0][:, k - 1], eps) - + kl_div = (d / n) * np.sum(np.log(nu / rho)) + np.log(m / (n - 1)) return float(kl_div) - - From 579cedf51db4cd383fd95a2ce8fc53406a471b79 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 5 Feb 2026 08:56:09 -0500 Subject: [PATCH 09/14] Fixed real data logic --- src/cfg/run_quality_evaluation_cfg.yaml | 70 +++++- src/run_quality_evaluation.py | 308 ++++++++++++++++-------- src/utils/__init__.py | 2 +- src/utils/quality_evaluation_utils.py | 2 +- 4 files changed, 261 insertions(+), 121 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index f92b0a9..e183c0c 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -2,22 +2,55 @@ prompt_cfg: sys_msg: Compute benchmark quality metrics from existing scores. 
quality_eval_cfg: - scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" - scores_subdir: "scores" - prior_datasets: - - "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample/math-500" + # Synthetic benchmark source (scores + capabilities) + synthetic_source: + # Root directory containing per-model score subdirs for the synthetic benchmark + scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" + # Optional subdirectory name when falling back to BASE_ARTIFACTS_DIR + scores_subdir: "scores" + # Capability directory for the synthetic benchmark + capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" - capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" - synthetic_dataloader_config: null # Optional: custom dataloader for capabilities_dir (e.g. jsonl, csv, huggingface) + # Novelty: "combined" = one score from all real sources (linear regression on all); + # "per_dataset" = one novelty per prior (how novel vs each benchmark separately); + # "both" = report combined and per-dataset. + novelty_mode: "combined" # "combined" | "per_dataset" | "both" - real_data_dir: null + # Source(s) of REAL data used for comparison metrics (PAD, MMD, KL). + # real_data_source can be: + # - a single mapping {path, dataloader, name}, OR + # - a list of such mappings when you have multiple real datasets. + # + # When multiple sources are provided, real_comparison_mode controls whether + # they are pooled together into one real distribution ("pooled") or compared + # pairwise against the synthetic data ("per_dataset") for PAD/MMD. + real_comparison_mode: "pooled" # or "per_dataset" - real_dataloader_config: - type: "huggingface" - dataset_name: "HuggingFaceH4/MATH-500" - split: "test" - subset: null - text_field: "problem" + # Example: multiple real datasets (HuggingFace math benchmarks). 
+ # Novelty uses score dirs from each source: set scores_dir explicitly, or + # we use scores_root_dir/ when name is set. + real_data_source: + - name: "MATH-500" + path: null + # Optional: explicit scores directory for novelty; otherwise uses + # scores_root_dir/name + scores_dir: null + dataloader: + type: "huggingface" + dataset_name: "HuggingFaceH4/MATH-500" + split: "test" + subset: null + text_field: "problem" + + - name: "MATH-Hard" + path: null + scores_dir: null + dataloader: + type: "huggingface" + dataset_name: "lighteval/MATH-Hard" + split: "test" + subset: null + text_field: "problem" # embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers embedding_backend: "openai" @@ -54,6 +87,17 @@ quality_eval_cfg: umap_min_dist: 0.1 # Minimum distance for UMAP umap_metric: "cosine" # Distance metric for UMAP + # Evaluation settings to use if we need to (re-)evaluate prior or real datasets. + # These mirror the subject_llm settings in src/cfg/run_cfg.yaml. 
+ evaluation_cfg: + subject_llm: + name: "o1-mini" + provider: "openai" + generation_cfg: + temperature: 0.7 + max_tokens: 2048 + seed: 42 + exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 82e8228..812c367 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -7,10 +7,10 @@ import hydra import numpy as np -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from src.generate_embeddings import EmbeddingGenerator, EmbeddingModelName -from src.utils import ( +from src.utils.quality_evaluation_utils import ( compute_benchmark_consistency, compute_benchmark_difficulty, compute_benchmark_novelty, @@ -20,8 +20,7 @@ compute_mdm, compute_mmd, compute_pad, - constants, - fit_umap_shared, + fit_umap, ) from src.utils.data_utils import get_run_id from src.utils.diversity_metrics_dataloaders import ( @@ -274,13 +273,17 @@ def main(cfg: DictConfig) -> None: """Compute benchmark-level quality metrics from saved capability scores.""" run_id = get_run_id(cfg) - scores_root_dir = cfg.quality_eval_cfg.scores_root_dir + # Synthetic benchmark source (scores + capabilities) + synthetic_cfg = cfg.quality_eval_cfg.synthetic_source + scores_root_dir = synthetic_cfg.get("scores_root_dir") + scores_subdir = synthetic_cfg.get("scores_subdir", "scores") + if scores_root_dir: base_scores_dir = scores_root_dir else: base_scores_dir = os.path.join( constants.BASE_ARTIFACTS_DIR, - cfg.quality_eval_cfg.scores_subdir, + scores_subdir, run_id, ) logger.info("Using fallback scores directory: %s", base_scores_dir) @@ -301,19 +304,7 @@ def main(cfg: DictConfig) -> None: # For consistency: map model to list of accuracies per generation model_to_generation_accuracies: Dict[str, List[float]] = {} - # Get prior dataset names to exclude them from current dataset - prior_datasets = cfg.quality_eval_cfg.prior_datasets - prior_dataset_names = set() - for prior_path in prior_datasets: - # 
Extract the directory name from the path - prior_name = os.path.basename(os.path.normpath(prior_path)) - prior_dataset_names.add(prior_name) - for model_name in os.listdir(base_scores_dir): - # Skip if this is a prior dataset directory - if model_name in prior_dataset_names: - logger.debug("Skipping prior dataset directory: %s", model_name) - continue model_dir = os.path.join(base_scores_dir, model_name) if not os.path.isdir(model_dir): continue @@ -392,16 +383,38 @@ def main(cfg: DictConfig) -> None: except ValueError as e: logger.warning("Could not compute consistency: %s", e) - # Compute novelty if prior datasets are provided - prior_datasets = cfg.quality_eval_cfg.prior_datasets - if prior_datasets: + # Compute novelty using score dirs derived from real_data_source. + novelty_score_dirs: List[str] = [] + real_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + real_source_configs: List[Mapping[str, Any]] = [] + if real_source_cfg is not None: + cfg_container = OmegaConf.to_container(real_source_cfg, resolve=True) + if isinstance(cfg_container, list): + real_source_configs = cfg_container # type: ignore[list-item] + elif isinstance(cfg_container, Mapping): + real_source_configs = [cfg_container] # type: ignore[list-item] + + # Use synthetic_source.scores_root_dir for deriving default score dirs + scores_root_dir = synthetic_cfg.get("scores_root_dir") + for src in real_source_configs: + scores_dir = src.get("scores_dir") + if not scores_dir: + src_name = src.get("name") + if scores_root_dir and src_name: + scores_dir = os.path.join(scores_root_dir, src_name) + if scores_dir: + novelty_score_dirs.append(str(scores_dir)) + + if novelty_score_dirs: try: logger.info("Loading prior datasets for novelty computation...") prior_datasets_accuracies: List[Dict[str, float]] = [] - for prior_dir in prior_datasets: + prior_labels: List[str] = [] + for prior_dir in novelty_score_dirs: prior_acc = _load_avg_model_accuracies_from_dir(prior_dir) if prior_acc: 
prior_datasets_accuracies.append(prior_acc) + prior_labels.append(os.path.basename(os.path.normpath(prior_dir))) logger.info( "Loaded prior dataset from %s: %d models", prior_dir, @@ -413,22 +426,38 @@ def main(cfg: DictConfig) -> None: ) if prior_datasets_accuracies: - novelty = compute_benchmark_novelty( - model_to_accuracy, - cast(List[Mapping[str, float]], prior_datasets_accuracies), - ) - logger.info("Benchmark novelty: %.4f", novelty) + novelty_mode = str( + cfg.quality_eval_cfg.get("novelty_mode", "combined") + ).lower() + if novelty_mode in ("combined", "both"): + novelty = compute_benchmark_novelty( + model_to_accuracy, + cast( + List[Mapping[str, float]], prior_datasets_accuracies + ), + ) + logger.info("Benchmark novelty (combined): %.4f", novelty) + if novelty_mode in ("per_dataset", "both"): + for label, prior_acc in zip( + prior_labels, prior_datasets_accuracies + ): + n_per = compute_benchmark_novelty( + model_to_accuracy, [prior_acc] + ) + logger.info( + "Novelty[%s]: %.4f", label, n_per + ) else: logger.warning( - "No valid prior datasets found, skipping novelty computation." + "No valid real data score dirs found (real_data_source with scores_dir or name), skipping novelty computation." 
) except ValueError as e: logger.warning("Could not compute novelty: %s", e) except Exception as e: # noqa: BLE001 logger.warning("Error computing novelty: %s", e) - # Compute embedding-based metrics if capabilities directory is provided - capabilities_dir = cfg.quality_eval_cfg.capabilities_dir + # Compute embedding-based metrics if synthetic capabilities directory is provided + capabilities_dir = synthetic_cfg.get("capabilities_dir") if capabilities_dir: internal_diversity_metrics = cfg.quality_eval_cfg.internal_diversity_metrics comparison_metrics = cfg.quality_eval_cfg.comparison_metrics @@ -436,11 +465,6 @@ def main(cfg: DictConfig) -> None: embedding_backend = cfg.quality_eval_cfg.embedding_backend embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions - # Get dataloader config if provided - synth_dataloader_config = cfg.quality_eval_cfg.synthetic_dataloader_config - if synth_dataloader_config: - synth_dataloader_config = dict(synth_dataloader_config) - logger.info( "Computing embedding-based metrics for capabilities in %s", capabilities_dir ) @@ -450,7 +474,7 @@ def main(cfg: DictConfig) -> None: capabilities_dir=capabilities_dir, embedding_model_name=embedding_model, embed_dimensions=embed_dimensions, - dataloader_config=synth_dataloader_config, + dataloader_config=None, embedding_backend=embedding_backend, ) @@ -458,58 +482,136 @@ def main(cfg: DictConfig) -> None: logger.warning("No embeddings generated, skipping diversity metrics") else: real_embeddings = None - # Check if real data directory/file is provided for comparison - real_data_dir = cfg.quality_eval_cfg.real_data_dir - real_dataloader_config = cfg.quality_eval_cfg.real_dataloader_config - - # Real data: valid path or dataloader config (e.g. 
HuggingFace) - has_real_data = False - # Case 1: local path (capability/JSONL/CSV formats) - if ( - real_data_dir - and (os.path.isdir(real_data_dir) or os.path.isfile(real_data_dir)) - or real_dataloader_config - and real_dataloader_config.get("type") == "huggingface" - ): - has_real_data = True - - if has_real_data: - # Get real data dataloader config if provided - if real_dataloader_config: - real_dataloader_config = dict(real_dataloader_config) - - if real_data_dir: - logger.info("Loading real data embeddings from %s", real_data_dir) + # Real data sources for comparison metrics (PAD, MMD, KL) + real_mode = str( + cfg.quality_eval_cfg.get("real_comparison_mode", "pooled") + ).lower() + real_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + + # Normalize to a list of source configs: each with optional name, path, dataloader. + # real_data_source can be a single mapping or a list of mappings. + real_source_configs: List[Dict[str, Any]] = [] + if real_source_cfg is None: + logger.info( + "real_data_source is not set in config; skipping comparison metrics (PAD, MMD, KL)." 
+ ) + else: + cfg_container = OmegaConf.to_container(real_source_cfg, resolve=True) + if isinstance(cfg_container, list): + raw_list: List[Any] = cfg_container + elif isinstance(cfg_container, Mapping): + raw_list = [cfg_container] + else: + raw_list = [] + for i, src in enumerate(raw_list): + src_dict = dict(src) + src_dict.setdefault("name", f"real_{i}") + real_source_configs.append(src_dict) + + real_embeddings_list: List[np.ndarray] = [] + real_names: List[str] = [] + + # Load embeddings for each real source + for src in real_source_configs: + name = src.get("name", "real") + real_data_path = src.get("path") + real_dataloader_cfg = src.get("dataloader") + if real_dataloader_cfg is not None and not isinstance( + real_dataloader_cfg, dict + ): + real_dataloader_cfg = dict( + OmegaConf.to_container(real_dataloader_cfg, resolve=True) + ) + + has_real_data = False + if real_data_path and ( + os.path.isdir(real_data_path) or os.path.isfile(real_data_path) + ): + has_real_data = True + elif real_dataloader_cfg and real_dataloader_cfg.get( + "type" + ) == "huggingface": + has_real_data = True + + if not has_real_data: + logger.info( + "Skipping real source %s: no valid path or dataloader (type=huggingface) provided", + name, + ) + continue + + if real_dataloader_cfg is None: + real_dataloader_cfg = {} + + if real_data_path: + logger.info("Loading real data embeddings from %s", real_data_path) else: logger.info( - "Loading real data embeddings using dataloader config (no local path)" + "Loading real data embeddings for %s using dataloader config (no local path)", + name, ) - real_embeddings, _ = _load_capabilities_and_generate_embeddings( - # HuggingFace: capabilities_dir unused, pass empty string - capabilities_dir=real_data_dir or "", + + emb_real, _ = _load_capabilities_and_generate_embeddings( + capabilities_dir=real_data_path or "", embedding_model_name=embedding_model, embed_dimensions=embed_dimensions, - dataloader_config=real_dataloader_config, + 
dataloader_config=real_dataloader_cfg, embedding_backend=embedding_backend, ) - - if len(real_embeddings) > 0: - # Comparison metrics (need both synth and real) - if "pad" in comparison_metrics: - try: + if emb_real is None or len(emb_real) == 0: + logger.warning( + "No real data embeddings generated for source %s, skipping it", + name, + ) + continue + + real_embeddings_list.append(emb_real) + real_names.append(name) + + if real_embeddings_list: + # Pooled real embeddings (used for KL + joint UMAP, and for PAD/MMD in 'pooled' mode) + real_embeddings = np.vstack(real_embeddings_list) + + # Comparison metrics (need both synth and real) + if "pad" in comparison_metrics: + try: + if real_mode == "per_dataset" and len(real_embeddings_list) > 1: + for name, emb_real in zip(real_names, real_embeddings_list): + pad_score = compute_pad( + synth_embeddings, + emb_real, + classifier_name=cfg.quality_eval_cfg.pad_classifier, + ) + logger.info("PAD[%s]: %.4f", name, pad_score) + else: pad_score = compute_pad( synth_embeddings, real_embeddings, classifier_name=cfg.quality_eval_cfg.pad_classifier, ) - logger.info("PAD score: %.4f", pad_score) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing PAD: %s", e) - - if "mmd" in comparison_metrics: - try: - mmd_kernel = cfg.quality_eval_cfg.mmd_kernel - mmd_degree = cfg.quality_eval_cfg.mmd_degree + logger.info("PAD (pooled real): %.4f", pad_score) + except Exception as e: # noqa: BLE001 + logger.warning("Error computing PAD: %s", e) + + if "mmd" in comparison_metrics: + try: + mmd_kernel = cfg.quality_eval_cfg.mmd_kernel + mmd_degree = cfg.quality_eval_cfg.mmd_degree + if real_mode == "per_dataset" and len(real_embeddings_list) > 1: + for name, emb_real in zip(real_names, real_embeddings_list): + mmd_score = compute_mmd( + synth_embeddings, + emb_real, + kernel=mmd_kernel, + degree=mmd_degree, + ) + logger.info( + "MMD[%s] (%s kernel): %.4f", + name, + mmd_kernel, + mmd_score, + ) + else: mmd_score = compute_mmd( 
synth_embeddings, real_embeddings, @@ -517,55 +619,49 @@ def main(cfg: DictConfig) -> None: degree=mmd_degree, ) logger.info( - "MMD score (%s kernel): %.4f", mmd_kernel, mmd_score + "MMD (pooled real, %s kernel): %.4f", + mmd_kernel, + mmd_score, ) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing MMD: %s", e) - else: - logger.warning( - "No real data embeddings generated, skipping comparison metrics" - ) - else: - logger.info( - "No real_data_dir provided, skipping comparison metrics (require real data)" + except Exception as e: # noqa: BLE001 + logger.warning("Error computing MMD: %s", e) + elif real_source_configs: + logger.warning( + "No real data embeddings could be generated for any source. " + "Check dataloader config (e.g. dataset_name, text_field) and embedding API/network." ) + # When real_source_configs is empty we already logged that real_data_source is not set - # Joint UMAP + # Joint UMAP (for entropy and/or KL in shared space) + has_real = ( + real_embeddings is not None and len(real_embeddings) > 0 + ) umap_n_components = cfg.quality_eval_cfg.umap_n_components umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors umap_min_dist = cfg.quality_eval_cfg.umap_min_dist umap_metric = cfg.quality_eval_cfg.umap_metric - need_joint_umap = umap_n_components is not None and ( + need_umap = umap_n_components is not None and ( "entropy" in internal_diversity_metrics - or ( - "kl_divergence" in comparison_metrics - and real_embeddings is not None - and len(real_embeddings) > 0 - ) + or ("kl_divergence" in comparison_metrics and has_real) ) synth_reduced = None real_reduced = None - if need_joint_umap: - all_emb = [synth_embeddings] - if real_embeddings is not None and len(real_embeddings) > 0: - all_emb.append(real_embeddings) - reduced_list = fit_umap_shared( - all_emb, + if need_umap: + embeddings_to_reduce = [synth_embeddings] + if has_real: + embeddings_to_reduce.append(real_embeddings) + reduced_list = fit_umap( + 
embeddings_to_reduce, umap_n_components, n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, metric=umap_metric, ) synth_reduced = reduced_list[0] - if len(reduced_list) > 1: - real_reduced = reduced_list[1] + real_reduced = reduced_list[1] if len(reduced_list) > 1 else None # KL divergence (joint UMAP so synth and real share a space) - if ( - "kl_divergence" in comparison_metrics - and real_embeddings is not None - and len(real_embeddings) > 0 - ): + if "kl_divergence" in comparison_metrics and has_real: try: kl_k = cfg.quality_eval_cfg.kl_k kl_synth = ( diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 02d2cef..5c96a0c 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -16,5 +16,5 @@ compute_mdm, compute_mmd, compute_pad, - fit_umap_shared, + fit_umap, ) diff --git a/src/utils/quality_evaluation_utils.py b/src/utils/quality_evaluation_utils.py index f5a9154..e7719c5 100644 --- a/src/utils/quality_evaluation_utils.py +++ b/src/utils/quality_evaluation_utils.py @@ -486,7 +486,7 @@ def compute_mdm( # =========================== -def fit_umap_shared( +def fit_umap( embeddings_list: List[np.ndarray], n_components: int, n_neighbors: int = 15, From 10bd2e6b31478b0569cac871f4ad230adc2923e4 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 6 Feb 2026 23:52:15 -0500 Subject: [PATCH 10/14] Refactored main function --- src/cfg/run_quality_evaluation_cfg.yaml | 31 +- src/run_quality_evaluation.py | 975 ++++++++++++++---------- 2 files changed, 574 insertions(+), 432 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index e183c0c..00acebf 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -11,14 +11,30 @@ quality_eval_cfg: # Capability directory for the synthetic benchmark capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" + # List of metrics to compute. 
Available metrics: + # - Benchmark metrics: "difficulty", "separability", "consistency" + # - Novelty: "novelty" + # - Internal diversity: "mdm", "entropy" + # - Comparison metrics: "pad", "mmd", "kl_divergence" + metrics_to_compute: + - "difficulty" + - "separability" + - "consistency" + # - "novelty" + - "mdm" + - "entropy" + - "pad" + - "mmd" + - "kl_divergence" + # Novelty: "combined" = one score from all real sources (linear regression on all); # "per_dataset" = one novelty per prior (how novel vs each benchmark separately); # "both" = report combined and per-dataset. novelty_mode: "combined" # "combined" | "per_dataset" | "both" - # Source(s) of REAL data used for comparison metrics (PAD, MMD, KL). + # Source(s) of REAL data used for comparison metrics (PAD, MMD, KL) and novelty. # real_data_source can be: - # - a single mapping {path, dataloader, name}, OR + # - a single mapping {path, dataloader, name, scores_dir}, OR # - a list of such mappings when you have multiple real datasets. 
# # When multiple sources are provided, real_comparison_mode controls whether @@ -58,17 +74,6 @@ quality_eval_cfg: # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) embedding_dimensions: 3072 - # Internal diversity metrics (only need synthetic data) - internal_diversity_metrics: - - "mdm" # Mean Distance to Medoid - measures internal coherence - - "entropy" # Differential Entropy - measures diversity/uncertainty - - # Comparison metrics (need both synthetic and real data) - comparison_metrics: - - "pad" # Proxy-A-Distance - measures distribution similarity - - "mmd" # Maximum Mean Discrepancy - measures distribution distance - - "kl_divergence" # KL Divergence - measures novelty (how different from real) - pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 812c367..a761de9 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -3,7 +3,7 @@ import json import logging import os -from typing import Any, Dict, List, Mapping, Optional, cast +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import hydra import numpy as np @@ -23,6 +23,7 @@ fit_umap, ) from src.utils.data_utils import get_run_id +from src.utils import constants from src.utils.diversity_metrics_dataloaders import ( CapabilityDataloader, CSVDataloader, @@ -36,16 +37,243 @@ logger = logging.getLogger(__name__) -def _collect_accuracies_from_dir(directory: str) -> List[float]: +def _validate_metric_requirements(cfg: DictConfig) -> None: """ - Collect all accuracy values from JSON files in a directory (recursively). + Validate that all required data is provided for the requested metrics. - Args: - directory: Directory to walk recursively for JSON files. + Raises ValueError if any required data is missing. 
+ """ + metrics_to_compute = cfg.quality_eval_cfg.get("metrics_to_compute", []) + if not metrics_to_compute: + raise ValueError( + "metrics_to_compute must be specified in config. " + "Available metrics: difficulty, separability, consistency, novelty, " + "mdm, entropy, pad, mmd, kl_divergence" + ) - Returns - ------- - List of accuracy values found in the directory. + benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + + # Benchmark metrics (difficulty, separability, consistency) need scores + benchmark_metrics = {"difficulty", "separability", "consistency"} + if benchmark_metrics.intersection(metrics_to_compute): + scores_root_dir = benchmark_source_cfg.get("scores_root_dir") + scores_subdir = benchmark_source_cfg.get("scores_subdir", "scores") + run_id = get_run_id(cfg) + + if scores_root_dir: + base_scores_dir = scores_root_dir + else: + base_scores_dir = os.path.join( + constants.BASE_ARTIFACTS_DIR, scores_subdir, run_id + ) + + if not os.path.isdir(base_scores_dir): + raise ValueError( + f"Benchmark metrics ({benchmark_metrics.intersection(metrics_to_compute)}) " + f"require scores directory to exist. " + f"benchmark scores_root_dir or fallback directory not found: {base_scores_dir}" + ) + + # Check that scores directory contains at least one model subdirectory + model_dirs = [ + d for d in os.listdir(base_scores_dir) + if os.path.isdir(os.path.join(base_scores_dir, d)) + ] + if not model_dirs: + raise ValueError( + f"Scores directory '{base_scores_dir}' exists but contains no model subdirectories. " + "Please ensure scores are generated for at least one model." 
+ ) + + # For consistency metric, check that at least one model has generation subdirectories + if "consistency" in metrics_to_compute: + has_generations = False + for model_name in model_dirs: + model_dir = os.path.join(base_scores_dir, model_name) + subdirs = [ + d for d in os.listdir(model_dir) + if os.path.isdir(os.path.join(model_dir, d)) + ] + if subdirs: + has_generations = True + break + if not has_generations: + raise ValueError( + f"Consistency metric requires multiple generations per model " + f"(subdirectories in model directories), but none found in {base_scores_dir}" + ) + + # Internal diversity metrics (mdm, entropy) need capabilities_dir + internal_metrics = {"mdm", "entropy"} + if internal_metrics.intersection(metrics_to_compute): + capabilities_dir = benchmark_source_cfg.get("capabilities_dir") + if not capabilities_dir: + raise ValueError( + f"Internal diversity metrics ({internal_metrics.intersection(metrics_to_compute)}) " + "require benchmark capabilities_dir" + ) + if not os.path.isdir(capabilities_dir): + raise ValueError( + f"benchmark capabilities_dir does not exist: {capabilities_dir}" + ) + # Check that capabilities_dir contains at least one capability.json file + single_cap_json = os.path.join(capabilities_dir, "capability.json") + if not os.path.exists(single_cap_json): + # Check subdirectories + has_capability = False + for item_name in os.listdir(capabilities_dir): + item_path = os.path.join(capabilities_dir, item_name) + if os.path.isdir(item_path): + cap_json = os.path.join(item_path, "capability.json") + if os.path.exists(cap_json): + has_capability = True + break + if not has_capability: + raise ValueError( + f"benchmark capabilities_dir '{capabilities_dir}' exists but contains " + "no capability.json files (neither directly nor in subdirectories)" + ) + + # Comparison metrics (pad, mmd, kl_divergence) need benchmark + reference data + comparison_metrics = {"pad", "mmd", "kl_divergence"} + if 
comparison_metrics.intersection(metrics_to_compute): + capabilities_dir = benchmark_source_cfg.get("capabilities_dir") + if not capabilities_dir: + raise ValueError( + f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " + "require benchmark capabilities_dir" + ) + if not os.path.isdir(capabilities_dir): + raise ValueError( + f"benchmark capabilities_dir does not exist: {capabilities_dir}" + ) + # Check that capabilities_dir contains at least one capability.json file + single_cap_json = os.path.join(capabilities_dir, "capability.json") + if not os.path.exists(single_cap_json): + # Check subdirectories + has_capability = False + for item_name in os.listdir(capabilities_dir): + item_path = os.path.join(capabilities_dir, item_name) + if os.path.isdir(item_path): + cap_json = os.path.join(item_path, "capability.json") + if os.path.exists(cap_json): + has_capability = True + break + if not has_capability: + raise ValueError( + f"benchmark capabilities_dir '{capabilities_dir}' exists but contains " + "no capability.json files (neither directly nor in subdirectories)" + ) + + if reference_data_source_cfg is None: + raise ValueError( + f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " + "require reference_data_source to be configured" + ) + + # Validate each reference source has either path or dataloader + cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) + sources = [] + if isinstance(cfg_container, list): + sources = cfg_container + elif isinstance(cfg_container, Mapping): + sources = [cfg_container] + + if not sources: + raise ValueError( + f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " + "require at least one reference_data_source entry" + ) + + for i, src in enumerate(sources): + src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + name = src_dict.get("name", f"reference_{i}") + path = src_dict.get("path") 
+ dataloader = src_dict.get("dataloader") + + has_path = path and (os.path.isdir(path) or os.path.isfile(path)) + has_dataloader = dataloader and dataloader.get("type") == "huggingface" + + if not (has_path or has_dataloader): + raise ValueError( + f"reference_data_source[{i}] ({name}) must have either a valid 'path' " + "(existing file/directory) or 'dataloader' with type='huggingface'" + ) + + # Novelty needs reference_data_source with score directories (prior accuracies) + if "novelty" in metrics_to_compute: + if reference_data_source_cfg is None: + raise ValueError("Novelty metric requires reference_data_source (prior accuracies) to be configured") + + cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) + sources = [] + if isinstance(cfg_container, list): + sources = cfg_container + elif isinstance(cfg_container, Mapping): + sources = [cfg_container] + + if not sources: + raise ValueError("Novelty metric requires at least one reference_data_source entry (for prior accuracies)") + + scores_root_dir = benchmark_source_cfg.get("scores_root_dir") + has_valid_score_dir = False + checked: List[str] = [] + for i, src in enumerate(sources): + src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + scores_dir = src_dict.get("scores_dir") + if not scores_dir: + src_name = src_dict.get("name") + if scores_root_dir and src_name: + scores_dir = os.path.join(scores_root_dir, src_name) + else: + if not scores_root_dir: + checked.append( + f"entry {i} (name={src_dict.get('name')}): no scores_dir and scores_root_dir not set" + ) + elif not src_name: + checked.append(f"entry {i}: no scores_dir and no name to derive from scores_root_dir") + continue + if not scores_dir: + continue + if not os.path.isdir(scores_dir): + checked.append(f"{scores_dir!r} (does not exist)") + continue + model_dirs = [ + d for d in os.listdir(scores_dir) + if os.path.isdir(os.path.join(scores_dir, d)) + ] + if not model_dirs: + 
checked.append(f"{scores_dir!r} (exists but has no model subdirectories)") + continue + has_json = False + for model_name in model_dirs: + model_dir = os.path.join(scores_dir, model_name) + json_files = [f for f in os.listdir(model_dir) if f.endswith(".json")] + if json_files: + has_json = True + break + if has_json: + has_valid_score_dir = True + break + checked.append(f"{scores_dir!r} (exists, has model subdirs but no .json score files)") + + if not has_valid_score_dir: + detail = "; ".join(checked) if checked else "no scores_dir/name derived paths to check" + raise ValueError( + "Novelty uses real/reference data via prior accuracies: model scores from evaluating " + "models on those reference datasets (e.g. MATH-500, MATH-Hard). You must have run that " + "evaluation and saved scores so they exist at scores_dir (or scores_root_dir/). " + "Each directory must contain one subdir per model with Inspect eval JSON score files. " + f"Checked: {detail}. " + "Either run evaluation on the reference datasets and save scores there, or remove 'novelty' from metrics_to_compute." + ) + + +def _collect_accuracies_from_inspect_eval_dir(directory: str) -> List[float]: + """ + Collect all accuracy values from Inspect eval JSON files in a directory (recursively). + Single primitive: one dir -> list of accuracies. 
""" accuracies: List[float] = [] for root, _dirs, files in os.walk(directory): @@ -53,39 +281,42 @@ def _collect_accuracies_from_dir(directory: str) -> List[float]: if not fname.endswith(".json"): continue json_path = os.path.join(root, fname) - acc = _extract_accuracy_from_inspect_json(json_path) - if acc is not None: + try: + with open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + except Exception as exc: # noqa: BLE001 + logger.warning("Failed to read %s: %s", json_path, exc) + continue + try: + if "error" in data or "results" not in data: + continue + scores = data["results"]["scores"] + if not scores: + continue + acc = float(scores[0]["metrics"]["accuracy"]["value"]) accuracies.append(acc) + except (KeyError, TypeError, ValueError) as exc: + logger.warning("Failed to extract accuracy from %s: %s", json_path, exc) return accuracies -def _load_avg_model_accuracies_from_dir(base_dir: str) -> Dict[str, float]: +def _load_average_accuracy_per_model_from_scores_dir(base_dir: str) -> Dict[str, float]: """ - Load model accuracies from a directory structure. - - Args: - base_dir: Directory containing per-model subdirectories with JSON files. - - Returns - ------- - Dictionary mapping model name to average accuracy. + Load a scores directory with one subdir per model (each containing Inspect eval JSONs) + and return model name -> average accuracy. Used for prior (reference) score dirs (e.g. novelty). + Returns empty dict if base_dir does not exist. 
""" model_to_accuracy: Dict[str, float] = {} - if not os.path.isdir(base_dir): logger.warning("Directory does not exist: %s", base_dir) return model_to_accuracy - for model_name in os.listdir(base_dir): model_dir = os.path.join(base_dir, model_name) if not os.path.isdir(model_dir): continue - - accuracies = _collect_accuracies_from_dir(model_dir) + accuracies = _collect_accuracies_from_inspect_eval_dir(model_dir) if accuracies: - avg_acc = sum(accuracies) / len(accuracies) - model_to_accuracy[model_name] = avg_acc - + model_to_accuracy[model_name] = sum(accuracies) / len(accuracies) return model_to_accuracy @@ -240,43 +471,12 @@ def _load_capabilities_and_generate_embeddings( return embeddings_array, texts -def _extract_accuracy_from_inspect_json(json_path: str) -> float | None: - """Extract the accuracy metric from a single Inspect eval JSON file.""" - try: - with open(json_path, "r", encoding="utf-8") as f: - data = json.load(f) - except Exception as exc: # noqa: BLE001 - logger.warning("Failed to read %s: %s", json_path, exc) - return None - - try: - # Check if file has results (successful evaluation) or error (failed evaluation) - if "error" in data or "results" not in data: - # File has error or no results, skip it - return None - - scores = data["results"]["scores"] - if not scores: - return None - metrics = scores[0]["metrics"] - acc = metrics["accuracy"]["value"] - return float(acc) - except (KeyError, TypeError, ValueError) as exc: - logger.warning("Failed to extract accuracy from %s: %s", json_path, exc) - return None - - -@hydra.main( - version_base=None, config_path="cfg", config_name="run_quality_evaluation_cfg" -) -def main(cfg: DictConfig) -> None: - """Compute benchmark-level quality metrics from saved capability scores.""" +def _load_benchmark_scores(cfg: DictConfig) -> Tuple[Dict[str, float], Dict[str, List[float]]]: + """Load model accuracies from the benchmark (evaluated) scores directory. 
Validation has already run.""" run_id = get_run_id(cfg) - - # Synthetic benchmark source (scores + capabilities) - synthetic_cfg = cfg.quality_eval_cfg.synthetic_source - scores_root_dir = synthetic_cfg.get("scores_root_dir") - scores_subdir = synthetic_cfg.get("scores_subdir", "scores") + benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + scores_root_dir = benchmark_source_cfg.get("scores_root_dir") + scores_subdir = benchmark_source_cfg.get("scores_subdir", "scores") if scores_root_dir: base_scores_dir = scores_root_dir @@ -286,22 +486,9 @@ def main(cfg: DictConfig) -> None: scores_subdir, run_id, ) - logger.info("Using fallback scores directory: %s", base_scores_dir) - - if not os.path.isdir(base_scores_dir): - logger.error( - "Scores directory '%s' does not exist. " - "Please ensure scores are generated for run_id '%s'.", - base_scores_dir, - run_id, - ) - return logger.info("Loading model accuracies from %s", base_scores_dir) - - # For each model directory, walk all JSON files and average their accuracies. 
model_to_accuracy: Dict[str, float] = {} - # For consistency: map model to list of accuracies per generation model_to_generation_accuracies: Dict[str, List[float]] = {} for model_name in os.listdir(base_scores_dir): @@ -309,7 +496,6 @@ def main(cfg: DictConfig) -> None: if not os.path.isdir(model_dir): continue - # Check if model_dir contains subdirectories (generations/runs) subdirs = [ d for d in os.listdir(model_dir) @@ -317,13 +503,10 @@ def main(cfg: DictConfig) -> None: ] if subdirs: - # Structure: model_dir/generation_dir/...json files - # Each subdirectory represents a different dataset generation generation_accuracies: List[float] = [] for gen_dir_name in sorted(subdirs): gen_dir = os.path.join(model_dir, gen_dir_name) - gen_accuracies = _collect_accuracies_from_dir(gen_dir) - + gen_accuracies = _collect_accuracies_from_inspect_eval_dir(gen_dir) if gen_accuracies: avg_gen_acc = sum(gen_accuracies) / len(gen_accuracies) generation_accuracies.append(avg_gen_acc) @@ -334,10 +517,8 @@ def main(cfg: DictConfig) -> None: avg_gen_acc, len(gen_accuracies), ) - if generation_accuracies: model_to_generation_accuracies[model_name] = generation_accuracies - # Overall average across all generations avg_acc = sum(generation_accuracies) / len(generation_accuracies) model_to_accuracy[model_name] = avg_acc logger.info( @@ -346,17 +527,11 @@ def main(cfg: DictConfig) -> None: len(generation_accuracies), avg_acc, ) - # Continue to next model if we processed subdirs continue - # Structure: model_dir/...json files (no generation subdirectories) - accuracies = _collect_accuracies_from_dir(model_dir) + accuracies = _collect_accuracies_from_inspect_eval_dir(model_dir) if not accuracies: - logger.warning( - "No accuracies found for model '%s' in %s", model_name, model_dir - ) continue - avg_acc = sum(accuracies) / len(accuracies) model_to_accuracy[model_name] = avg_acc logger.info( @@ -367,361 +542,323 @@ def main(cfg: DictConfig) -> None: ) if not model_to_accuracy: - 
logger.error("No valid model accuracies found in %s", base_scores_dir) + raise RuntimeError( + f"Unexpected: No valid model accuracies found in {base_scores_dir} " + "despite validation passing. This may indicate a race condition or file system issue." + ) + return model_to_accuracy, model_to_generation_accuracies + + +def _compute_benchmark_metrics( + model_to_accuracy: Dict[str, float], + model_to_generation_accuracies: Dict[str, List[float]], + metrics_to_compute: set, +) -> None: + """Compute difficulty, separability, and consistency from model accuracies.""" + if "difficulty" in metrics_to_compute: + difficulty = compute_benchmark_difficulty(model_to_accuracy) + logger.info("Benchmark difficulty: %.4f", difficulty) + + if "separability" in metrics_to_compute: + separability = compute_benchmark_separability(model_to_accuracy) + logger.info("Benchmark separability: %.4f", separability) + + if "consistency" in metrics_to_compute: + if not model_to_generation_accuracies: + raise RuntimeError( + "Unexpected: No model generation accuracies found despite validation passing. " + "This may indicate a race condition or file system issue." + ) + consistency = compute_benchmark_consistency(model_to_generation_accuracies) + logger.info("Benchmark consistency: %.4f", consistency) + + +def _compute_novelty_metrics( + cfg: DictConfig, + benchmark_source_cfg: DictConfig, + model_to_accuracy: Dict[str, float], + metrics_to_compute: set, +) -> None: + """Load previous (prior) accuracies and compute novelty vs benchmark. 
Combined and/or per-dataset.""" + if "novelty" not in metrics_to_compute: return - difficulty = compute_benchmark_difficulty(model_to_accuracy) - separability = compute_benchmark_separability(model_to_accuracy) - logger.info("Benchmark difficulty: %.4f", difficulty) - logger.info("Benchmark separability: %.4f", separability) - - # Compute consistency if we have multiple generations per model - if model_to_generation_accuracies: - try: - consistency = compute_benchmark_consistency(model_to_generation_accuracies) - logger.info("Benchmark consistency: %.4f", consistency) - except ValueError as e: - logger.warning("Could not compute consistency: %s", e) - - # Compute novelty using score dirs derived from real_data_source. - novelty_score_dirs: List[str] = [] - real_source_cfg = cfg.quality_eval_cfg.get("real_data_source") - real_source_configs: List[Mapping[str, Any]] = [] - if real_source_cfg is not None: - cfg_container = OmegaConf.to_container(real_source_cfg, resolve=True) - if isinstance(cfg_container, list): - real_source_configs = cfg_container # type: ignore[list-item] - elif isinstance(cfg_container, Mapping): - real_source_configs = [cfg_container] # type: ignore[list-item] + reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) + prior_source_configs = ( + cfg_container if isinstance(cfg_container, list) else [cfg_container] + ) - # Use synthetic_source.scores_root_dir for deriving default score dirs - scores_root_dir = synthetic_cfg.get("scores_root_dir") - for src in real_source_configs: - scores_dir = src.get("scores_dir") + scores_root_dir = benchmark_source_cfg.get("scores_root_dir") + prior_score_dirs: List[str] = [] + for src in prior_source_configs: + src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + scores_dir = src_dict.get("scores_dir") if not scores_dir: - src_name = src.get("name") + src_name = 
src_dict.get("name") if scores_root_dir and src_name: scores_dir = os.path.join(scores_root_dir, src_name) if scores_dir: - novelty_score_dirs.append(str(scores_dir)) + prior_score_dirs.append(str(scores_dir)) + + logger.info("Loading prior (previous) accuracies for novelty computation...") + prior_datasets_accuracies: List[Dict[str, float]] = [] + prior_labels: List[str] = [] + for prior_dir in prior_score_dirs: + prior_acc = _load_average_accuracy_per_model_from_scores_dir(prior_dir) + if not prior_acc: + raise RuntimeError( + f"Unexpected: No accuracies found in prior dataset {prior_dir} " + "despite validation passing. This may indicate a race condition or file system issue." + ) + prior_datasets_accuracies.append(prior_acc) + prior_labels.append(os.path.basename(os.path.normpath(prior_dir))) + logger.info( + "Loaded prior dataset from %s: %d models", + prior_dir, + len(prior_acc), + ) - if novelty_score_dirs: - try: - logger.info("Loading prior datasets for novelty computation...") - prior_datasets_accuracies: List[Dict[str, float]] = [] - prior_labels: List[str] = [] - for prior_dir in novelty_score_dirs: - prior_acc = _load_avg_model_accuracies_from_dir(prior_dir) - if prior_acc: - prior_datasets_accuracies.append(prior_acc) - prior_labels.append(os.path.basename(os.path.normpath(prior_dir))) - logger.info( - "Loaded prior dataset from %s: %d models", - prior_dir, - len(prior_acc), - ) - else: - logger.warning( - "No accuracies found in prior dataset: %s", prior_dir - ) + novelty_mode = str(cfg.quality_eval_cfg.get("novelty_mode", "combined")).lower() + if novelty_mode in ("combined", "both"): + novelty = compute_benchmark_novelty( + model_to_accuracy, + cast(List[Mapping[str, float]], prior_datasets_accuracies), + ) + logger.info("Benchmark novelty (combined): %.4f", novelty) + if novelty_mode in ("per_dataset", "both"): + for label, prior_acc in zip(prior_labels, prior_datasets_accuracies): + n_per = compute_benchmark_novelty(model_to_accuracy, 
[prior_acc]) + logger.info("Novelty[%s]: %.4f", label, n_per) + + +def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) -> None: + """Load benchmark and reference embeddings; compute PAD, MMD, KL, MDM, entropy.""" + internal_metrics = {"mdm", "entropy"} + comparison_metrics = {"pad", "mmd", "kl_divergence"} + needs_embeddings = bool( + internal_metrics.intersection(metrics_to_compute) + or comparison_metrics.intersection(metrics_to_compute) + ) + if not needs_embeddings: + return - if prior_datasets_accuracies: - novelty_mode = str( - cfg.quality_eval_cfg.get("novelty_mode", "combined") - ).lower() - if novelty_mode in ("combined", "both"): - novelty = compute_benchmark_novelty( - model_to_accuracy, - cast( - List[Mapping[str, float]], prior_datasets_accuracies - ), - ) - logger.info("Benchmark novelty (combined): %.4f", novelty) - if novelty_mode in ("per_dataset", "both"): - for label, prior_acc in zip( - prior_labels, prior_datasets_accuracies - ): - n_per = compute_benchmark_novelty( - model_to_accuracy, [prior_acc] - ) - logger.info( - "Novelty[%s]: %.4f", label, n_per - ) - else: - logger.warning( - "No valid real data score dirs found (real_data_source with scores_dir or name), skipping novelty computation." 
- ) - except ValueError as e: - logger.warning("Could not compute novelty: %s", e) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing novelty: %s", e) - - # Compute embedding-based metrics if synthetic capabilities directory is provided - capabilities_dir = synthetic_cfg.get("capabilities_dir") - if capabilities_dir: - internal_diversity_metrics = cfg.quality_eval_cfg.internal_diversity_metrics - comparison_metrics = cfg.quality_eval_cfg.comparison_metrics - embedding_model = cfg.quality_eval_cfg.embedding_model - embedding_backend = cfg.quality_eval_cfg.embedding_backend - embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions + benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + capabilities_dir = benchmark_source_cfg.get("capabilities_dir") + embedding_model = cfg.quality_eval_cfg.embedding_model + embedding_backend = cfg.quality_eval_cfg.embedding_backend + embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions - logger.info( - "Computing embedding-based metrics for capabilities in %s", capabilities_dir + logger.info( + "Computing embedding-based metrics for capabilities in %s", capabilities_dir + ) + benchmark_embeddings, capabilities = _load_capabilities_and_generate_embeddings( + capabilities_dir=capabilities_dir, + embedding_model_name=embedding_model, + embed_dimensions=embed_dimensions, + dataloader_config=None, + embedding_backend=embedding_backend, + ) + if len(benchmark_embeddings) == 0: + raise RuntimeError( + f"Unexpected: No embeddings generated from {capabilities_dir} " + "despite validation passing. This may indicate an embedding API/network issue." 
) - # Load capabilities and generate embeddings - synth_embeddings, capabilities = _load_capabilities_and_generate_embeddings( - capabilities_dir=capabilities_dir, - embedding_model_name=embedding_model, - embed_dimensions=embed_dimensions, - dataloader_config=None, - embedding_backend=embedding_backend, + reference_embeddings = None + reference_embeddings_list: List[np.ndarray] = [] + reference_names: List[str] = [] + + if comparison_metrics.intersection(metrics_to_compute): + reference_comparison_mode = str( + cfg.quality_eval_cfg.get("real_comparison_mode", "pooled") + ).lower() + reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) + raw_list = ( + cfg_container if isinstance(cfg_container, list) else [cfg_container] ) + reference_source_configs: List[Dict[str, Any]] = [] + for i, src in enumerate(raw_list): + src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict.setdefault("name", f"reference_{i}") + reference_source_configs.append(src_dict) + + for src in reference_source_configs: + name = src.get("name", "reference") + reference_data_path = src.get("path") + reference_dataloader_cfg = src.get("dataloader") + if reference_dataloader_cfg is not None and not isinstance(reference_dataloader_cfg, dict): + reference_dataloader_cfg = dict( + OmegaConf.to_container(reference_dataloader_cfg, resolve=True) + ) + if reference_dataloader_cfg is None: + reference_dataloader_cfg = {} - if len(synth_embeddings) == 0: - logger.warning("No embeddings generated, skipping diversity metrics") - else: - real_embeddings = None - # Real data sources for comparison metrics (PAD, MMD, KL) - real_mode = str( - cfg.quality_eval_cfg.get("real_comparison_mode", "pooled") - ).lower() - real_source_cfg = cfg.quality_eval_cfg.get("real_data_source") - - # Normalize to a list of source configs: each with optional name, path, 
dataloader. - # real_data_source can be a single mapping or a list of mappings. - real_source_configs: List[Dict[str, Any]] = [] - if real_source_cfg is None: + if reference_data_path: + logger.info("Loading reference data embeddings from %s", reference_data_path) + else: logger.info( - "real_data_source is not set in config; skipping comparison metrics (PAD, MMD, KL)." + "Loading reference data embeddings for %s using dataloader config (no local path)", + name, ) - else: - cfg_container = OmegaConf.to_container(real_source_cfg, resolve=True) - if isinstance(cfg_container, list): - raw_list: List[Any] = cfg_container - elif isinstance(cfg_container, Mapping): - raw_list = [cfg_container] + ref_emb, _ = _load_capabilities_and_generate_embeddings( + capabilities_dir=reference_data_path or "", + embedding_model_name=embedding_model, + embed_dimensions=embed_dimensions, + dataloader_config=reference_dataloader_cfg, + embedding_backend=embedding_backend, + ) + if ref_emb is None or len(ref_emb) == 0: + raise RuntimeError( + f"Failed to generate embeddings for reference source {name}. " + "Config validation passed, but embedding generation failed. " + "Check embedding API/network connectivity and dataloader configuration." 
+ ) + reference_embeddings_list.append(ref_emb) + reference_names.append(name) + + if reference_embeddings_list: + reference_embeddings = np.vstack(reference_embeddings_list) + + if "pad" in metrics_to_compute: + if reference_comparison_mode == "per_dataset" and len(reference_embeddings_list) > 1: + for name, ref_emb in zip(reference_names, reference_embeddings_list): + pad_score = compute_pad( + benchmark_embeddings, + ref_emb, + classifier_name=cfg.quality_eval_cfg.pad_classifier, + ) + logger.info("PAD[%s]: %.4f", name, pad_score) else: - raw_list = [] - for i, src in enumerate(raw_list): - src_dict = dict(src) - src_dict.setdefault("name", f"real_{i}") - real_source_configs.append(src_dict) - - real_embeddings_list: List[np.ndarray] = [] - real_names: List[str] = [] - - # Load embeddings for each real source - for src in real_source_configs: - name = src.get("name", "real") - real_data_path = src.get("path") - real_dataloader_cfg = src.get("dataloader") - if real_dataloader_cfg is not None and not isinstance( - real_dataloader_cfg, dict - ): - real_dataloader_cfg = dict( - OmegaConf.to_container(real_dataloader_cfg, resolve=True) + pad_score = compute_pad( + benchmark_embeddings, + reference_embeddings, + classifier_name=cfg.quality_eval_cfg.pad_classifier, + ) + logger.info("PAD (pooled reference): %.4f", pad_score) + + if "mmd" in metrics_to_compute: + mmd_kernel = cfg.quality_eval_cfg.mmd_kernel + mmd_degree = cfg.quality_eval_cfg.mmd_degree + if reference_comparison_mode == "per_dataset" and len(reference_embeddings_list) > 1: + for name, ref_emb in zip(reference_names, reference_embeddings_list): + mmd_score = compute_mmd( + benchmark_embeddings, + ref_emb, + kernel=mmd_kernel, + degree=mmd_degree, + ) + logger.info( + "MMD[%s] (%s kernel): %.4f", + name, + mmd_kernel, + mmd_score, + ) + else: + mmd_score = compute_mmd( + benchmark_embeddings, + reference_embeddings, + kernel=mmd_kernel, + degree=mmd_degree, ) - - has_real_data = False - if real_data_path 
and ( - os.path.isdir(real_data_path) or os.path.isfile(real_data_path) - ): - has_real_data = True - elif real_dataloader_cfg and real_dataloader_cfg.get( - "type" - ) == "huggingface": - has_real_data = True - - if not has_real_data: logger.info( - "Skipping real source %s: no valid path or dataloader (type=huggingface) provided", - name, + "MMD (pooled reference, %s kernel): %.4f", + mmd_kernel, + mmd_score, ) - continue - if real_dataloader_cfg is None: - real_dataloader_cfg = {} + has_reference = reference_embeddings is not None and len(reference_embeddings) > 0 + umap_n_components = cfg.quality_eval_cfg.umap_n_components + umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors + umap_min_dist = cfg.quality_eval_cfg.umap_min_dist + umap_metric = cfg.quality_eval_cfg.umap_metric + need_umap = umap_n_components is not None and ( + "entropy" in metrics_to_compute + or ("kl_divergence" in metrics_to_compute and has_reference) + ) + benchmark_reduced = None + reference_reduced = None + if need_umap: + embeddings_to_reduce = [benchmark_embeddings] + if has_reference: + embeddings_to_reduce.append(reference_embeddings) + reduced_list = fit_umap( + embeddings_to_reduce, + umap_n_components, + n_neighbors=umap_n_neighbors, + min_dist=umap_min_dist, + metric=umap_metric, + ) + benchmark_reduced = reduced_list[0] + reference_reduced = reduced_list[1] if len(reduced_list) > 1 else None - if real_data_path: - logger.info("Loading real data embeddings from %s", real_data_path) - else: - logger.info( - "Loading real data embeddings for %s using dataloader config (no local path)", - name, - ) + if "kl_divergence" in metrics_to_compute: + kl_k = cfg.quality_eval_cfg.kl_k + kl_benchmark = ( + benchmark_reduced if reference_reduced is not None else benchmark_embeddings + ) + kl_reference = ( + reference_reduced if reference_reduced is not None else reference_embeddings + ) + kl_score = compute_kl_divergence(kl_benchmark, kl_reference, k=kl_k) + umap_info = ( + f" (UMAP: 
{umap_n_components}D)" if umap_n_components else "" + ) + logger.info( + "KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score + ) - emb_real, _ = _load_capabilities_and_generate_embeddings( - capabilities_dir=real_data_path or "", - embedding_model_name=embedding_model, - embed_dimensions=embed_dimensions, - dataloader_config=real_dataloader_cfg, - embedding_backend=embedding_backend, - ) - if emb_real is None or len(emb_real) == 0: - logger.warning( - "No real data embeddings generated for source %s, skipping it", - name, - ) - continue + if "mdm" in metrics_to_compute: + mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters + mdm_metric = cfg.quality_eval_cfg.mdm_metric + mdm_score = compute_mdm( + benchmark_embeddings, + n_clusters=mdm_n_clusters, + metric=mdm_metric, + ) + logger.info( + "MDM score (%d clusters, %s metric): %.4f", + mdm_n_clusters, + mdm_metric, + mdm_score, + ) - real_embeddings_list.append(emb_real) - real_names.append(name) - - if real_embeddings_list: - # Pooled real embeddings (used for KL + joint UMAP, and for PAD/MMD in 'pooled' mode) - real_embeddings = np.vstack(real_embeddings_list) - - # Comparison metrics (need both synth and real) - if "pad" in comparison_metrics: - try: - if real_mode == "per_dataset" and len(real_embeddings_list) > 1: - for name, emb_real in zip(real_names, real_embeddings_list): - pad_score = compute_pad( - synth_embeddings, - emb_real, - classifier_name=cfg.quality_eval_cfg.pad_classifier, - ) - logger.info("PAD[%s]: %.4f", name, pad_score) - else: - pad_score = compute_pad( - synth_embeddings, - real_embeddings, - classifier_name=cfg.quality_eval_cfg.pad_classifier, - ) - logger.info("PAD (pooled real): %.4f", pad_score) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing PAD: %s", e) - - if "mmd" in comparison_metrics: - try: - mmd_kernel = cfg.quality_eval_cfg.mmd_kernel - mmd_degree = cfg.quality_eval_cfg.mmd_degree - if real_mode == "per_dataset" and 
len(real_embeddings_list) > 1: - for name, emb_real in zip(real_names, real_embeddings_list): - mmd_score = compute_mmd( - synth_embeddings, - emb_real, - kernel=mmd_kernel, - degree=mmd_degree, - ) - logger.info( - "MMD[%s] (%s kernel): %.4f", - name, - mmd_kernel, - mmd_score, - ) - else: - mmd_score = compute_mmd( - synth_embeddings, - real_embeddings, - kernel=mmd_kernel, - degree=mmd_degree, - ) - logger.info( - "MMD (pooled real, %s kernel): %.4f", - mmd_kernel, - mmd_score, - ) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing MMD: %s", e) - elif real_source_configs: - logger.warning( - "No real data embeddings could be generated for any source. " - "Check dataloader config (e.g. dataset_name, text_field) and embedding API/network." - ) - # When real_source_configs is empty we already logged that real_data_source is not set + if "entropy" in metrics_to_compute: + entropy_k = cfg.quality_eval_cfg.entropy_k + entropy_emb = ( + benchmark_reduced if benchmark_reduced is not None else benchmark_embeddings + ) + entropy_score = compute_differential_entropy(entropy_emb, k=entropy_k) + umap_info = ( + f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + ) + logger.info( + "Differential entropy score (k=%d)%s: %.4f", + entropy_k, + umap_info, + entropy_score, + ) - # Joint UMAP (for entropy and/or KL in shared space) - has_real = ( - real_embeddings is not None and len(real_embeddings) > 0 - ) - umap_n_components = cfg.quality_eval_cfg.umap_n_components - umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors - umap_min_dist = cfg.quality_eval_cfg.umap_min_dist - umap_metric = cfg.quality_eval_cfg.umap_metric - need_umap = umap_n_components is not None and ( - "entropy" in internal_diversity_metrics - or ("kl_divergence" in comparison_metrics and has_real) - ) - synth_reduced = None - real_reduced = None - if need_umap: - embeddings_to_reduce = [synth_embeddings] - if has_real: - embeddings_to_reduce.append(real_embeddings) - 
reduced_list = fit_umap( - embeddings_to_reduce, - umap_n_components, - n_neighbors=umap_n_neighbors, - min_dist=umap_min_dist, - metric=umap_metric, - ) - synth_reduced = reduced_list[0] - real_reduced = reduced_list[1] if len(reduced_list) > 1 else None - - # KL divergence (joint UMAP so synth and real share a space) - if "kl_divergence" in comparison_metrics and has_real: - try: - kl_k = cfg.quality_eval_cfg.kl_k - kl_synth = ( - synth_reduced if real_reduced is not None else synth_embeddings - ) - kl_real = ( - real_reduced if real_reduced is not None else real_embeddings - ) - if kl_synth is not None and kl_real is not None: - kl_score = compute_kl_divergence(kl_synth, kl_real, k=kl_k) - else: - kl_score = 0.0 - umap_info = ( - f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - ) - logger.info( - "KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score - ) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing KL divergence: %s", e) - - # Compute internal diversity metrics (only need synthetic data) - if "mdm" in internal_diversity_metrics: - try: - mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters - mdm_metric = cfg.quality_eval_cfg.mdm_metric - mdm_score = compute_mdm( - synth_embeddings, - n_clusters=mdm_n_clusters, - metric=mdm_metric, - ) - logger.info( - "MDM score (%d clusters, %s metric): %.4f", - mdm_n_clusters, - mdm_metric, - mdm_score, - ) - except Exception as e: # noqa: BLE001 - logger.warning("Error computing MDM: %s", e) - - if "entropy" in internal_diversity_metrics: - try: - entropy_k = cfg.quality_eval_cfg.entropy_k - entropy_emb = ( - synth_reduced if synth_reduced is not None else synth_embeddings - ) - entropy_score = compute_differential_entropy( - entropy_emb, k=entropy_k - ) - umap_info = ( - f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - ) - logger.info( - "Differential entropy score (k=%d)%s: %.4f", - entropy_k, - umap_info, - entropy_score, - ) - except Exception as e: # 
noqa: BLE001 - logger.warning("Error computing differential entropy: %s", e) + +@hydra.main( + version_base=None, config_path="cfg", config_name="run_quality_evaluation_cfg" +) +def main(cfg: DictConfig) -> None: + """Compute benchmark-level quality metrics from saved capability scores.""" + _validate_metric_requirements(cfg) + + metrics_to_compute = set(cfg.quality_eval_cfg.metrics_to_compute) + benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + + model_to_accuracy, model_to_generation_accuracies = _load_benchmark_scores(cfg) + _compute_benchmark_metrics( + model_to_accuracy, + model_to_generation_accuracies, + metrics_to_compute, + ) + _compute_novelty_metrics(cfg, benchmark_source_cfg, model_to_accuracy, metrics_to_compute) + _compute_embedding_based_metrics(cfg, metrics_to_compute) if __name__ == "__main__": From 31d73c4bef9c99e199cbed0ad7d236ed5497a53d Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 12 Feb 2026 00:03:55 -0500 Subject: [PATCH 11/14] Updated quality evaluation --- src/cfg/run_quality_evaluation_cfg.yaml | 27 +-- src/run_quality_evaluation.py | 271 +++++++++++++----------- 2 files changed, 149 insertions(+), 149 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 00acebf..9682bfd 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -2,13 +2,8 @@ prompt_cfg: sys_msg: Compute benchmark quality metrics from existing scores. 
quality_eval_cfg: - # Synthetic benchmark source (scores + capabilities) - synthetic_source: - # Root directory containing per-model score subdirs for the synthetic benchmark + new_data_source: scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" - # Optional subdirectory name when falling back to BASE_ARTIFACTS_DIR - scores_subdir: "scores" - # Capability directory for the synthetic benchmark capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" # List of metrics to compute. Available metrics: @@ -20,32 +15,24 @@ quality_eval_cfg: - "difficulty" - "separability" - "consistency" - # - "novelty" + - "novelty" - "mdm" - "entropy" - "pad" - "mmd" - "kl_divergence" - # Novelty: "combined" = one score from all real sources (linear regression on all); - # "per_dataset" = one novelty per prior (how novel vs each benchmark separately); - # "both" = report combined and per-dataset. - novelty_mode: "combined" # "combined" | "per_dataset" | "both" - - # Source(s) of REAL data used for comparison metrics (PAD, MMD, KL) and novelty. - # real_data_source can be: + # Source(s) of previous data used for comparison metrics (PAD, MMD, KL) and novelty. + # previous_data_sources can be: # - a single mapping {path, dataloader, name, scores_dir}, OR # - a list of such mappings when you have multiple real datasets. - # - # When multiple sources are provided, real_comparison_mode controls whether - # they are pooled together into one real distribution ("pooled") or compared - # pairwise against the synthetic data ("per_dataset") for PAD/MMD. - real_comparison_mode: "pooled" # or "per_dataset" + # PAD and MMD are always reported per previous-data source. + # UMAP (for entropy/KL) is fit on all new + all previous data combined. # Example: multiple real datasets (HuggingFace math benchmarks). # Novelty uses score dirs from each source: set scores_dir explicitly, or # we use scores_root_dir/ when name is set. 
- real_data_source: + previous_data_sources: - name: "MATH-500" path: null # Optional: explicit scores directory for novelty; otherwise uses diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index a761de9..957b945 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -10,6 +10,14 @@ from omegaconf import DictConfig, OmegaConf from src.generate_embeddings import EmbeddingGenerator, EmbeddingModelName +from src.utils.diversity_metrics_dataloaders import ( + CapabilityDataloader, + CSVDataloader, + DatasetDataloader, + HuggingFaceDatasetDataloader, + JSONLDataloader, + load_texts_from_dataloader, +) from src.utils.quality_evaluation_utils import ( compute_benchmark_consistency, compute_benchmark_difficulty, @@ -22,21 +30,26 @@ compute_pad, fit_umap, ) -from src.utils.data_utils import get_run_id -from src.utils import constants -from src.utils.diversity_metrics_dataloaders import ( - CapabilityDataloader, - CSVDataloader, - DatasetDataloader, - HuggingFaceDatasetDataloader, - JSONLDataloader, - load_texts_from_dataloader, -) logger = logging.getLogger(__name__) +def _as_dict(obj: Any) -> Dict[str, Any]: + """ + Convert an OmegaConf container-like object to a plain dict. + + Raises if the object cannot be represented as a mapping. + """ + if isinstance(obj, dict): + return obj + container = OmegaConf.to_container(obj, resolve=True) + if isinstance(container, Mapping): + mapping = cast(Mapping[str, Any], container) + return dict(mapping) + raise TypeError(f"Expected mapping-like config, got: {type(container)}") + + def _validate_metric_requirements(cfg: DictConfig) -> None: """ Validate that all required data is provided for the requested metrics. 
@@ -51,33 +64,37 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: "mdm, entropy, pad, mmd, kl_divergence" ) - benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source - reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source + reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") # Benchmark metrics (difficulty, separability, consistency) need scores benchmark_metrics = {"difficulty", "separability", "consistency"} if benchmark_metrics.intersection(metrics_to_compute): scores_root_dir = benchmark_source_cfg.get("scores_root_dir") - scores_subdir = benchmark_source_cfg.get("scores_subdir", "scores") - run_id = get_run_id(cfg) - - if scores_root_dir: - base_scores_dir = scores_root_dir - else: - base_scores_dir = os.path.join( - constants.BASE_ARTIFACTS_DIR, scores_subdir, run_id + + if not scores_root_dir: + raise ValueError( + "Benchmark metrics " + f"({benchmark_metrics.intersection(metrics_to_compute)}) " + "require 'scores_root_dir' to be set in " + "quality_eval_cfg.new_data_source. " + "Please provide the path to the directory containing one " + "subdirectory per subject model." ) - + + base_scores_dir = scores_root_dir + if not os.path.isdir(base_scores_dir): raise ValueError( f"Benchmark metrics ({benchmark_metrics.intersection(metrics_to_compute)}) " f"require scores directory to exist. " f"benchmark scores_root_dir or fallback directory not found: {base_scores_dir}" ) - + # Check that scores directory contains at least one model subdirectory model_dirs = [ - d for d in os.listdir(base_scores_dir) + d + for d in os.listdir(base_scores_dir) if os.path.isdir(os.path.join(base_scores_dir, d)) ] if not model_dirs: @@ -85,14 +102,16 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: f"Scores directory '{base_scores_dir}' exists but contains no model subdirectories. " "Please ensure scores are generated for at least one model." 
) - - # For consistency metric, check that at least one model has generation subdirectories + + # For consistency metric, check that at least one model has generation + # subdirectories. if "consistency" in metrics_to_compute: has_generations = False for model_name in model_dirs: model_dir = os.path.join(base_scores_dir, model_name) subdirs = [ - d for d in os.listdir(model_dir) + d + for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) ] if subdirs: @@ -169,7 +188,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if reference_data_source_cfg is None: raise ValueError( f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " - "require reference_data_source to be configured" + "require previous_data_sources to be configured" ) # Validate each reference source has either path or dataloader @@ -183,11 +202,11 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not sources: raise ValueError( f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " - "require at least one reference_data_source entry" + "require at least one previous_data_sources entry" ) for i, src in enumerate(sources): - src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict = _as_dict(src) name = src_dict.get("name", f"reference_{i}") path = src_dict.get("path") dataloader = src_dict.get("dataloader") @@ -197,14 +216,16 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not (has_path or has_dataloader): raise ValueError( - f"reference_data_source[{i}] ({name}) must have either a valid 'path' " + f"previous_data_sources[{i}] ({name}) must have either a valid 'path' " "(existing file/directory) or 'dataloader' with type='huggingface'" ) - # Novelty needs reference_data_source with score directories (prior accuracies) + # Novelty needs previous_data_sources with score directories (prior accuracies) if "novelty" in metrics_to_compute: if 
reference_data_source_cfg is None: - raise ValueError("Novelty metric requires reference_data_source (prior accuracies) to be configured") + raise ValueError( + "Novelty metric requires previous_data_sources (prior accuracies) to be configured" + ) cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) sources = [] @@ -214,13 +235,15 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: sources = [cfg_container] if not sources: - raise ValueError("Novelty metric requires at least one reference_data_source entry (for prior accuracies)") + raise ValueError( + "Novelty metric requires at least one previous_data_sources entry (for prior accuracies)" + ) scores_root_dir = benchmark_source_cfg.get("scores_root_dir") has_valid_score_dir = False checked: List[str] = [] for i, src in enumerate(sources): - src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict = _as_dict(src) scores_dir = src_dict.get("scores_dir") if not scores_dir: src_name = src_dict.get("name") @@ -232,7 +255,9 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: f"entry {i} (name={src_dict.get('name')}): no scores_dir and scores_root_dir not set" ) elif not src_name: - checked.append(f"entry {i}: no scores_dir and no name to derive from scores_root_dir") + checked.append( + f"entry {i}: no scores_dir and no name to derive from scores_root_dir" + ) continue if not scores_dir: continue @@ -240,11 +265,14 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: checked.append(f"{scores_dir!r} (does not exist)") continue model_dirs = [ - d for d in os.listdir(scores_dir) + d + for d in os.listdir(scores_dir) if os.path.isdir(os.path.join(scores_dir, d)) ] if not model_dirs: - checked.append(f"{scores_dir!r} (exists but has no model subdirectories)") + checked.append( + f"{scores_dir!r} (exists but has no model subdirectories)" + ) continue has_json = False for model_name in model_dirs: @@ -256,10 +284,16 
@@ def _validate_metric_requirements(cfg: DictConfig) -> None: if has_json: has_valid_score_dir = True break - checked.append(f"{scores_dir!r} (exists, has model subdirs but no .json score files)") + checked.append( + f"{scores_dir!r} (exists, has model subdirs but no .json score files)" + ) if not has_valid_score_dir: - detail = "; ".join(checked) if checked else "no scores_dir/name derived paths to check" + detail = ( + "; ".join(checked) + if checked + else "no scores_dir/name derived paths to check" + ) raise ValueError( "Novelty uses real/reference data via prior accuracies: model scores from evaluating " "models on those reference datasets (e.g. MATH-500, MATH-Hard). You must have run that " @@ -272,7 +306,11 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: def _collect_accuracies_from_inspect_eval_dir(directory: str) -> List[float]: """ - Collect all accuracy values from Inspect eval JSON files in a directory (recursively). + Collect accuracy values from Inspect eval JSON files. + + Recursively walks a directory and extracts accuracy values from Inspect eval + JSON files. + Single primitive: one dir -> list of accuracies. """ accuracies: List[float] = [] @@ -302,8 +340,12 @@ def _collect_accuracies_from_inspect_eval_dir(directory: str) -> List[float]: def _load_average_accuracy_per_model_from_scores_dir(base_dir: str) -> Dict[str, float]: """ - Load a scores directory with one subdir per model (each containing Inspect eval JSONs) - and return model name -> average accuracy. Used for prior (reference) score dirs (e.g. novelty). + Load a scores directory with one subdir per model. + + Each model subdir contains Inspect eval JSON files. + + And return model name -> average accuracy. Used for prior (reference) score dirs + (e.g. novelty). Returns empty dict if base_dir does not exist. 
""" model_to_accuracy: Dict[str, float] = {} @@ -471,22 +513,25 @@ def _load_capabilities_and_generate_embeddings( return embeddings_array, texts -def _load_benchmark_scores(cfg: DictConfig) -> Tuple[Dict[str, float], Dict[str, List[float]]]: - """Load model accuracies from the benchmark (evaluated) scores directory. Validation has already run.""" - run_id = get_run_id(cfg) - benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source +def _load_benchmark_scores( + cfg: DictConfig, +) -> Tuple[Dict[str, float], Dict[str, List[float]]]: + """Load benchmark model accuracies from the evaluated scores directory. + + Validation has already run. + """ + benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source scores_root_dir = benchmark_source_cfg.get("scores_root_dir") - scores_subdir = benchmark_source_cfg.get("scores_subdir", "scores") - if scores_root_dir: - base_scores_dir = scores_root_dir - else: - base_scores_dir = os.path.join( - constants.BASE_ARTIFACTS_DIR, - scores_subdir, - run_id, + if not scores_root_dir: + raise ValueError( + "scores_root_dir must be set in quality_eval_cfg.new_data_source " + "to load benchmark scores. It should point to a directory that " + "contains one subdirectory per subject model." ) + base_scores_dir = scores_root_dir + logger.info("Loading model accuracies from %s", base_scores_dir) model_to_accuracy: Dict[str, float] = {} model_to_generation_accuracies: Dict[str, List[float]] = {} @@ -579,11 +624,11 @@ def _compute_novelty_metrics( model_to_accuracy: Dict[str, float], metrics_to_compute: set, ) -> None: - """Load previous (prior) accuracies and compute novelty vs benchmark. 
Combined and/or per-dataset.""" + """Load prior accuracies and compute one novelty metric using all priors.""" if "novelty" not in metrics_to_compute: return - reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) prior_source_configs = ( cfg_container if isinstance(cfg_container, list) else [cfg_container] @@ -592,7 +637,7 @@ def _compute_novelty_metrics( scores_root_dir = benchmark_source_cfg.get("scores_root_dir") prior_score_dirs: List[str] = [] for src in prior_source_configs: - src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict = _as_dict(src) scores_dir = src_dict.get("scores_dir") if not scores_dir: src_name = src_dict.get("name") @@ -603,7 +648,6 @@ def _compute_novelty_metrics( logger.info("Loading prior (previous) accuracies for novelty computation...") prior_datasets_accuracies: List[Dict[str, float]] = [] - prior_labels: List[str] = [] for prior_dir in prior_score_dirs: prior_acc = _load_average_accuracy_per_model_from_scores_dir(prior_dir) if not prior_acc: @@ -612,24 +656,17 @@ def _compute_novelty_metrics( "despite validation passing. This may indicate a race condition or file system issue." 
) prior_datasets_accuracies.append(prior_acc) - prior_labels.append(os.path.basename(os.path.normpath(prior_dir))) logger.info( "Loaded prior dataset from %s: %d models", prior_dir, len(prior_acc), ) - novelty_mode = str(cfg.quality_eval_cfg.get("novelty_mode", "combined")).lower() - if novelty_mode in ("combined", "both"): - novelty = compute_benchmark_novelty( - model_to_accuracy, - cast(List[Mapping[str, float]], prior_datasets_accuracies), - ) - logger.info("Benchmark novelty (combined): %.4f", novelty) - if novelty_mode in ("per_dataset", "both"): - for label, prior_acc in zip(prior_labels, prior_datasets_accuracies): - n_per = compute_benchmark_novelty(model_to_accuracy, [prior_acc]) - logger.info("Novelty[%s]: %.4f", label, n_per) + novelty = compute_benchmark_novelty( + model_to_accuracy, + cast(List[Mapping[str, float]], prior_datasets_accuracies), + ) + logger.info("Benchmark novelty: %.4f", novelty) def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) -> None: @@ -643,7 +680,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - if not needs_embeddings: return - benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source capabilities_dir = benchmark_source_cfg.get("capabilities_dir") embedding_model = cfg.quality_eval_cfg.embedding_model embedding_backend = cfg.quality_eval_cfg.embedding_backend @@ -670,17 +707,12 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - reference_names: List[str] = [] if comparison_metrics.intersection(metrics_to_compute): - reference_comparison_mode = str( - cfg.quality_eval_cfg.get("real_comparison_mode", "pooled") - ).lower() - reference_data_source_cfg = cfg.quality_eval_cfg.get("real_data_source") + reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) - raw_list = ( - 
cfg_container if isinstance(cfg_container, list) else [cfg_container] - ) + raw_list = cfg_container if isinstance(cfg_container, list) else [cfg_container] reference_source_configs: List[Dict[str, Any]] = [] for i, src in enumerate(raw_list): - src_dict = dict(src) if isinstance(src, dict) else dict(OmegaConf.to_container(src, resolve=True)) + src_dict = _as_dict(src) src_dict.setdefault("name", f"reference_{i}") reference_source_configs.append(src_dict) @@ -688,15 +720,17 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - name = src.get("name", "reference") reference_data_path = src.get("path") reference_dataloader_cfg = src.get("dataloader") - if reference_dataloader_cfg is not None and not isinstance(reference_dataloader_cfg, dict): - reference_dataloader_cfg = dict( - OmegaConf.to_container(reference_dataloader_cfg, resolve=True) - ) + if reference_dataloader_cfg is not None and not isinstance( + reference_dataloader_cfg, dict + ): + reference_dataloader_cfg = _as_dict(reference_dataloader_cfg) if reference_dataloader_cfg is None: reference_dataloader_cfg = {} if reference_data_path: - logger.info("Loading reference data embeddings from %s", reference_data_path) + logger.info( + "Loading reference data embeddings from %s", reference_data_path + ) else: logger.info( "Loading reference data embeddings for %s using dataloader config (no local path)", @@ -722,48 +756,27 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - reference_embeddings = np.vstack(reference_embeddings_list) if "pad" in metrics_to_compute: - if reference_comparison_mode == "per_dataset" and len(reference_embeddings_list) > 1: - for name, ref_emb in zip(reference_names, reference_embeddings_list): - pad_score = compute_pad( - benchmark_embeddings, - ref_emb, - classifier_name=cfg.quality_eval_cfg.pad_classifier, - ) - logger.info("PAD[%s]: %.4f", name, pad_score) - else: + for name, ref_emb in zip(reference_names, 
reference_embeddings_list): pad_score = compute_pad( benchmark_embeddings, - reference_embeddings, + ref_emb, classifier_name=cfg.quality_eval_cfg.pad_classifier, ) - logger.info("PAD (pooled reference): %.4f", pad_score) + logger.info("PAD[%s]: %.4f", name, pad_score) if "mmd" in metrics_to_compute: mmd_kernel = cfg.quality_eval_cfg.mmd_kernel mmd_degree = cfg.quality_eval_cfg.mmd_degree - if reference_comparison_mode == "per_dataset" and len(reference_embeddings_list) > 1: - for name, ref_emb in zip(reference_names, reference_embeddings_list): - mmd_score = compute_mmd( - benchmark_embeddings, - ref_emb, - kernel=mmd_kernel, - degree=mmd_degree, - ) - logger.info( - "MMD[%s] (%s kernel): %.4f", - name, - mmd_kernel, - mmd_score, - ) - else: + for name, ref_emb in zip(reference_names, reference_embeddings_list): mmd_score = compute_mmd( benchmark_embeddings, - reference_embeddings, + ref_emb, kernel=mmd_kernel, degree=mmd_degree, ) logger.info( - "MMD (pooled reference, %s kernel): %.4f", + "MMD[%s] (%s kernel): %.4f", + name, mmd_kernel, mmd_score, ) @@ -782,6 +795,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - if need_umap: embeddings_to_reduce = [benchmark_embeddings] if has_reference: + assert reference_embeddings is not None embeddings_to_reduce.append(reference_embeddings) reduced_list = fit_umap( embeddings_to_reduce, @@ -795,19 +809,18 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - if "kl_divergence" in metrics_to_compute: kl_k = cfg.quality_eval_cfg.kl_k - kl_benchmark = ( - benchmark_reduced if reference_reduced is not None else benchmark_embeddings - ) - kl_reference = ( - reference_reduced if reference_reduced is not None else reference_embeddings - ) + if reference_reduced is not None: + assert benchmark_reduced is not None + kl_benchmark = benchmark_reduced + kl_reference = reference_reduced + else: + kl_benchmark = benchmark_embeddings + assert reference_embeddings is not 
None + kl_reference = reference_embeddings + assert kl_reference is not None kl_score = compute_kl_divergence(kl_benchmark, kl_reference, k=kl_k) - umap_info = ( - f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - ) - logger.info( - "KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score - ) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" + logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) if "mdm" in metrics_to_compute: mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters @@ -830,9 +843,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - benchmark_reduced if benchmark_reduced is not None else benchmark_embeddings ) entropy_score = compute_differential_entropy(entropy_emb, k=entropy_k) - umap_info = ( - f" (UMAP: {umap_n_components}D)" if umap_n_components else "" - ) + umap_info = f" (UMAP: {umap_n_components}D)" if umap_n_components else "" logger.info( "Differential entropy score (k=%d)%s: %.4f", entropy_k, @@ -849,7 +860,7 @@ def main(cfg: DictConfig) -> None: _validate_metric_requirements(cfg) metrics_to_compute = set(cfg.quality_eval_cfg.metrics_to_compute) - benchmark_source_cfg = cfg.quality_eval_cfg.synthetic_source + benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source model_to_accuracy, model_to_generation_accuracies = _load_benchmark_scores(cfg) _compute_benchmark_metrics( @@ -857,7 +868,9 @@ def main(cfg: DictConfig) -> None: model_to_generation_accuracies, metrics_to_compute, ) - _compute_novelty_metrics(cfg, benchmark_source_cfg, model_to_accuracy, metrics_to_compute) + _compute_novelty_metrics( + cfg, benchmark_source_cfg, model_to_accuracy, metrics_to_compute + ) _compute_embedding_based_metrics(cfg, metrics_to_compute) From 3b818bae7fb24f045c1949fbe3c0d6fdef87bbd6 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 13 Feb 2026 23:15:36 -0500 Subject: [PATCH 12/14] Chnaged names --- src/cfg/run_quality_evaluation_cfg.yaml | 151 
++++++++++++------------ src/run_quality_evaluation.py | 62 +++++----- 2 files changed, 106 insertions(+), 107 deletions(-) diff --git a/src/cfg/run_quality_evaluation_cfg.yaml b/src/cfg/run_quality_evaluation_cfg.yaml index 9682bfd..0a00388 100644 --- a/src/cfg/run_quality_evaluation_cfg.yaml +++ b/src/cfg/run_quality_evaluation_cfg.yaml @@ -1,94 +1,93 @@ prompt_cfg: sys_msg: Compute benchmark quality metrics from existing scores. -quality_eval_cfg: - new_data_source: - scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" - capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" +target_data: + scores_root_dir: "/projects/DeepLesion/projects/automated_capability_evaluation/data/scores_sample" + capabilities_dir: "/projects/aieng/public/ace/artifacts/negin_ace/taks/math/" - # List of metrics to compute. Available metrics: - # - Benchmark metrics: "difficulty", "separability", "consistency" - # - Novelty: "novelty" - # - Internal diversity: "mdm", "entropy" - # - Comparison metrics: "pad", "mmd", "kl_divergence" - metrics_to_compute: - - "difficulty" - - "separability" - - "consistency" - - "novelty" - - "mdm" - - "entropy" - - "pad" - - "mmd" - - "kl_divergence" +# List of metrics to compute. Available metrics: +# - Benchmark metrics: "difficulty", "separability", "consistency" +# - Novelty: "novelty" +# - Internal diversity: "mdm", "entropy" +# - Comparison metrics: "pad", "mmd", "kl_divergence" +metrics_to_compute: + - "difficulty" + - "separability" + - "consistency" + - "novelty" + - "mdm" + - "entropy" + - "pad" + - "mmd" + - "kl_divergence" - # Source(s) of previous data used for comparison metrics (PAD, MMD, KL) and novelty. - # previous_data_sources can be: - # - a single mapping {path, dataloader, name, scores_dir}, OR - # - a list of such mappings when you have multiple real datasets. - # PAD and MMD are always reported per previous-data source. 
- # UMAP (for entropy/KL) is fit on all new + all previous data combined. +# Source(s) of previous data used for comparison metrics (PAD, MMD, KL) and novelty. +# reference_datasets can be: +# - a single mapping {path, dataloader, name, scores_dir}, OR +# - a list of such mappings when you have multiple real datasets. +# PAD and MMD are always reported per previous-data source. +# UMAP (for entropy/KL) is fit on all new + all previous data combined. - # Example: multiple real datasets (HuggingFace math benchmarks). - # Novelty uses score dirs from each source: set scores_dir explicitly, or - # we use scores_root_dir/ when name is set. - previous_data_sources: - - name: "MATH-500" - path: null - # Optional: explicit scores directory for novelty; otherwise uses - # scores_root_dir/name - scores_dir: null - dataloader: - type: "huggingface" - dataset_name: "HuggingFaceH4/MATH-500" - split: "test" - subset: null - text_field: "problem" +# Example: multiple real datasets (HuggingFace math benchmarks). +# Novelty uses score dirs from each source: set scores_dir explicitly, or +# we use scores_root_dir/ when name is set. 
+reference_datasets: + - name: "MATH-500" + path: null + # Optional: explicit scores directory for novelty; otherwise uses + # scores_root_dir/name + scores_dir: null + dataloader: + type: "huggingface" + dataset_name: "HuggingFaceH4/MATH-500" + split: "test" + subset: null + text_field: "problem" - - name: "MATH-Hard" - path: null - scores_dir: null - dataloader: - type: "huggingface" - dataset_name: "lighteval/MATH-Hard" - split: "test" - subset: null - text_field: "problem" + - name: "MATH-Hard" + path: null + scores_dir: null + dataloader: + type: "huggingface" + dataset_name: "lighteval/MATH-Hard" + split: "test" + subset: null + text_field: "problem" - # embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers - embedding_backend: "openai" - embedding_model: "text-embedding-3-large" - # embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) - embedding_dimensions: 3072 +# embedding_backend: "openai" uses OpenAI embeddings, "huggingface" uses sentence-transformers +embedding_backend: "openai" +embedding_model: "text-embedding-3-large" +# embedding_dimensions is ignored for HuggingFace models (uses model's native dimension) +embedding_dimensions: 3072 - pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" +pad_classifier: "LogisticRegression" # Options: "LogisticRegression", "RandomForest", "MLP" - mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" - mmd_degree: 3 +mmd_kernel: "polynomial" # Options: "polynomial", "rbf", "laplacian", "linear", "sigmoid" +mmd_degree: 3 - mdm_n_clusters: 5 - mdm_metric: "euclidean" +mdm_n_clusters: 5 +mdm_metric: "euclidean" - entropy_k: 4 # Number of nearest neighbors for differential entropy computation +entropy_k: 4 # Number of nearest neighbors for differential entropy computation - kl_k: 4 # Number of nearest neighbors for KL divergence computation +kl_k: 4 # Number of nearest 
neighbors for KL divergence computation - # Optional UMAP dimensionality reduction (like InfoSynth) - umap_n_components: 10 # Set to null to disable and use original embeddings - umap_n_neighbors: 15 # Number of neighbors for UMAP - umap_min_dist: 0.1 # Minimum distance for UMAP - umap_metric: "cosine" # Distance metric for UMAP +# Optional UMAP dimensionality reduction (like InfoSynth) +umap_n_components: 10 # Set to null to disable and use original embeddings +umap_n_neighbors: 15 # Number of neighbors for UMAP +umap_min_dist: 0.1 # Minimum distance for UMAP +umap_metric: "cosine" # Distance metric for UMAP - # Evaluation settings to use if we need to (re-)evaluate prior or real datasets. - # These mirror the subject_llm settings in src/cfg/run_cfg.yaml. - evaluation_cfg: - subject_llm: - name: "o1-mini" - provider: "openai" - generation_cfg: - temperature: 0.7 - max_tokens: 2048 - seed: 42 +# Evaluation settings to use if we need to (re-)evaluate prior or real datasets. +# These mirror the subject_llm settings in src/cfg/run_cfg.yaml. +evaluation_cfg: + subject_llm: + name: "o1-mini" + provider: "openai" + generation_cfg: + temperature: 0.7 + max_tokens: 2048 + seed: 42 exp_cfg: exp_id: "quality_evaluation" diff --git a/src/run_quality_evaluation.py b/src/run_quality_evaluation.py index 957b945..75ff8d3 100644 --- a/src/run_quality_evaluation.py +++ b/src/run_quality_evaluation.py @@ -56,7 +56,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: Raises ValueError if any required data is missing. """ - metrics_to_compute = cfg.quality_eval_cfg.get("metrics_to_compute", []) + metrics_to_compute = cfg.get("metrics_to_compute", []) if not metrics_to_compute: raise ValueError( "metrics_to_compute must be specified in config. 
" @@ -64,8 +64,8 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: "mdm, entropy, pad, mmd, kl_divergence" ) - benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source - reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") + benchmark_source_cfg = cfg.target_data + reference_data_source_cfg = cfg.get("reference_datasets") # Benchmark metrics (difficulty, separability, consistency) need scores benchmark_metrics = {"difficulty", "separability", "consistency"} @@ -77,7 +77,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: "Benchmark metrics " f"({benchmark_metrics.intersection(metrics_to_compute)}) " "require 'scores_root_dir' to be set in " - "quality_eval_cfg.new_data_source. " + "target_data. " "Please provide the path to the directory containing one " "subdirectory per subject model." ) @@ -188,7 +188,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if reference_data_source_cfg is None: raise ValueError( f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " - "require previous_data_sources to be configured" + "require reference_datasets to be configured" ) # Validate each reference source has either path or dataloader @@ -202,7 +202,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not sources: raise ValueError( f"Comparison metrics ({comparison_metrics.intersection(metrics_to_compute)}) " - "require at least one previous_data_sources entry" + "require at least one reference_datasets entry" ) for i, src in enumerate(sources): @@ -216,15 +216,15 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not (has_path or has_dataloader): raise ValueError( - f"previous_data_sources[{i}] ({name}) must have either a valid 'path' " + f"reference_datasets[{i}] ({name}) must have either a valid 'path' " "(existing file/directory) or 'dataloader' with type='huggingface'" ) - # Novelty needs previous_data_sources with score directories (prior accuracies) 
+ # Novelty needs reference_datasets with score directories (prior accuracies) if "novelty" in metrics_to_compute: if reference_data_source_cfg is None: raise ValueError( - "Novelty metric requires previous_data_sources (prior accuracies) to be configured" + "Novelty metric requires reference_datasets (prior accuracies) to be configured" ) cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) @@ -236,7 +236,7 @@ def _validate_metric_requirements(cfg: DictConfig) -> None: if not sources: raise ValueError( - "Novelty metric requires at least one previous_data_sources entry (for prior accuracies)" + "Novelty metric requires at least one reference_datasets entry (for prior accuracies)" ) scores_root_dir = benchmark_source_cfg.get("scores_root_dir") @@ -520,12 +520,12 @@ def _load_benchmark_scores( Validation has already run. """ - benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source + benchmark_source_cfg = cfg.target_data scores_root_dir = benchmark_source_cfg.get("scores_root_dir") if not scores_root_dir: raise ValueError( - "scores_root_dir must be set in quality_eval_cfg.new_data_source " + "scores_root_dir must be set in target_data " "to load benchmark scores. It should point to a directory that " "contains one subdirectory per subject model." 
) @@ -628,7 +628,7 @@ def _compute_novelty_metrics( if "novelty" not in metrics_to_compute: return - reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") + reference_data_source_cfg = cfg.get("reference_datasets") cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) prior_source_configs = ( cfg_container if isinstance(cfg_container, list) else [cfg_container] @@ -680,11 +680,11 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - if not needs_embeddings: return - benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source + benchmark_source_cfg = cfg.target_data capabilities_dir = benchmark_source_cfg.get("capabilities_dir") - embedding_model = cfg.quality_eval_cfg.embedding_model - embedding_backend = cfg.quality_eval_cfg.embedding_backend - embed_dimensions = cfg.quality_eval_cfg.embedding_dimensions + embedding_model = cfg.embedding_model + embedding_backend = cfg.embedding_backend + embed_dimensions = cfg.embedding_dimensions logger.info( "Computing embedding-based metrics for capabilities in %s", capabilities_dir @@ -707,7 +707,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - reference_names: List[str] = [] if comparison_metrics.intersection(metrics_to_compute): - reference_data_source_cfg = cfg.quality_eval_cfg.get("previous_data_sources") + reference_data_source_cfg = cfg.get("reference_datasets") cfg_container = OmegaConf.to_container(reference_data_source_cfg, resolve=True) raw_list = cfg_container if isinstance(cfg_container, list) else [cfg_container] reference_source_configs: List[Dict[str, Any]] = [] @@ -760,13 +760,13 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - pad_score = compute_pad( benchmark_embeddings, ref_emb, - classifier_name=cfg.quality_eval_cfg.pad_classifier, + classifier_name=cfg.pad_classifier, ) logger.info("PAD[%s]: %.4f", name, pad_score) if "mmd" in metrics_to_compute: - 
mmd_kernel = cfg.quality_eval_cfg.mmd_kernel - mmd_degree = cfg.quality_eval_cfg.mmd_degree + mmd_kernel = cfg.mmd_kernel + mmd_degree = cfg.mmd_degree for name, ref_emb in zip(reference_names, reference_embeddings_list): mmd_score = compute_mmd( benchmark_embeddings, @@ -782,10 +782,10 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - ) has_reference = reference_embeddings is not None and len(reference_embeddings) > 0 - umap_n_components = cfg.quality_eval_cfg.umap_n_components - umap_n_neighbors = cfg.quality_eval_cfg.umap_n_neighbors - umap_min_dist = cfg.quality_eval_cfg.umap_min_dist - umap_metric = cfg.quality_eval_cfg.umap_metric + umap_n_components = cfg.umap_n_components + umap_n_neighbors = cfg.umap_n_neighbors + umap_min_dist = cfg.umap_min_dist + umap_metric = cfg.umap_metric need_umap = umap_n_components is not None and ( "entropy" in metrics_to_compute or ("kl_divergence" in metrics_to_compute and has_reference) @@ -808,7 +808,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - reference_reduced = reduced_list[1] if len(reduced_list) > 1 else None if "kl_divergence" in metrics_to_compute: - kl_k = cfg.quality_eval_cfg.kl_k + kl_k = cfg.kl_k if reference_reduced is not None: assert benchmark_reduced is not None kl_benchmark = benchmark_reduced @@ -823,8 +823,8 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - logger.info("KL divergence score (k=%d)%s: %.4f", kl_k, umap_info, kl_score) if "mdm" in metrics_to_compute: - mdm_n_clusters = cfg.quality_eval_cfg.mdm_n_clusters - mdm_metric = cfg.quality_eval_cfg.mdm_metric + mdm_n_clusters = cfg.mdm_n_clusters + mdm_metric = cfg.mdm_metric mdm_score = compute_mdm( benchmark_embeddings, n_clusters=mdm_n_clusters, @@ -838,7 +838,7 @@ def _compute_embedding_based_metrics(cfg: DictConfig, metrics_to_compute: set) - ) if "entropy" in metrics_to_compute: - entropy_k = cfg.quality_eval_cfg.entropy_k + entropy_k = 
cfg.entropy_k entropy_emb = ( benchmark_reduced if benchmark_reduced is not None else benchmark_embeddings ) @@ -859,8 +859,8 @@ def main(cfg: DictConfig) -> None: """Compute benchmark-level quality metrics from saved capability scores.""" _validate_metric_requirements(cfg) - metrics_to_compute = set(cfg.quality_eval_cfg.metrics_to_compute) - benchmark_source_cfg = cfg.quality_eval_cfg.new_data_source + metrics_to_compute = set(cfg.metrics_to_compute) + benchmark_source_cfg = cfg.target_data model_to_accuracy, model_to_generation_accuracies = _load_benchmark_scores(cfg) _compute_benchmark_metrics( From 1eb6448ea3fb0f78032d1918a4508ce696760332 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Tue, 24 Feb 2026 00:09:58 -0500 Subject: [PATCH 13/14] Added pytest --- tests/src/test_differential_entropy.py | 182 +++++++++++++++++++++++++ tests/src/test_kl_divergence.py | 162 ++++++++++++++++++++++ 2 files changed, 344 insertions(+) create mode 100644 tests/src/test_differential_entropy.py create mode 100644 tests/src/test_kl_divergence.py diff --git a/tests/src/test_differential_entropy.py b/tests/src/test_differential_entropy.py new file mode 100644 index 0000000..43af4ab --- /dev/null +++ b/tests/src/test_differential_entropy.py @@ -0,0 +1,182 @@ +import numpy as np +import pytest +from scipy.special import digamma, gammaln + +from src.utils import compute_differential_entropy + + +def _rng(seed=0): + return np.random.default_rng(seed) + + +def test_returns_float_and_finite(): + rng = _rng(0) + x = rng.normal(size=(300, 16)) + h = compute_differential_entropy(x, k=4) + assert isinstance(h, float) + assert np.isfinite(h) + + +def test_permutation_invariance(): + rng = _rng(1) + x = rng.normal(size=(250, 8)) + h1 = compute_differential_entropy(x, k=4) + + x_perm = x[rng.permutation(x.shape[0])] + h2 = compute_differential_entropy(x_perm, k=4) + + assert np.isfinite(h1) and np.isfinite(h2) + assert abs(h1 - h2) < 1e-10 + + +def test_translation_invariance(): + """ + 
# NOTE(review): this chunk is a whitespace-collapsed git patch covering four pytest
# modules; it has been reconstructed into properly formatted Python. File boundaries
# are marked with banners. Test logic, literals, and assertion tolerances are
# unchanged from the patch content.

# --- tests/src/test_differential_entropy.py (tail) ---
# The module header (imports, _rng helper) and the opening of the first test lie
# before this chunk; the def line below is reconstructed from its visible
# docstring/body — TODO confirm the original function name against the full file.


def test_translation_invariance():
    """
    Differential entropy is translation-invariant; kNN estimators based on distances should be too.
    """
    rng = _rng(2)
    x = rng.normal(size=(400, 10))
    shift = rng.normal(size=(1, 10)) * 100.0

    h1 = compute_differential_entropy(x, k=4)
    h2 = compute_differential_entropy(x + shift, k=4)

    assert np.isfinite(h1) and np.isfinite(h2)
    assert abs(h1 - h2) < 1e-6


def test_scaling_increases_entropy():
    """
    Scaling embeddings by a>1 should increase entropy by about d*log(a).
    We don't require exact equality, just the direction and rough magnitude.
    """
    rng = _rng(3)
    n, d = 1200, 6
    x = rng.normal(size=(n, d))
    a = 3.0

    h1 = compute_differential_entropy(x, k=4)
    h2 = compute_differential_entropy(x * a, k=4)

    assert np.isfinite(h1) and np.isfinite(h2)
    assert h2 > h1

    expected_shift = d * np.log(a)
    assert (h2 - h1) == pytest.approx(expected_shift, abs=0.5)


def test_more_spread_more_entropy():
    """
    A distribution with larger variance should have higher differential entropy.
    """
    rng = _rng(4)
    x_small = rng.normal(size=(800, 12)) * 0.5
    x_large = rng.normal(size=(800, 12)) * 2.0

    h_small = compute_differential_entropy(x_small, k=4)
    h_large = compute_differential_entropy(x_large, k=4)

    assert np.isfinite(h_small) and np.isfinite(h_large)
    assert h_large > h_small


def test_k_affects_estimate_but_is_finite():
    """
    Different k values should produce finite results and usually slightly different estimates.
    """
    rng = _rng(5)
    x = rng.normal(size=(600, 9))
    h4 = compute_differential_entropy(x, k=4)
    h8 = compute_differential_entropy(x, k=8)

    assert np.isfinite(h4) and np.isfinite(h8)
    assert abs(h4 - h8) > 1e-6


def test_rejects_non_2d_input():
    rng = _rng(6)
    x = rng.normal(size=(100, 5, 1))
    with pytest.raises((ValueError, AssertionError)):
        compute_differential_entropy(x, k=4)


def test_rejects_empty_input():
    x = np.empty((0, 10))
    with pytest.raises((ValueError, AssertionError)):
        compute_differential_entropy(x, k=4)


def test_rejects_k_too_large():
    rng = _rng(7)
    x = rng.normal(size=(10, 3))
    with pytest.raises((ValueError, AssertionError)):
        compute_differential_entropy(x, k=10)


def test_duplicate_points_does_not_nan():
    """
    Duplicate points can cause zero kNN distances -> log(0).
    Depending on your implementation, this might:
      - raise, or
      - return -inf, or
      - remain finite if distances are clipped.
    We only enforce: it should not be NaN (silent failure).
    """
    rng = _rng(8)
    base = rng.normal(size=(80, 4))
    x = np.vstack([base, base[:20]])

    try:
        h = compute_differential_entropy(x, k=4)
    except (ValueError, AssertionError):
        # Rejecting duplicates outright is an acceptable behavior.
        return

    assert not np.isnan(h)


def test_differential_entropy_1d():
    """
    Hand-computed Kozachenko–Leonenko (kNN) differential entropy test.

    We assume the implementation matches the formula:
        H = psi(n) - psi(k) + log(V_d) + d * mean(log(eps))
    where:
      - eps_i is the distance to the (k+1)-th nearest neighbor in X when using
        NearestNeighbors(n_neighbors=k+1) on X and then taking distances[:, k]
        (i.e., self-distance at index 0, first *other* neighbor at index 1 for k=1).
      - V_d is the volume of the unit ball in R^d.

    Choose a tiny 1D dataset with uniform spacing:
        x = [0, 1, 2], n=3, d=1, k=1

    Step 1) kNN distances (k=1):
        For each point, the nearest *other* neighbor is at distance 1:
            eps = [1, 1, 1]
        Therefore:
            mean(log(eps)) = mean(log(1)) = 0

    Step 2) Unit ball volume term in 1D:
        The "unit ball" in 1D is the interval [-1, 1], so:
            V_1 = 2
            log(V_1) = log(2)

    Step 3) Digamma simplification for integers:
        For integer n:
            psi(n) = -gamma + H_{n-1}
        where H_{m} is the m-th harmonic number.

        psi(3) = -gamma + (1 + 1/2) = -gamma + 3/2
        psi(1) = -gamma
        So:
            psi(3) - psi(1) = 3/2

    Step 4) Combine terms:
        H = (psi(3) - psi(1)) + log(2) + 1 * 0
          = 3/2 + log(2)
          ≈ 1.5 + 0.6931471805599453
          = 2.1931471805599454
    """
    x = np.array([[0.0], [1.0], [2.0]], dtype=np.float64)
    expected = 2.1931471805599454  # 1.5 + ln(2)

    actual = compute_differential_entropy(x, k=1)
    assert actual == pytest.approx(expected, rel=1e-12, abs=1e-12)


# --- tests/src/test_kl_divergence.py ---

import numpy as np
import pytest
from sklearn.neighbors import NearestNeighbors  # NOTE(review): unused in this module — consider removing

from src.utils import compute_kl_divergence as kl_divergence


def _rng(seed=0):
    # Deterministic generator so every test is reproducible.
    return np.random.default_rng(seed)


def test_returns_python_float():
    rng = _rng(0)
    p = rng.normal(size=(100, 16))
    q = rng.normal(size=(120, 16))
    val = kl_divergence(p, q, k=4, eps=1e-10)
    assert isinstance(val, float)
    assert np.isfinite(val)


def test_identical_distributions_near_zero():
    """
    If p and q are the same sample set, KL(P||Q) should be ~0.
    For kNN estimators it won't be exactly 0, so allow a tolerance.
    """
    rng = _rng(1)
    p = rng.normal(size=(300, 8))
    val = kl_divergence(p, p.copy(), k=4, eps=1e-10)
    assert np.isfinite(val)
    assert abs(val) < 0.5


def test_same_distribution_independent_samples_near_zero():
    """
    Two independent draws from the same distribution -> KL should be small.
    """
    rng = _rng(2)
    p = rng.normal(size=(400, 12))
    q = rng.normal(size=(450, 12))
    val = kl_divergence(p, q, k=4, eps=1e-10)
    assert np.isfinite(val)
    assert abs(val) < 0.3


def test_shifted_distribution_positive():
    """
    If q is a shifted version, KL(P||Q) should be > 0 (usually noticeably).
    """
    rng = _rng(3)
    p = rng.normal(size=(500, 10))
    q = rng.normal(size=(500, 10)) + 1.5
    val = kl_divergence(p, q, k=4, eps=1e-10)
    assert np.isfinite(val)
    assert val > 0.2


def test_not_symmetric_in_general():
    """
    KL is not symmetric: KL(P||Q) != KL(Q||P) generally.
    """
    rng = _rng(4)
    p = rng.normal(size=(400, 6))
    q = rng.normal(size=(400, 6)) + 0.8
    pq = kl_divergence(p, q, k=4, eps=1e-10)
    qp = kl_divergence(q, p, k=4, eps=1e-10)
    assert np.isfinite(pq) and np.isfinite(qp)
    assert abs(pq - qp) > 1e-3


def test_permutation_invariance():
    """
    Reordering rows should not change the result.
    """
    rng = _rng(5)
    p = rng.normal(size=(300, 7))
    q = rng.normal(size=(320, 7))
    val1 = kl_divergence(p, q, k=4, eps=1e-10)

    p_perm = p[rng.permutation(p.shape[0])]
    q_perm = q[rng.permutation(q.shape[0])]
    val2 = kl_divergence(p_perm, q_perm, k=4, eps=1e-10)

    assert np.isfinite(val1) and np.isfinite(val2)
    assert abs(val1 - val2) < 1e-10


def test_translation_invariance_if_distance_based():
    """
    If your estimator depends only on pairwise distances (typical for kNN),
    adding the same offset to both sets should not change KL.
    """
    rng = _rng(6)
    p = rng.normal(size=(350, 9))
    q = rng.normal(size=(360, 9))
    offset = rng.normal(size=(1, 9)) * 10.0

    val1 = kl_divergence(p, q, k=4, eps=1e-10)
    val2 = kl_divergence(p + offset, q + offset, k=4, eps=1e-10)

    assert np.isfinite(val1) and np.isfinite(val2)
    assert abs(val1 - val2) < 1e-6


def test_rejects_bad_shapes():
    rng = _rng(7)
    p = rng.normal(size=(100, 8))
    q = rng.normal(size=(100, 9))
    with pytest.raises((ValueError, AssertionError)):
        kl_divergence(p, q, k=4, eps=1e-10)


def test_rejects_non_2d_inputs():
    rng = _rng(8)
    p = rng.normal(size=(100, 8, 1))
    q = rng.normal(size=(120, 8, 1))
    with pytest.raises((ValueError, AssertionError)):
        kl_divergence(p, q, k=4, eps=1e-10)


def test_k_too_large_raises():
    rng = _rng(9)
    p = rng.normal(size=(10, 4))
    q = rng.normal(size=(12, 4))
    with pytest.raises((ValueError, AssertionError)):
        kl_divergence(p, q, k=50, eps=1e-10)


def test_eps_prevents_nan_with_duplicates():
    """
    Duplicate points can create zero distances; eps should prevent log(0)/div-by-0.
    """
    rng = _rng(10)
    base = rng.normal(size=(50, 5))
    p = np.vstack([base, base[:10]])  # duplicates
    q = rng.normal(size=(70, 5))
    val = kl_divergence(p, q, k=4, eps=1e-10)
    assert np.isfinite(val)


def test_kl_value():
    """
    Manual KL computation for:

        P = [[0], [1], [3]]
        Q = [[10], [11], [13]]
        k = 1, d = 1

    rho = [1, 1, 2]
    nu  = [10, 9, 7]

    KL = (1/3) * (ln(10) + ln(9) + ln(3.5)) + ln(3/2)
       ≈ 2.322989
    """
    p = np.array([[0.0], [1.0], [3.0]])
    q = np.array([[10.0], [11.0], [13.0]])

    expected = 2.322989

    actual = kl_divergence(p, q, k=1, eps=1e-12)

    assert actual == pytest.approx(expected, rel=1e-6, abs=1e-6)


# --- tests/src/test_benchmark_novelty.py ---

import pytest

from src.utils import compute_benchmark_novelty


def test_benchmark_novelty_zero_when_current_matches_a_prior_dataset():
    """
    Hand-computed benchmark novelty test.

    Novelty(D_c, D_prev, M) = 1 - SpearmanCorr(v_c, v_hat_c)

    Construct a case where v_c is exactly equal to one of the prior benchmark
    accuracy vectors. Then linear regression can predict perfectly:

        Let V_prev be a single column vector equal to v_c.
        A linear model v_hat = theta * v_prev + b can fit with theta=1, b=0,
        so v_hat = v_c exactly.

    Therefore:
        SpearmanCorr(v_c, v_hat) = 1 (identical values -> identical ranks)
        Novelty = 1 - 1 = 0
    """
    current = {"modelA": 0.90, "modelB": 0.70, "modelC": 0.80}
    prior1 = {"modelA": 0.90, "modelB": 0.70, "modelC": 0.80}  # exactly the same vector

    expected = 0.0
    actual = compute_benchmark_novelty(current, [prior1])

    assert actual == pytest.approx(expected, rel=1e-12, abs=1e-12)


def test_benchmark_novelty_nonzero_hand_computed_case():
    """
    Hand-computed novelty case with non-zero novelty.

    Models: A, B, C, D

    Prior vectors (as "accuracies", just numeric features):
        prior1 x1 = [-1, +1, -1, +1]
        prior2 x2 = [-1, -1, +1, +1]

    Current:
        v_c = [0.1, 0.2, 0.6, 0.3]

    Least-squares with intercept:
        b = mean(v_c) = 0.3
        theta1 = (x1^T v_c) / (x1^T x1) = (-0.2)/4 = -0.05
        theta2 = (x2^T v_c) / (x2^T x2) = (0.6)/4 = 0.15

    Predicted:
        v_hat = b + theta1*x1 + theta2*x2
              = [0.2, 0.1, 0.5, 0.4]

    Spearman ranks:
        rank(v_c)   = [1, 2, 4, 3]
        rank(v_hat) = [2, 1, 4, 3]
        sum d^2 = 2
        rho = 1 - 6*2/(4*(16-1)) = 0.8

    Novelty = 1 - rho = 0.2
    """
    current = {"A": 0.1, "B": 0.2, "C": 0.6, "D": 0.3}

    prior1 = {"A": -1.0, "B": 1.0, "C": -1.0, "D": 1.0}
    prior2 = {"A": -1.0, "B": -1.0, "C": 1.0, "D": 1.0}

    expected = 0.2
    actual = compute_benchmark_novelty(current, [prior1, prior2])

    assert actual == pytest.approx(expected, rel=1e-12, abs=1e-12)


# --- tests/src/test_mmd.py ---

import numpy as np
import pytest

from src.utils import compute_mmd


def test_mmd_linear_hand_computed():
    """
    Hand-computed MMD test for the linear kernel.

    For k(a,b) = a^T b, we have:
        MMD^2 = || mean(x) - mean(y) ||^2

    Choose 1D samples:
        x = [0, 2] -> mean(x) = (0 + 2)/2 = 1
        y = [1, 3] -> mean(y) = (1 + 3)/2 = 2

    Difference in means:
        mean(x) - mean(y) = 1 - 2 = -1

    Therefore:
        MMD^2 = (-1)^2 = 1

    The implementation returns MMD^2 (mean of kernel Gram matrices formula),
    so expected = 1.0.
    """
    x = np.array([[0.0], [2.0]], dtype=np.float64)
    y = np.array([[1.0], [3.0]], dtype=np.float64)

    expected = 1.0

    actual = compute_mmd(x, y, kernel="linear")
    assert actual == pytest.approx(expected, rel=1e-12, abs=1e-12)


# --- tests/src/test_pad.py ---

import numpy as np
import pytest

from src.utils import compute_pad


def test_pad():
    """
    Hand-computed PAD test.

    PAD = 2(1 - 2ε)

    Choose perfectly linearly separable 1D embeddings:

        Synthetic: [-10, -9, -8]
        Real:      [  8,  9, 10]

    These are separable by threshold at 0.

    Classification error ε = 0

    Therefore:
        PAD = 2(1 - 2*0) = 2

    The implementation uses a train/validation split, so the returned value
    may be slightly below 2; we assert it is close to 2 (high separability).
    """
    x_syn = np.array([[-10.0], [-9.0], [-8.0]])
    x_real = np.array([[8.0], [9.0], [10.0]])

    expected = 2.0

    actual = compute_pad(
        x_syn,
        x_real,
        classifier_name="LogisticRegression",
    )

    assert actual == pytest.approx(expected, rel=0.05, abs=0.1)