9 changes: 9 additions & 0 deletions legacy/README.md
@@ -0,0 +1,9 @@
# Legacy LBO Code

This directory contains legacy code for **Latent Bayesian Optimization (LBO)** from an earlier version of the repository. LBO was used for intelligent capability selection during evaluation.

## Compatible Version

This LBO code is compatible with the repository at commit [`a224c5ec`](https://github.com/VectorInstitute/automated_capability_evaluation/tree/a224c5ec7dd208e04ef2edc059e6e7a2d0d4bcf6). That commit contains the full working version of the codebase used for the **initial paper submission**.

**This code does not work with the current codebase.** It predates the standardization of the generation and evaluation pipelines. To see how LBO integrated with the rest of the system, refer to that commit.
5 files renamed without changes.
309 changes: 260 additions & 49 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
"datasets>=3.2.0",
"google-cloud-storage>=3.0.0",
"hydra-core>=1.3.2",
"inspect-ai>=0.3.80",
"inspect-ai>=0.3.159",
"langchain_openai>=0.3.6",
"langchain>=0.3.19",
"matplotlib>=3.10.0",
@@ -177,6 +177,8 @@ env = [
filterwarnings = [
"ignore::DeprecationWarning",
]
# Exclude legacy tests (imports are broken after code was moved)
norecursedirs = ["legacy"]

[tool.coverage]
[tool.coverage.run]
161 changes: 59 additions & 102 deletions src/cfg/run_cfg.yaml
@@ -1,3 +1,22 @@
# =============================================================================
# EXPERIMENT CONFIGURATION
# =============================================================================

exp_cfg:
exp_id: "test_exp"
seed: 37
trial_run: false

global_cfg:
domain: personal finance
output_dir: base_output/
pipeline_type: base

# =============================================================================
# GENERATION PIPELINE
# =============================================================================

# LLM for generation stages (1-5)
scientist_llm:
name: o4-mini
provider: openai
@@ -14,126 +33,64 @@ scientist_llm:
temperature: 0.7
max_tokens: 2048
seed: 42
judge_llm:
temperature: 1.0
max_tokens: 2048
seed: 42
task_verify:
temperature: 0.7
max_tokens: 2048
seed: 42
local_launch_cfg:
# Number of threads to use for local LLM
max_num_seqs: 1
# Type of GPU to use for local LLM
partition: "a40"
# QoS for local LLM
qos: "m2"
# Time limit for local LLM
time: "01:00:00"

subject_llm:
name: o1-mini
provider: openai
generation_cfg:
temperature: 0.7
max_tokens: 2048
seed: 42
local_launch_cfg:
# Type of GPU to use for local LLM
partition: "a100"
# Number of nodes to use for local LLM
num_nodes: 1
# Number of GPUs to use for local LLM
gpus_per_node: 4
# QoS for local LLM
qos: "deadline"
# Account for local LLM
account: "deadline"
# Time limit for local LLM
time: "10:00:00"
# vLLM args
vllm_args: "--max-model-len=8192,--max-num-seqs=50,--compilation-config=0,--tensor-parallel-size=4,--pipeline-parallel-size=1"
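The `vllm_args` value packs the vLLM server flags into one comma-separated string. A minimal sketch of how such a string could be split back into an argv-style list (the helper itself is an assumption for illustration, not part of this diff):

```python
def parse_vllm_args(vllm_args: str) -> list[str]:
    """Split a comma-separated flag string into an argv-style list.

    Hypothetical helper: the repository's actual parsing may differ.
    """
    # Drop empty fragments so a stray trailing comma does not yield "".
    return [arg.strip() for arg in vllm_args.split(",") if arg.strip()]


args = parse_vllm_args(
    "--max-model-len=8192,--max-num-seqs=50,--compilation-config=0,"
    "--tensor-parallel-size=4,--pipeline-parallel-size=1"
)
```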

prompt_cfg:
sys_msg: Complete the given task to the best of your ability.

# Stage control
stage: "all" # Which stage to run: 0, 1, 2, 3, 4, 5, or "all"

# Task verification configuration (Stage 5)
task_verification_cfg:
pass_threshold: 0.8 # Minimum pass rate to consider successful
strict_mode: false # If true, all alignment criteria must pass
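A minimal sketch of how `pass_threshold` and `strict_mode` could interact when deciding whether a task clears verification (names are illustrative; this mirrors the config comments, not the repository's actual verifier code):

```python
def task_passes(criteria: list[bool], pass_threshold: float = 0.8,
                strict_mode: bool = False) -> bool:
    """Decide a verification outcome from per-criterion results.

    Illustrative only: reflects the run_cfg.yaml semantics.
    """
    if strict_mode:
        # Strict mode: every alignment criterion must pass.
        return all(criteria)
    # Default mode: the pass rate must meet the threshold.
    return sum(criteria) / len(criteria) >= pass_threshold
```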
# Stage tags (for running individual stages or resuming)
areas_tag: null # Stage 1 output tag (required for stage 2 standalone)
capabilities_tag: null # Stage 2 output tag (required for stage 3 standalone)
tasks_tag: null # Stage 3 output tag (required for stage 4 standalone)
solution_tag: null # Stage 4 output tag (required for stage 5 standalone)
validation_tag: null # Stage 5 output tag (required for eval pipeline)
eval_tag: null # Eval Stage 1 output tag (required for eval stage 2, optional for stage 1 resume)
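The tag comments above imply a dependency check before a stage runs standalone. A hedged sketch of that validation (the function name, dict-based config, and error handling are assumptions, not the repo's actual code):

```python
def resolve_stage_inputs(cfg: dict) -> None:
    """Validate that a standalone stage has the tag it depends on.

    Illustrative check mirroring the tag comments in run_cfg.yaml.
    """
    required_tag = {
        2: "areas_tag",         # stage 2 consumes Stage 1 output
        3: "capabilities_tag",  # stage 3 consumes Stage 2 output
        4: "tasks_tag",         # stage 4 consumes Stage 3 output
        5: "solution_tag",      # stage 5 consumes Stage 4 output
    }
    stage = cfg["stage"]
    if stage == "all":
        return  # full pipeline: each stage feeds the next in-process
    tag_key = required_tag.get(stage)
    if tag_key and cfg.get(tag_key) is None:
        raise ValueError(f"stage {stage} run standalone requires {tag_key}")
```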

# Stage 1: Area generation
areas_cfg:
num_areas: 2

# Stage 2: Capability generation
capabilities_cfg:
capabilities_dir: ./ace-output/
results_dir: gs://ace-artifacts
inspect_evals_dir: /fs01/projects/aieng/public/ace/inspect_evals/src/ace_evals
num_seed_capabilities: 1
num_capabilities: 4
num_capabilities_buffer: 0.5 # Raised from 0.1 to compensate for filtering
num_gen_capabilities_per_run: 1 # Raised from 1 for more diversity per batch
num_gen_tasks_per_capability: 100
num_gen_tasks_buffer: 0.0
task_gen_few_shot: false
task_gen_prompt_version: "v1"
num_eval_tasks_per_capability: 2
capabilities_gen_retry_attempts: 5
tasks_gen_retry_attempts: 3
concurrency_task_solver: 2
concurrency_task_verifier: 2
concurrency_task_eval: 2
inspect_eval_log_level: "info"

lbo_cfg:
num_lbo_runs: 2
pipeline_id: "no_discovery"
train_frac: 0.5
num_initial_train: 2
acquisition_function: "variance"

# Embedding config (used for capability filtering in Stage 2)
embedding_cfg:
embedding_model: "text-embedding-3-small"
embedding_size: 256
filtering_similarity_threshold: 0.85 # Raised from 0.7 to keep more diverse capabilities

dimensionality_reduction_cfg:
reduce_dimensionality_method: "pca"
reduced_dimensionality_size: 2
no_discovery_reduced_dimensionality_method: "pca"
no_discovery_reduced_dimensionality_size: 2
embedding_model: text-embedding-3-small
embedding_size: 1536
filtering_similarity_threshold: 0.85
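The `filtering_similarity_threshold` suggests capabilities are deduplicated by embedding cosine similarity. A minimal sketch of that filtering step, assuming a greedy keep-first pass (the repository's exact procedure may differ):

```python
import math

def cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity between two embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

def filter_similar(embeddings: list[list[float]],
                   threshold: float = 0.85) -> list[int]:
    """Greedily keep indices not too similar to any already-kept embedding.

    Illustrative: the repo may use a different dedup strategy.
    """
    kept: list[int] = []
    for i, emb in enumerate(embeddings):
        if all(cosine(emb, embeddings[j]) < threshold for j in kept):
            kept.append(i)
    return kept
```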

# Stage 3: Task generation
task_generation_cfg:
tasks_per_blueprint: 1
min_subtopics: 1
max_subtopics: 1

# =============================================================================
# EVALUATION PIPELINE
# =============================================================================

# Debug settings
use_langchain: false # Set to false for easier debugging (disables LangChain features)
eval_cfg:
# LLMs to evaluate (required)
subject_llms:
- name: gpt-4o
provider: openai
- name: claude-3-sonnet
provider: anthropic

# Judge LLM for scoring (required)
judge_llm:
name: gpt-4o-mini
provider: openai

# =============================================================================
# HYDRA
# =============================================================================

defaults:
- _self_
18 changes: 18 additions & 0 deletions src/eval_stages/__init__.py
@@ -0,0 +1,18 @@
"""Evaluation pipeline stages.

Stage 0: Setup and Dataset Preparation (no LLM calls)
Stage 1: Evaluation Execution (runs subject LLMs, creates eval_tag)
Stage 2: Score Aggregation (no LLM calls)
"""

from src.eval_stages.stage0_setup_and_dataset import EvalSetupError, run_eval_stage0
from src.eval_stages.stage1_eval_execution import run_eval_stage1
from src.eval_stages.stage2_score_aggregation import run_eval_stage2


__all__ = [
"run_eval_stage0",
"run_eval_stage1",
"run_eval_stage2",
"EvalSetupError",
]
9 changes: 9 additions & 0 deletions src/eval_stages/prompts.py
@@ -0,0 +1,9 @@
"""Prompts for evaluation pipeline stages."""

# Default prompt template for Inspect AI evaluation
# Used in Stage 0 (Setup and Dataset Preparation) when creating EvalDataset
DEFAULT_EVAL_PROMPT_TEMPLATE = """You are an expert. Solve the following problem.

Problem: {input}

Provide your final answer."""
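The template's `{input}` placeholder is filled per dataset sample; a quick sketch of rendering it with `str.format` (the sample problem text is invented for illustration):

```python
DEFAULT_EVAL_PROMPT_TEMPLATE = """You are an expert. Solve the following problem.

Problem: {input}

Provide your final answer."""

# Render the template for one dataset sample (illustrative input).
prompt = DEFAULT_EVAL_PROMPT_TEMPLATE.format(
    input="What is the effective annual rate of 12% nominal, compounded monthly?"
)
```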