From 93b798e76040572f13e57bc8f207a95a0e24fe4d Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Wed, 24 Dec 2025 10:52:19 +0530 Subject: [PATCH] Add rich feedback mode to k_module_problem example Introduces a RICH_FEEDBACK=1 mode that provides detailed feedback on which modules are correct or incorrect, along with actionable hints. Updates the evaluator and iterative agent to support and display this feedback, and documents the new mode and its impact in the README. --- examples/k_module_problem/README.md | 19 ++++++++++++ examples/k_module_problem/evaluator.py | 32 ++++++++++++++++++-- examples/k_module_problem/iterative_agent.py | 27 ++++++++++++++++- 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/examples/k_module_problem/README.md b/examples/k_module_problem/README.md index dc995c279..bcd12ade4 100644 --- a/examples/k_module_problem/README.md +++ b/examples/k_module_problem/README.md @@ -166,6 +166,25 @@ This establishes the "no learning" baseline. Any method that beats this is demon **Key insight**: While OpenEvolve takes more iterations on average (52.3 vs 13), it has a **100% success rate** compared to iterative refinement's 33%. The evolutionary approach's population diversity ensures it eventually escapes local optima that trap single-trajectory methods. +### Rich Feedback Mode: Proving Attribution Matters + +To verify that feedback attribution is the key factor, we added a `RICH_FEEDBACK=1` mode that tells the agent exactly which modules are correct/incorrect: + +```bash +RICH_FEEDBACK=1 python run_iterative_trials.py --trials 3 --iterations 100 +``` + +| Method | Success Rate | Avg Iterations | +|--------|-------------|----------------| +| **Iterative (count-only feedback)** | 33% | 13 (when found) | +| **Iterative (rich feedback)** | **100%** | **3** | + +With rich feedback, iterative refinement achieves **100% success rate in only 3 iterations** - dramatically faster than OpenEvolve's 52 iterations! This proves that: + +1. 
**Feedback attribution is the key factor**, not the optimization method +2. When feedback is attributable, iterative refinement is highly effective +3. Evolution is necessary when feedback is NOT attributable (you can't tell which component is wrong) + ## Why This Matters This example illustrates when you should prefer evolutionary approaches: diff --git a/examples/k_module_problem/evaluator.py b/examples/k_module_problem/evaluator.py index 6d60e34d0..ff6fbdcd1 100644 --- a/examples/k_module_problem/evaluator.py +++ b/examples/k_module_problem/evaluator.py @@ -9,13 +9,21 @@ This creates a challenging landscape for iterative refinement but allows evolutionary crossover to combine good "building blocks" from different individuals. + +Set RICH_FEEDBACK=1 to enable rich feedback mode, which tells you +exactly which modules are correct/incorrect. This demonstrates that +iterative refinement works well when feedback is attributable. """ +import os import sys import time import traceback import importlib.util +# Rich feedback mode - when enabled, reveals which modules are correct +RICH_FEEDBACK = os.environ.get("RICH_FEEDBACK", "0") == "1" + # The correct solution (hidden from the optimizer) # This represents the "optimal" pipeline configuration discovered through # extensive testing/domain expertise @@ -141,14 +149,34 @@ def score_config(config: dict) -> tuple: def build_artifacts(config: dict, correct_count: int, module_results: dict, eval_time: float) -> dict: """ - Build artifacts that provide useful feedback without revealing - exactly which modules are correct. + Build artifacts that provide useful feedback. + + In normal mode: Only reveals how many modules are correct, not which ones. + In rich feedback mode (RICH_FEEDBACK=1): Reveals exactly which modules are correct/incorrect. 
""" artifacts = {} # Configuration summary artifacts["configuration"] = str(config) + # Rich feedback mode - reveals which modules are correct/incorrect + if RICH_FEEDBACK: + correct_modules = [m for m, is_correct in module_results.items() if is_correct] + incorrect_modules = [m for m, is_correct in module_results.items() if not is_correct] + + artifacts["module_feedback"] = { + "correct": correct_modules, + "incorrect": incorrect_modules, + } + + if incorrect_modules: + hints = [] + for module in incorrect_modules: + hints.append(f"'{module}' is WRONG - try a different option from {VALID_OPTIONS[module]}") + artifacts["actionable_hints"] = hints + else: + artifacts["actionable_hints"] = ["All modules are correct!"] + # Score feedback - tells you how many are correct, but not which ones if correct_count == NUM_MODULES: artifacts["status"] = "PERFECT! All modules correctly configured!" diff --git a/examples/k_module_problem/iterative_agent.py b/examples/k_module_problem/iterative_agent.py index d53fdb72a..68da152db 100644 --- a/examples/k_module_problem/iterative_agent.py +++ b/examples/k_module_problem/iterative_agent.py @@ -64,6 +64,26 @@ def write_program(program_path: str, code: str) -> None: f.write(code) +def format_rich_feedback(artifacts: dict) -> str: + """Format rich feedback if available (RICH_FEEDBACK=1).""" + if "module_feedback" not in artifacts: + return "" + + feedback = artifacts["module_feedback"] + hints = artifacts.get("actionable_hints", []) + + result = "\n## DETAILED MODULE FEEDBACK (Rich Feedback Mode)\n" + result += f"- CORRECT modules: {feedback.get('correct', [])}\n" + result += f"- INCORRECT modules: {feedback.get('incorrect', [])}\n" + + if hints: + result += "\n### Actionable Hints:\n" + for hint in hints: + result += f"- {hint}\n" + + return result + + def create_improvement_prompt( current_code: str, metrics: dict, @@ -108,6 +128,7 @@ def create_improvement_prompt( - Score: {metrics.get('combined_score', 0):.2%} - Status: 
{artifacts.get('status', 'N/A')} - Suggestion: {artifacts.get('suggestion', 'N/A')} +{format_rich_feedback(artifacts)} {history_str} ## Your Task @@ -205,7 +226,11 @@ def run_iterative_refinement( # Evaluate current program eval_result = evaluate(str(current_program_path)) - metrics = eval_result.get("metrics", {}) + # Handle both flat (success) and nested (error) return formats + if "metrics" in eval_result: + metrics = eval_result["metrics"] + else: + metrics = {k: v for k, v in eval_result.items() if k != "artifacts"} artifacts = eval_result.get("artifacts", {}) score = metrics.get("combined_score", 0)