From 886c1d627bdc99d03374878cb58ce10ef6b96bf3 Mon Sep 17 00:00:00 2001 From: Julian Date: Tue, 3 Jun 2025 05:14:53 +0200 Subject: [PATCH 1/6] Default system prompts in config.py refer to the default template name in templates.py --- openevolve/config.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/openevolve/config.py b/openevolve/config.py index c742ef945..1f42f6132 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -43,9 +43,7 @@ class LLMConfig(LLMModelConfig): name: str = "gpt-4o" # Generation parameters - system_message: Optional[str] = ( - "You are an expert coder helping to improve programs through evolution." - ) + system_message: Optional[str] = "system_message" temperature: float = 0.7 top_p: float = 0.95 max_tokens: int = 4096 @@ -116,8 +114,8 @@ class PromptConfig: """Configuration for prompt generation""" template_dir: Optional[str] = None - system_message: str = "You are an expert coder helping to improve programs through evolution." - evaluator_system_message: str = """You are an expert code reviewer.""" + system_message: str = "system_message" + evaluator_system_message: str = "evaluator_system_message" # Number of examples to include in the prompt num_top_programs: int = 3 From 1e8961475473f6826ad6ca763b92c109a881cf71 Mon Sep 17 00:00:00 2001 From: Julian Date: Tue, 3 Jun 2025 05:16:53 +0200 Subject: [PATCH 2/6] parse_full_rewrite() falls back to direct pass-through of LLM response, when it does not find code blocks --- openevolve/utils/code_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openevolve/utils/code_utils.py b/openevolve/utils/code_utils.py index 2d97ef4ea..60fb63001 100644 --- a/openevolve/utils/code_utils.py +++ b/openevolve/utils/code_utils.py @@ -109,7 +109,8 @@ def parse_full_rewrite(llm_response: str, language: str = "python") -> Optional[ if matches: return matches[0].strip() - return None + # Fallback to plain text + return llm_response def format_diff_summary(diff_blocks: List[Tuple[str, str]]) -> str: From dabcc74b2e661934e72b09300b1063bc743b48aa Mon Sep 17 00:00:00 2001 From: Julian Date: Tue, 3 Jun 2025 05:23:15 +0200 Subject: [PATCH 3/6] Integration of EleutherAI's lm-evaluation-harness --- .gitignore | 6 +- openevolve/config.py | 4 +- scripts/README.md | 0 scripts/lm_eval/README.md | 78 +++++++ scripts/lm_eval/config.yml | 48 +++++ scripts/lm_eval/evaluator_stub.py | 5 + scripts/lm_eval/initial_content_stub.txt | 1 + scripts/lm_eval/lm-eval.py | 200 ++++++++++++++++++ scripts/lm_eval/prompts/diff_user.txt | 34 +++ scripts/lm_eval/prompts/evaluation.txt | 24 +++ scripts/lm_eval/prompts/evolution_history.txt | 7 + scripts/lm_eval/prompts/full_rewrite_user.txt | 17 ++ scripts/lm_eval/prompts/previous_attempt.txt | 4 + scripts/lm_eval/prompts/top_program.txt | 5 + scripts/lm_eval/requirements.txt | 2 + tests/test_valid_configs.py | 6 +- 16 files changed, 438 insertions(+), 3 deletions(-) create mode 100644 scripts/README.md create mode 100644 scripts/lm_eval/README.md create mode 100644 scripts/lm_eval/config.yml create mode 100644 scripts/lm_eval/evaluator_stub.py create mode 100644 scripts/lm_eval/initial_content_stub.txt create mode 100644 scripts/lm_eval/lm-eval.py create mode 100644 scripts/lm_eval/prompts/diff_user.txt create mode 100644 scripts/lm_eval/prompts/evaluation.txt create mode 100644 scripts/lm_eval/prompts/evolution_history.txt create mode 100644 scripts/lm_eval/prompts/full_rewrite_user.txt create mode 100644 scripts/lm_eval/prompts/previous_attempt.txt create 
mode 100644 scripts/lm_eval/prompts/top_program.txt create mode 100644 scripts/lm_eval/requirements.txt diff --git a/.gitignore b/.gitignore index 341dda1ce..27e77c7b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +results/ +scripts/lm_eval/prompts/system_message.txt +scripts/lm_eval/prompts/evaluator_system_message.txt + # Python __pycache__/ *.py[cod] @@ -48,4 +52,4 @@ htmlcov/ # For SR secrets.yaml -problems \ No newline at end of file +problems diff --git a/openevolve/config.py b/openevolve/config.py index 1f42f6132..1a1a19338 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -232,7 +232,9 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config": if "models" in llm_dict: llm_dict["models"] = [LLMModelConfig(**m) for m in llm_dict["models"]] if "evaluator_models" in llm_dict: - llm_dict["evaluator_models"] = [LLMModelConfig(**m) for m in llm_dict["evaluator_models"]] + llm_dict["evaluator_models"] = [ + LLMModelConfig(**m) for m in llm_dict["evaluator_models"] + ] config.llm = LLMConfig(**llm_dict) if "prompt" in config_dict: config.prompt = PromptConfig(**config_dict["prompt"]) diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/lm_eval/README.md b/scripts/lm_eval/README.md new file mode 100644 index 000000000..934320b48 --- /dev/null +++ b/scripts/lm_eval/README.md @@ -0,0 +1,78 @@ +# lm-eval.py + +`lm-eval.py` provides basic benchmark capability for LLM feedback-based evolutionary task solving. The benchmark framework is [EleutherAI's lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). + +*Limitation:* Only generation-only tasks such as gsm8k are supported. This is because tasks that require loglikelihood probabilities are not well applicable to agents. + +## Usage + +```bash +$ python3 scripts/lm_eval/lm-eval.py -h +usage: lm-eval.py [-h] [--config CONFIG] [--init_file INIT_FILE] [--evaluator_file EVALUATOR_FILE] [--iterations ITERATIONS] [--limit LIMIT] [--tasks TASKS] + [--output_path OUTPUT_PATH] + +OpenEvolve <-> lm-evaluation-harness adapter. + +options: + -h, --help show this help message and exit + --config CONFIG config file + --init_file INIT_FILE + initial content file + --evaluator_file EVALUATOR_FILE + evaluator file + --iterations ITERATIONS + number of iterations + --limit LIMIT limit the number of examples per task that are executed + --tasks TASKS list of tasks to evaluate + --output_path OUTPUT_PATH + output path for results +``` + +Early examples that **were meant to** indicate that more evolution iterations improve task performance -- I suspect the prompting may not be ideal yet: +``` +$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 1 +[..] +Headline metrics: + gsm8k exact_match,strict-match 80.000% +[..] + + +$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3 +[..] +Headline metrics: + gsm8k exact_match,strict-match 90.000% +[..] + +$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10 +[..] +Headline metrics: + gsm8k exact_match,strict-match 80.000% +[..] + +$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15 +[..] +Headline metrics: + gsm8k exact_match,strict-match 70.000% +[..] +``` + +## Warning + +- Be aware that this is an early implementation. No extensive benchmarks have been executed so far. With a limit to 10 tasks and 10 iterations, the benchmark is meaningless as is. 
+- Use the --limit parameter only for tests, not for metric generation. +- Do not cite the metrics that result from the script execution blindly without reviewing the solution first. + +## References + +```bibtex +@misc{eval-harness, + author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, + title = {The Language Model Evaluation Harness}, + month = 07, + year = 2024, + publisher = {Zenodo}, + version = {v0.4.3}, + doi = {10.5281/zenodo.12608602}, + url = {https://zenodo.org/records/12608602} +} +``` \ No newline at end of file diff --git a/scripts/lm_eval/config.yml b/scripts/lm_eval/config.yml new file mode 100644 index 000000000..269b0a726 --- /dev/null +++ b/scripts/lm_eval/config.yml @@ -0,0 +1,48 @@ +max_iterations: 1 +checkpoint_interval: 10 +log_level: "INFO" + +# LLM configuration +llm: + primary_model: "gemma3:12b-it-qat" + #primary_model: "gpt-4o" + primary_model_weight: 0.8 + secondary_model: "gemma3:12b-it-qat" + #secondary_model: "gpt-4.1" + secondary_model_weight: 0.2 + # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/" + # api_base: "https://api.openai.com/v1/" + api_base: "http://localhost:11434/v1/" + api_key: "ollama" + temperature: 0.7 + top_p: 0.95 + max_tokens: 4096 + +# Prompt configuration +prompt: + num_top_programs: 3 + use_template_stochasticity: true + # System prompt is created dynamically during the benchmark in file system_message.txt! + template_dir: "scripts/lm_eval/prompts" + +# Database configuration +database: + population_size: 50 + archive_size: 20 + num_islands: 3 + elite_selection_ratio: 0.2 + exploitation_ratio: 0.7 + +# Evaluator configuration +evaluator: + timeout: 60 + cascade_evaluation: false + cascade_thresholds: [0.5, 0.75] + parallel_evaluations: 4 + use_llm_feedback: true + llm_feedback_weight: 1.0 + + +# Evolution settings +diff_based_evolution: false +allow_full_rewrites: true diff --git a/scripts/lm_eval/evaluator_stub.py b/scripts/lm_eval/evaluator_stub.py new file mode 100644 index 000000000..2ad8c2bdc --- /dev/null +++ b/scripts/lm_eval/evaluator_stub.py @@ -0,0 +1,5 @@ +def evaluate_stage1(file_path): + return {"not_implemented": 0.0} + +def evaluate(file_path): + return evaluate_stage1(file_path) diff --git a/scripts/lm_eval/initial_content_stub.txt b/scripts/lm_eval/initial_content_stub.txt new file mode 100644 index 000000000..8dff9fe7f --- /dev/null +++ b/scripts/lm_eval/initial_content_stub.txt @@ -0,0 +1 @@ +insert the answer to the task here! \ No newline at end of file diff --git a/scripts/lm_eval/lm-eval.py b/scripts/lm_eval/lm-eval.py new file mode 100644 index 000000000..86b6627b5 --- /dev/null +++ b/scripts/lm_eval/lm-eval.py @@ -0,0 +1,200 @@ +""" +OpenEvolve <-> lm-evaluation-harness adapter + +Implements generation only, no loglikelihood. Tasks such as GSM8K / BoolQ / MMLU-Math / +AQUA-RAT and most code suites should work fine because they grade on the generated +answer string. 
+""" + +from __future__ import annotations +import subprocess, tempfile, json, os, argparse, math, pathlib +from pathlib import Path +from typing import List, Dict, Tuple, Any, Iterable + +import lm_eval +from lm_eval.tasks import TaskManager +from lm_eval.evaluator import evaluate +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model +from datetime import datetime + +# cd to the parent parent directory of this file +os.chdir(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +PIPELINE_CMD = ["python3", "openevolve-run.py"] + +@register_model("openevolve") +class OpenEvolve(LM): + def __init__( + self, + init_file: str = "initial_content_stub.txt", + evaluator_file: str = "evaluator_stub.py", + config_file: str = "config.yml", + iterations: int = 5, + extra_param: List[str] = [], + **kwargs, + ): + super().__init__() + self.init_file = init_file + self.evaluator_file = evaluator_file + self.iterations = iterations + self.extra_param = extra_param + self.config_file = config_file + + # folder must match prompt:template_dir in config.yml! + self.prompt_path = "scripts/lm_eval/prompts/system_message.txt" + self.evaluator_prompt_path = "scripts/lm_eval/prompts/evaluator_system_message.txt" + self.best_path = "scripts/lm_eval/openevolve_output/best/best_program.txt" + self.base_system_message = "You are an expert task solver, with a lot of commonsense, math, language and coding knowledge.\n\nConsider this task:\n```{prompt}´´´" + + def generate(self, prompts: List[str], max_gen_toks: int = None, stop=None, **kwargs): + outs = [] + for prompt in prompts: + # Task prompt becomes the system message. User prompt is the evolutionary logic. + # We create temporary prompt files with the system message + with Path(self.prompt_path).open("w") as f: + f.write(self.base_system_message.format(prompt=prompt)) + + with Path(self.evaluator_prompt_path).open("w") as f: + f.write(self.base_system_message.format(prompt=prompt)) + + cmd = ( + PIPELINE_CMD + + ["--config", self.config_file] + + ["--iterations", str(self.iterations)] + + self.extra_param + + [self.init_file, self.evaluator_file] + ) + print(f"Running command: {' '.join(cmd)}") + try: + res = subprocess.run(cmd, capture_output=True, text=True, check=True) + text = res.stdout.strip() + print(f"Process output: {text}") + except subprocess.CalledProcessError as e: + print(f"Command failed with return code {e.returncode}") + print(f"stderr: {e.stderr}") + text = "" + + print(f"# Prompt: {prompt}") + with Path(self.best_path).open("r") as f: + best = f.read().strip() + print(f"# Answer: {best}") + + # honour stop tokens + if stop: + for s in stop: + idx = best.find(s) + if idx != -1: + best = best[:idx] + break + outs.append(best) + return outs + + # for tasks that ask for log likelihood, indicate that it is unsupported + def loglikelihood(self, requests: Iterable[Tuple[str, str]], **kw): + # return [(-math.inf, False) for _ in requests] + raise NotImplementedError + + def loglikelihood_rolling(self, requests: Iterable[str], **kw): + # return [(-math.inf, False) for _ in requests] + raise NotImplementedError + + def generate_until(self, requests: Iterable[Any], **kw) -> List[str]: + ctxs, stops = [], [] + + for req in requests: + # ---------------- old: plain tuple ---------------- + if isinstance(req, tuple): + ctx, until = req + + # -------------- new: Instance object -------------- + else: + ctx = req.args[0] # first positional arg + until = [] + # if a second positional arg exists and is 
list-like, + # treat it as the stop sequence + if len(req.args) > 1 and isinstance(req.args[1], (list, tuple)): + until = list(req.args[1]) + + ctxs.append(ctx) + stops.append(until) + + # 2) run your real generator once per context + gens = self.generate(ctxs, stop=None) + + # 3) post-trim at the first stop sequence + cleaned = [] + for g, until in zip(gens, stops): + for s in until: + idx = g.find(s) + if idx != -1: + g = g[:idx] + break + cleaned.append(g) + return cleaned + +if __name__ == "__main__": + # cli arguments for primary model, secondary model, iterations, config and tasks + p = argparse.ArgumentParser( + description="OpenEvolve <-> lm-evaluation-harness adapter.", + ) + p.add_argument("--config", default="scripts/lm_eval/config.yml", help="config file") + p.add_argument( + "--init_file", + default="scripts/lm_eval/initial_content_stub.txt", + help="initial content file", + ) + p.add_argument( + "--evaluator_file", default="scripts/lm_eval/evaluator_stub.py", help="evaluator file" + ) + p.add_argument("--iterations", default=5, type=int, help="number of iterations") + p.add_argument("--limit", default=None, type=int, help="limit the number of examples per task that are executed") + # p.add_argument("--tasks", default="boolq,gsm8k,mmlu", help="comma-list of tasks to evaluate") + p.add_argument("--tasks", default="gsm8k", help="list of tasks to evaluate") + p.add_argument("--output_path", default="results", help="output path for results") + args = p.parse_args() + + lm_obj = OpenEvolve( + init_file=args.init_file, + evaluator_file=args.evaluator_file, + iterations=args.iterations, + config_file=args.config, + ) + + task_dict = lm_eval.tasks.get_task_dict(args.tasks.split(",")) + + results = evaluate( + lm=lm_obj, + task_dict=task_dict, + limit=args.limit, + ) + + # write out the results + pathlib.Path( + args.output_path, + ).mkdir(exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + results_path = pathlib.Path(os.path.join( + args.output_path, + f"{timestamp}_iter{args.iterations}.json", + )) + + with results_path.open("w") as f: + json.dump(results, f, indent=2) + + # print result summary + short = {} + for task, metrics in results["results"].items(): + # pick the first value that is a real number + for key, val in metrics.items(): + if isinstance(val, (int, float)): + short[task] = (key, val) # store *both* name & value + break + + print(f"Full results written to {results_path}\n") + print("Headline metrics:") + for task, (name, value) in short.items(): + print(f" {task:<15} {name:<12} {value:.3%}") + + print("\nNote: Never cite the overall average when some components were skipped!") diff --git a/scripts/lm_eval/prompts/diff_user.txt b/scripts/lm_eval/prompts/diff_user.txt new file mode 100644 index 000000000..dbda45a78 --- /dev/null +++ b/scripts/lm_eval/prompts/diff_user.txt @@ -0,0 +1,34 @@ +# Current Solution Information +- Current performance metrics: {metrics} +- Areas identified for improvement: {improvement_areas} + +# Evolution History +{evolution_history} + +# Current Solution +``` +{current_program} +``` + +# Task +Suggest improvements to the answer that will lead to better performance on the specified metrics. 
+ +You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes: + +<<<<<<< SEARCH +# Original text to find and replace (must match exactly) +======= +# New replacement text +>>>>>>> REPLACE + +Example of valid diff format: +<<<<<<< SEARCH +poem stub +======= +Tyger Tyger, burning bright, In the forests of the night; What immortal hand or eye +>>>>>>> REPLACE + +You can suggest multiple changes. Each SEARCH section must exactly match text in the current solution. +Be thoughtful about your changes and explain your reasoning thoroughly. + +IMPORTANT: Do not necessarily rewrite the entire solution - focus on targeted improvements. \ No newline at end of file diff --git a/scripts/lm_eval/prompts/evaluation.txt b/scripts/lm_eval/prompts/evaluation.txt new file mode 100644 index 000000000..e1f3ef05e --- /dev/null +++ b/scripts/lm_eval/prompts/evaluation.txt @@ -0,0 +1,24 @@ +Evaluate the following answer on a scale of 0.0 to 1.0 for the following metrics: +1. Correctness: Is the answer factually correct? +2. Task understanding: Did it capture the intent of the task well? +3. Syntax: Is its syntax flawless? + +For each metric, provide a score between 0.0 and 1.0, where 1.0 is best. + +Task: +``` + +``` + +Answer to evaluate: +``` +{current_program} +``` + +Return your evaluation as a JSON object with the following format: +{{ + "correctness": [score], + "understanding": [score], + "syntax": [score], +}} +Even for invalid input, return nothing but the JSON object. \ No newline at end of file diff --git a/scripts/lm_eval/prompts/evolution_history.txt b/scripts/lm_eval/prompts/evolution_history.txt new file mode 100644 index 000000000..a9ff965a8 --- /dev/null +++ b/scripts/lm_eval/prompts/evolution_history.txt @@ -0,0 +1,7 @@ +## Previous Attempts + +{previous_attempts} + +## Top Performing Solution + +{top_programs} \ No newline at end of file diff --git a/scripts/lm_eval/prompts/full_rewrite_user.txt b/scripts/lm_eval/prompts/full_rewrite_user.txt new file mode 100644 index 000000000..ddaee85df --- /dev/null +++ b/scripts/lm_eval/prompts/full_rewrite_user.txt @@ -0,0 +1,17 @@ +# Current Solution Information +- Current metrics: {metrics} +- Areas identified for improvement: {improvement_areas} + +# Evolution History +{evolution_history} + +# Current Solution +``` +{current_program} +``` + +# Task +Rewrite the answer to improve its performance on the specified metrics. +Provide the complete new answer. 
+ +# Your rewritten answer here diff --git a/scripts/lm_eval/prompts/previous_attempt.txt b/scripts/lm_eval/prompts/previous_attempt.txt new file mode 100644 index 000000000..92d56264c --- /dev/null +++ b/scripts/lm_eval/prompts/previous_attempt.txt @@ -0,0 +1,4 @@ +### Attempt {attempt_number} +- Changes: {changes} +- Performance: {performance} +- Outcome: {outcome} \ No newline at end of file diff --git a/scripts/lm_eval/prompts/top_program.txt b/scripts/lm_eval/prompts/top_program.txt new file mode 100644 index 000000000..9c245363d --- /dev/null +++ b/scripts/lm_eval/prompts/top_program.txt @@ -0,0 +1,5 @@ +### Solution {program_number} (Score: {score}) +``` +{program_snippet} +``` +Key features: {key_features} \ No newline at end of file diff --git a/scripts/lm_eval/requirements.txt b/scripts/lm_eval/requirements.txt new file mode 100644 index 000000000..270391918 --- /dev/null +++ b/scripts/lm_eval/requirements.txt @@ -0,0 +1,2 @@ +datasets +lm-eval \ No newline at end of file diff --git a/tests/test_valid_configs.py b/tests/test_valid_configs.py index c34a3a373..829d23b42 100644 --- a/tests/test_valid_configs.py +++ b/tests/test_valid_configs.py @@ -8,6 +8,7 @@ from openevolve.config import Config, load_config + class TestConfigValidity(unittest.TestCase): """Tests that all config files in the configs/ and examples/ directories are valid""" @@ -28,7 +29,10 @@ def test_import_config_files(self): for config_file in config_files: print(f"Testing config file: {config_file}") config = load_config(config_file) - self.assertIsInstance(config, Config, f"Config file {config_file} did not load correctly") + self.assertIsInstance( + config, Config, f"Config file {config_file} did not load correctly" + ) + if __name__ == "__main__": unittest.main() From cc9b6dbcbf65a9479f06a6dc9c51946d492d9ebb Mon Sep 17 00:00:00 2001 From: Julian Date: Tue, 3 Jun 2025 05:30:19 +0200 Subject: [PATCH 4/6] Tiny docker fix and removed debug output --- Makefile | 7 ++++++- README.md | 2 +- openevolve/evaluator.py | 1 - 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index a58b140db..172611dd4 100644 --- a/Makefile +++ b/Makefile @@ -48,4 +48,9 @@ docker-build: # Run the Docker container with the example .PHONY: docker-run docker-run: - docker run --rm -v $(PROJECT_DIR):/app $(DOCKER_IMAGE) examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000 \ No newline at end of file + docker run --rm -v $(PROJECT_DIR):/app --network="host" $(DOCKER_IMAGE) examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000 + +# Run the lm-eval benchmark +.PHONY: lm-eval +lm-eval: + $(PYTHON) scripts/lm_eval/lm-eval.py diff --git a/README.md b/README.md index 5c0ca1487..db8db4612 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ cat checkpoints/checkpoint_*/best_program_info.json | grep -A 10 metrics You can also install and execute via Docker: ```bash docker build -t openevolve . 
-docker run --rm -v $(pwd):/app openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000 +docker run --rm -v $(pwd):/app --network="host" openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000 ``` ## Configuration diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index 7c093667d..c7ba5e31c 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -343,7 +343,6 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]: except Exception as e: logger.warning(f"Error parsing LLM response: {str(e)}") - traceback.print_exc() return {} except Exception as e: From 1dc29835f310b3cdb18e99b14a968eadb5e73c4b Mon Sep 17 00:00:00 2001 From: Julian Date: Tue, 3 Jun 2025 06:24:12 +0200 Subject: [PATCH 5/6] Moved scripts/lm_eval/ to examples/lm_eval/ --- .gitignore | 4 +-- {scripts => examples}/lm_eval/README.md | 10 +++--- {scripts => examples}/lm_eval/config.yml | 2 +- .../lm_eval/evaluator_stub.py | 1 + .../lm_eval/initial_content_stub.txt | 0 {scripts => examples}/lm_eval/lm-eval.py | 33 ++++++++++++------- .../lm_eval/prompts/diff_user.txt | 0 .../lm_eval/prompts/evaluation.txt | 0 .../lm_eval/prompts/evolution_history.txt | 0 .../lm_eval/prompts/full_rewrite_user.txt | 0 .../lm_eval/prompts/previous_attempt.txt | 0 .../lm_eval/prompts/top_program.txt | 0 .../lm_eval/requirements.txt | 0 13 files changed, 30 insertions(+), 20 deletions(-) rename {scripts => examples}/lm_eval/README.md (88%) rename {scripts => examples}/lm_eval/config.yml (96%) rename {scripts => examples}/lm_eval/evaluator_stub.py (99%) rename {scripts => examples}/lm_eval/initial_content_stub.txt (100%) rename {scripts => examples}/lm_eval/lm-eval.py (88%) rename {scripts => examples}/lm_eval/prompts/diff_user.txt (100%) rename {scripts => examples}/lm_eval/prompts/evaluation.txt (100%) rename {scripts => examples}/lm_eval/prompts/evolution_history.txt (100%) rename {scripts => examples}/lm_eval/prompts/full_rewrite_user.txt (100%) rename {scripts => examples}/lm_eval/prompts/previous_attempt.txt (100%) rename {scripts => examples}/lm_eval/prompts/top_program.txt (100%) rename {scripts => examples}/lm_eval/requirements.txt (100%) diff --git a/.gitignore b/.gitignore index 27e77c7b9..c4c3f3dd7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ results/ -scripts/lm_eval/prompts/system_message.txt -scripts/lm_eval/prompts/evaluator_system_message.txt +examples/lm_eval/prompts/system_message.txt +examples/lm_eval/prompts/evaluator_system_message.txt # Python __pycache__/ diff --git a/scripts/lm_eval/README.md b/examples/lm_eval/README.md similarity index 88% rename from scripts/lm_eval/README.md rename to examples/lm_eval/README.md index 934320b48..1891c4438 100644 --- a/scripts/lm_eval/README.md +++ b/examples/lm_eval/README.md @@ -7,7 +7,7 @@ ## Usage ```bash -$ python3 scripts/lm_eval/lm-eval.py -h +$ python3 examples/lm_eval/lm-eval.py -h usage: lm-eval.py [-h] [--config CONFIG] [--init_file INIT_FILE] [--evaluator_file EVALUATOR_FILE] [--iterations ITERATIONS] [--limit LIMIT] [--tasks TASKS] [--output_path OUTPUT_PATH] @@ -30,26 +30,26 @@ options: Early examples that **were meant to** indicate that more evolution iterations improve task performance -- I suspect the prompting may not be ideal yet: ``` -$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k 
--limit 10 --iterations 1 +$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 1 [..] Headline metrics: gsm8k exact_match,strict-match 80.000% [..] -$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3 +$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3 [..] Headline metrics: gsm8k exact_match,strict-match 90.000% [..] -$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10 +$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10 [..] Headline metrics: gsm8k exact_match,strict-match 80.000% [..] -$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15 +$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15 [..] Headline metrics: gsm8k exact_match,strict-match 70.000% diff --git a/scripts/lm_eval/config.yml b/examples/lm_eval/config.yml similarity index 96% rename from scripts/lm_eval/config.yml rename to examples/lm_eval/config.yml index 269b0a726..ca02da7e1 100644 --- a/scripts/lm_eval/config.yml +++ b/examples/lm_eval/config.yml @@ -23,7 +23,7 @@ prompt: num_top_programs: 3 use_template_stochasticity: true # System prompt is created dynamically during the benchmark in file system_message.txt! - template_dir: "scripts/lm_eval/prompts" + template_dir: "examples/lm_eval/prompts" # Database configuration database: diff --git a/scripts/lm_eval/evaluator_stub.py b/examples/lm_eval/evaluator_stub.py similarity index 99% rename from scripts/lm_eval/evaluator_stub.py rename to examples/lm_eval/evaluator_stub.py index 2ad8c2bdc..ef86af025 100644 --- a/scripts/lm_eval/evaluator_stub.py +++ b/examples/lm_eval/evaluator_stub.py @@ -1,5 +1,6 @@ def evaluate_stage1(file_path): return {"not_implemented": 0.0} + def evaluate(file_path): return evaluate_stage1(file_path) diff --git a/scripts/lm_eval/initial_content_stub.txt b/examples/lm_eval/initial_content_stub.txt similarity index 100% rename from scripts/lm_eval/initial_content_stub.txt rename to examples/lm_eval/initial_content_stub.txt diff --git a/scripts/lm_eval/lm-eval.py b/examples/lm_eval/lm-eval.py similarity index 88% rename from scripts/lm_eval/lm-eval.py rename to examples/lm_eval/lm-eval.py index 86b6627b5..4f241deb0 100644 --- a/scripts/lm_eval/lm-eval.py +++ b/examples/lm_eval/lm-eval.py @@ -23,6 +23,7 @@ PIPELINE_CMD = ["python3", "openevolve-run.py"] + @register_model("openevolve") class OpenEvolve(LM): def __init__( @@ -42,9 +43,9 @@ def __init__( self.config_file = config_file # folder must match prompt:template_dir in config.yml! 
- self.prompt_path = "scripts/lm_eval/prompts/system_message.txt" - self.evaluator_prompt_path = "scripts/lm_eval/prompts/evaluator_system_message.txt" - self.best_path = "scripts/lm_eval/openevolve_output/best/best_program.txt" + self.prompt_path = "examples/lm_eval/prompts/system_message.txt" + self.evaluator_prompt_path = "examples/lm_eval/prompts/evaluator_system_message.txt" + self.best_path = "examples/lm_eval/openevolve_output/best/best_program.txt" self.base_system_message = "You are an expert task solver, with a lot of commonsense, math, language and coding knowledge.\n\nConsider this task:\n```{prompt}´´´" def generate(self, prompts: List[str], max_gen_toks: int = None, stop=None, **kwargs): @@ -133,22 +134,28 @@ def generate_until(self, requests: Iterable[Any], **kw) -> List[str]: cleaned.append(g) return cleaned + if __name__ == "__main__": # cli arguments for primary model, secondary model, iterations, config and tasks p = argparse.ArgumentParser( description="OpenEvolve <-> lm-evaluation-harness adapter.", ) - p.add_argument("--config", default="scripts/lm_eval/config.yml", help="config file") + p.add_argument("--config", default="examples/lm_eval/config.yml", help="config file") p.add_argument( "--init_file", - default="scripts/lm_eval/initial_content_stub.txt", + default="examples/lm_eval/initial_content_stub.txt", help="initial content file", ) p.add_argument( - "--evaluator_file", default="scripts/lm_eval/evaluator_stub.py", help="evaluator file" + "--evaluator_file", default="examples/lm_eval/evaluator_stub.py", help="evaluator file" ) p.add_argument("--iterations", default=5, type=int, help="number of iterations") - p.add_argument("--limit", default=None, type=int, help="limit the number of examples per task that are executed") + p.add_argument( + "--limit", + default=None, + type=int, + help="limit the number of examples per task that are executed", + ) # p.add_argument("--tasks", default="boolq,gsm8k,mmlu", help="comma-list of tasks to evaluate") p.add_argument("--tasks", default="gsm8k", help="list of tasks to evaluate") p.add_argument("--output_path", default="results", help="output path for results") @@ -175,10 +182,12 @@ def generate_until(self, requests: Iterable[Any], **kw) -> List[str]: ).mkdir(exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - results_path = pathlib.Path(os.path.join( - args.output_path, - f"{timestamp}_iter{args.iterations}.json", - )) + results_path = pathlib.Path( + os.path.join( + args.output_path, + f"{timestamp}_iter{args.iterations}.json", + ) + ) with results_path.open("w") as f: json.dump(results, f, indent=2) @@ -189,7 +198,7 @@ def generate_until(self, requests: Iterable[Any], **kw) -> List[str]: # pick the first value that is a real number for key, val in metrics.items(): if isinstance(val, (int, float)): - short[task] = (key, val) # store *both* name & value + short[task] = (key, val) # store *both* name & value break print(f"Full results written to {results_path}\n") diff --git a/scripts/lm_eval/prompts/diff_user.txt b/examples/lm_eval/prompts/diff_user.txt similarity index 100% rename from scripts/lm_eval/prompts/diff_user.txt rename to examples/lm_eval/prompts/diff_user.txt diff --git a/scripts/lm_eval/prompts/evaluation.txt b/examples/lm_eval/prompts/evaluation.txt similarity index 100% rename from scripts/lm_eval/prompts/evaluation.txt rename to examples/lm_eval/prompts/evaluation.txt diff --git a/scripts/lm_eval/prompts/evolution_history.txt b/examples/lm_eval/prompts/evolution_history.txt similarity index 
100% rename from scripts/lm_eval/prompts/evolution_history.txt rename to examples/lm_eval/prompts/evolution_history.txt diff --git a/scripts/lm_eval/prompts/full_rewrite_user.txt b/examples/lm_eval/prompts/full_rewrite_user.txt similarity index 100% rename from scripts/lm_eval/prompts/full_rewrite_user.txt rename to examples/lm_eval/prompts/full_rewrite_user.txt diff --git a/scripts/lm_eval/prompts/previous_attempt.txt b/examples/lm_eval/prompts/previous_attempt.txt similarity index 100% rename from scripts/lm_eval/prompts/previous_attempt.txt rename to examples/lm_eval/prompts/previous_attempt.txt diff --git a/scripts/lm_eval/prompts/top_program.txt b/examples/lm_eval/prompts/top_program.txt similarity index 100% rename from scripts/lm_eval/prompts/top_program.txt rename to examples/lm_eval/prompts/top_program.txt diff --git a/scripts/lm_eval/requirements.txt b/examples/lm_eval/requirements.txt similarity index 100% rename from scripts/lm_eval/requirements.txt rename to examples/lm_eval/requirements.txt From af8499a59161a89f7573458de68bbfa57e05b16f Mon Sep 17 00:00:00 2001 From: Julian Date: Tue, 3 Jun 2025 06:49:26 +0200 Subject: [PATCH 6/6] Removal of scripts/ folder --- scripts/README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 scripts/README.md diff --git a/scripts/README.md b/scripts/README.md deleted file mode 100644 index e69de29bb..000000000
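
Note on patch 2: the fallback is easiest to see in isolation. The sketch below is illustrative only -- the full body of `parse_full_rewrite()` is not part of the diff, so the regular expressions here are assumptions -- but it shows the behaviour the patch describes: prefer a fenced code block, and pass the raw LLM response through when no block is found.

```python
import re

FENCE = "`" * 3  # a Markdown code fence: three backticks


def parse_full_rewrite_sketch(llm_response: str, language: str = "python") -> str:
    """Illustrative sketch of the patched behaviour, not the real implementation."""
    # Prefer a block fenced with the requested language tag.
    matches = re.findall(FENCE + language + r"\n(.*?)" + FENCE, llm_response, re.DOTALL)
    if matches:
        return matches[0].strip()

    # Otherwise accept any fenced block.
    matches = re.findall(FENCE + r"\n(.*?)" + FENCE, llm_response, re.DOTALL)
    if matches:
        return matches[0].strip()

    # Patch 2: no code block found -> pass the response through unchanged
    # instead of returning None, so plain-text answers survive a full rewrite.
    return llm_response
```

This matters for the lm_eval example in particular: the evolved artifact is a plain-text answer rather than code, so most LLM responses contain no fenced block at all and would previously have been dropped.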
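
Note on the harness integration: for readers unfamiliar with the lm-evaluation-harness side of `lm-eval.py`, the self-contained sketch below shows the same integration pattern the adapter relies on -- register a custom `LM` subclass, implement `generate_until` for the generation-only path, and hand the instance to `evaluate()`. `EchoLM` and its canned answer are hypothetical stand-ins used purely to exercise the plumbing; the sketch assumes lm-eval >= 0.4 and will download the gsm8k dataset on first run.

```python
import lm_eval
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.evaluator import evaluate


@register_model("echo")
class EchoLM(LM):
    """Generation-only stub: returns a canned answer for every request."""

    def generate_until(self, requests, **kwargs):
        # Each request carries the rendered task prompt in req.args[0];
        # a real adapter (like OpenEvolve above) would generate from it.
        return ["The answer is 0." for _ in requests]

    def loglikelihood(self, requests, **kwargs):
        raise NotImplementedError("generation-only adapter")

    def loglikelihood_rolling(self, requests, **kwargs):
        raise NotImplementedError("generation-only adapter")


if __name__ == "__main__":
    task_dict = lm_eval.tasks.get_task_dict(["gsm8k"])
    results = evaluate(lm=EchoLM(), task_dict=task_dict, limit=2)
    print(results["results"])
```

The OpenEvolve adapter follows exactly this shape, except that its `generate()` spawns `openevolve-run.py` once per prompt and reads the evolved answer back from `best_program.txt`, which is why every benchmark example costs a full evolution run.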