diff --git a/README.md b/README.md
index 740909f5e5..c4feb65c18 100644
--- a/README.md
+++ b/README.md
@@ -202,7 +202,7 @@ OpenEvolve implements a sophisticated **evolutionary coding pipeline** that goes
 Advanced LLM Integration
-- **Universal API**: Works with OpenAI, Google, local models, and proxies
+- **Universal API**: Works with OpenAI, Google, Claude Code CLI, local models, and proxies
 - **Intelligent Ensembles**: Weighted combinations with sophisticated fallback
 - **Test-Time Compute**: Enhanced reasoning through proxy systems (see [OptiLLM setup](#llm-provider-setup))
 - **Plugin Ecosystem**: Support for advanced reasoning plugins
@@ -233,7 +233,7 @@ OpenEvolve implements a sophisticated **evolutionary coding pipeline** that goes
 ### Requirements
 - **Python**: 3.10+
-- **LLM Access**: Any OpenAI-compatible API
+- **LLM Access**: Any OpenAI-compatible API, or [Claude Code CLI](https://docs.anthropic.com/en/docs/claude-code)
 - **Optional**: Docker for containerized runs
 ### Installation Options
@@ -356,6 +356,35 @@ llm:
+<details>
+<summary>🔮 Claude Code CLI (No API Key)</summary>
+
+Use the [Claude Code CLI](https://docs.anthropic.com/en/docs/claude-code) as the LLM backend — no API keys needed, authentication uses the CLI's OAuth session.
+
+```bash
+# Install and authenticate
+npm install -g @anthropic-ai/claude-code
+claude login
+```
+
+```yaml
+# config.yaml
+llm:
+  provider: "claude_code"
+  models:
+    - name: "sonnet"
+      weight: 0.8
+      max_tokens: 16000
+      max_budget_usd: 1.0
+    - name: "haiku"
+      weight: 0.2
+      max_tokens: 8000
+```
+
+See the [Claude Code quickstart example](examples/claude_code_quickstart/) for a complete walkthrough.
+
+</details>
+
 ## Examples Gallery
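As a supplement to the README section above, here is a minimal programmatic sketch of the new backend used outside the evolution loop. It is illustrative only and not part of this diff: it assumes the `claude` CLI is installed and logged in, and it relies only on the `ClaudeCodeLLM` class and the `name`/`max_budget_usd` config fields introduced by the patch below.

```python
# Minimal sketch (illustrative, not part of the diff): call the Claude Code CLI
# backend directly. Assumes `claude login` has already been run; every field not
# set here falls back to the defaults defined in ClaudeCodeLLM.__init__.
import asyncio

from openevolve.config import LLMModelConfig
from openevolve.llm.claude_code import ClaudeCodeLLM

cfg = LLMModelConfig(name="sonnet", max_budget_usd=0.5)
llm = ClaudeCodeLLM(cfg)

# generate() is async and shells out to `claude -p ... --output-format text`.
print(asyncio.run(llm.generate("Reply with the single word OK")))
```

This is the same object that `_create_model` in `openevolve/llm/ensemble.py` returns for models whose `provider` is set to `"claude_code"`.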
diff --git a/examples/claude_code_quickstart/README.md b/examples/claude_code_quickstart/README.md new file mode 100644 index 0000000000..585f475648 --- /dev/null +++ b/examples/claude_code_quickstart/README.md @@ -0,0 +1,72 @@ +# Claude Code CLI Quickstart + +This example shows how to use the [Claude Code CLI](https://docs.anthropic.com/en/docs/claude-code) as the LLM backend for OpenEvolve. No API keys are needed — authentication uses the CLI's existing OAuth session. + +## Prerequisites + +1. **Install Claude Code CLI:** + ```bash + npm install -g @anthropic-ai/claude-code + ``` + +2. **Authenticate:** + ```bash + claude login + ``` + +3. **Install OpenEvolve:** + ```bash + pip install openevolve + ``` + +## Run + +```bash +python openevolve-run.py \ + examples/claude_code_quickstart/initial_program.py \ + examples/claude_code_quickstart/evaluator.py \ + --config examples/claude_code_quickstart/config.yaml \ + --iterations 50 +``` + +## How It Works + +The `config.yaml` sets `provider: "claude_code"` which routes all LLM calls through the `claude -p` subprocess instead of the OpenAI-compatible API. The CLI handles authentication, model selection, and billing. + +### Key Config Options + +| Field | Description | Default | +|-------|-------------|---------| +| `provider` | Set to `"claude_code"` to use the CLI backend | `"openai"` | +| `name` | Claude model name (`sonnet`, `haiku`, `opus`) | `"sonnet"` | +| `max_budget_usd` | Per-call spending cap in USD | `1.0` | +| `timeout` | CLI timeout in seconds | `300` | +| `retries` | Number of retry attempts on failure | `3` | +| `retry_delay` | Seconds between retries | `5` | + +### Ensemble Example + +You can mix Claude models in an ensemble, just like with OpenAI models: + +```yaml +llm: + provider: "claude_code" + models: + - name: "sonnet" + weight: 0.8 + max_tokens: 16000 + - name: "haiku" + weight: 0.2 + max_tokens: 8000 +``` + +### Programmatic Usage + +You can also inject the Claude Code backend at runtime without modifying config files: + +```python +from openevolve.llm.claude_code import init_claude_code_client + +for model_cfg in config.llm.models: + model_cfg.init_client = init_claude_code_client +``` diff --git a/examples/claude_code_quickstart/config.yaml b/examples/claude_code_quickstart/config.yaml new file mode 100644 index 0000000000..d1c554e561 --- /dev/null +++ b/examples/claude_code_quickstart/config.yaml @@ -0,0 +1,57 @@ +# Configuration for function minimization using Claude Code CLI as the LLM backend. +# No API keys needed — authentication uses `claude login` (OAuth session). +# +# Prerequisites: +# 1. Install Claude Code CLI: npm install -g @anthropic-ai/claude-code +# 2. Authenticate: claude login +# +# Run: +# python openevolve-run.py \ +# examples/claude_code_quickstart/initial_program.py \ +# examples/claude_code_quickstart/evaluator.py \ +# --config examples/claude_code_quickstart/config.yaml \ +# --iterations 50 + +max_iterations: 50 +checkpoint_interval: 10 + +llm: + provider: "claude_code" + models: + - name: "sonnet" + weight: 0.8 + max_tokens: 16000 + timeout: 300 + max_budget_usd: 1.0 + - name: "haiku" + weight: 0.2 + max_tokens: 8000 + timeout: 120 + max_budget_usd: 0.5 + retries: 3 + retry_delay: 5 + +prompt: + system_message: > + You are an expert programmer specializing in optimization algorithms. + Your task is to improve a function minimization algorithm to find the + global minimum of a complex function with many local minima. + The function is f(x, y) = sin(x) * cos(y) + sin(x*y) + (x^2 + y^2)/20. 
+ Focus on improving the search_algorithm function to reliably find the + global minimum, escaping local minima that might trap simple algorithms. + +database: + population_size: 50 + archive_size: 20 + num_islands: 3 + elite_selection_ratio: 0.2 + exploitation_ratio: 0.7 + similarity_threshold: 0.99 + +evaluator: + timeout: 60 + cascade_thresholds: [1.3] + parallel_evaluations: 3 + +diff_based_evolution: true +max_code_length: 20000 diff --git a/examples/claude_code_quickstart/evaluator.py b/examples/claude_code_quickstart/evaluator.py new file mode 100644 index 0000000000..f16318125b --- /dev/null +++ b/examples/claude_code_quickstart/evaluator.py @@ -0,0 +1,493 @@ +""" +Evaluator for the function minimization example +""" + +import importlib.util +import numpy as np +import time +import concurrent.futures +import traceback +import signal +from openevolve.evaluation_result import EvaluationResult + + +def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5): + """ + Run a function with a timeout using concurrent.futures + + Args: + func: Function to run + args: Arguments to pass to the function + kwargs: Keyword arguments to pass to the function + timeout_seconds: Timeout in seconds + + Returns: + Result of the function or raises TimeoutError + """ + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(func, *args, **kwargs) + try: + result = future.result(timeout=timeout_seconds) + return result + except concurrent.futures.TimeoutError: + raise TimeoutError(f"Function timed out after {timeout_seconds} seconds") + + +def safe_float(value): + """Convert a value to float safely""" + try: + return float(value) + except (TypeError, ValueError): + print(f"Warning: Could not convert {value} of type {type(value)} to float") + return 0.0 + + +def evaluate(program_path): + """ + Evaluate the program by running it multiple times and checking how close + it gets to the known global minimum. 
+ + Args: + program_path: Path to the program file + + Returns: + Dictionary of metrics + """ + # Known global minimum (approximate) + GLOBAL_MIN_X = -1.704 + GLOBAL_MIN_Y = 0.678 + GLOBAL_MIN_VALUE = -1.519 + + try: + # Load the program + spec = importlib.util.spec_from_file_location("program", program_path) + program = importlib.util.module_from_spec(spec) + spec.loader.exec_module(program) + + # Check if the required function exists + if not hasattr(program, "run_search"): + print(f"Error: program does not have 'run_search' function") + + error_artifacts = { + "error_type": "MissingFunction", + "error_message": "Program is missing required 'run_search' function", + "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)" + } + + return EvaluationResult( + metrics={ + "value_score": 0.0, + "distance_score": 0.0, + "reliability_score": 0.0, + "combined_score": 0.0, + "error": "Missing run_search function", + }, + artifacts=error_artifacts + ) + + # Run multiple trials + num_trials = 10 + x_values = [] + y_values = [] + values = [] + distances = [] + times = [] + success_count = 0 + + for trial in range(num_trials): + try: + start_time = time.time() + + # Run with timeout + result = run_with_timeout(program.run_search, timeout_seconds=5) + + # Handle different result formats + if isinstance(result, tuple): + if len(result) == 3: + x, y, value = result + elif len(result) == 2: + # Assume it's (x, y) and calculate value + x, y = result + # Calculate the function value since it wasn't returned + value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20 + print(f"Trial {trial}: Got 2 values, calculated function value: {value}") + else: + print( + f"Trial {trial}: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}" + ) + continue + else: + print( + f"Trial {trial}: Invalid result format, expected tuple but got {type(result)}" + ) + continue + + end_time = time.time() + + # Ensure all values are float + x = safe_float(x) + y = safe_float(y) + value = safe_float(value) + + # Check if the result is valid (not NaN or infinite) + if ( + np.isnan(x) + or np.isnan(y) + or np.isnan(value) + or np.isinf(x) + or np.isinf(y) + or np.isinf(value) + ): + print(f"Trial {trial}: Invalid result, got x={x}, y={y}, value={value}") + continue + + # Calculate metrics + x_diff = x - GLOBAL_MIN_X + y_diff = y - GLOBAL_MIN_Y + distance_to_global = np.sqrt(x_diff**2 + y_diff**2) + + x_values.append(x) + y_values.append(y) + values.append(value) + distances.append(distance_to_global) + times.append(end_time - start_time) + success_count += 1 + + except TimeoutError as e: + print(f"Trial {trial}: {str(e)}") + continue + except IndexError as e: + # Specifically handle IndexError which often happens with early termination checks + print(f"Trial {trial}: IndexError - {str(e)}") + print( + "This is likely due to a list index check before the list is fully populated." 
+ ) + continue + except Exception as e: + print(f"Trial {trial}: Error - {str(e)}") + print(traceback.format_exc()) + continue + + # If all trials failed, return zero scores + if success_count == 0: + error_artifacts = { + "error_type": "AllTrialsFailed", + "error_message": f"All {num_trials} trials failed - common issues: timeouts, crashes, or invalid return values", + "suggestion": "Check for infinite loops, ensure function returns (x, y) or (x, y, value), and verify algorithm terminates within time limit" + } + + return EvaluationResult( + metrics={ + "value_score": 0.0, + "distance_score": 0.0, + "reliability_score": 0.0, + "combined_score": 0.0, + "error": "All trials failed", + }, + artifacts=error_artifacts + ) + + # Calculate metrics + avg_value = float(np.mean(values)) + avg_distance = float(np.mean(distances)) + avg_time = float(np.mean(times)) if times else 1.0 + + # Convert to scores (higher is better) + value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE))) + distance_score = float(1.0 / (1.0 + avg_distance)) + + # Add reliability score based on success rate + reliability_score = float(success_count / num_trials) + + # Calculate solution quality based on distance to global minimum + if avg_distance < 0.5: # Very close to the correct solution + solution_quality_multiplier = 1.5 # 50% bonus + elif avg_distance < 1.5: # In the right region + solution_quality_multiplier = 1.2 # 20% bonus + elif avg_distance < 3.0: # Getting closer + solution_quality_multiplier = 1.0 # No adjustment + else: # Not finding the right region + solution_quality_multiplier = 0.7 # 30% penalty + + # Calculate combined score that prioritizes finding the global minimum + # Base score from value and distance, then apply solution quality multiplier + base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score + combined_score = float(base_score * solution_quality_multiplier) + + # Add artifacts for successful runs + artifacts = { + "convergence_info": f"Converged in {num_trials} trials with {success_count} successes", + "best_position": f"Final position: x={x_values[-1]:.4f}, y={y_values[-1]:.4f}" if x_values else "No successful trials", + "average_distance_to_global": f"{avg_distance:.4f}", + "search_efficiency": f"Success rate: {reliability_score:.2%}" + } + + return EvaluationResult( + metrics={ + "value_score": value_score, + "distance_score": distance_score, + "reliability_score": reliability_score, + "combined_score": combined_score, + }, + artifacts=artifacts + ) + except Exception as e: + print(f"Evaluation failed completely: {str(e)}") + print(traceback.format_exc()) + + # Create error artifacts + error_artifacts = { + "error_type": type(e).__name__, + "error_message": str(e), + "full_traceback": traceback.format_exc(), + "suggestion": "Check for syntax errors or missing imports in the generated code" + } + + return EvaluationResult( + metrics={ + "value_score": 0.0, + "distance_score": 0.0, + "reliability_score": 0.0, + "combined_score": 0.0, + "error": str(e), + }, + artifacts=error_artifacts + ) + + +# Stage-based evaluation for cascade evaluation +def evaluate_stage1(program_path): + """First stage evaluation with fewer trials""" + # Known global minimum (approximate) + GLOBAL_MIN_X = float(-1.704) + GLOBAL_MIN_Y = float(0.678) + GLOBAL_MIN_VALUE = float(-1.519) + + # Quick check to see if the program runs without errors + try: + # Load the program + spec = importlib.util.spec_from_file_location("program", program_path) + program = 
importlib.util.module_from_spec(spec) + spec.loader.exec_module(program) + + # Check if the required function exists + if not hasattr(program, "run_search"): + print(f"Stage 1 validation: Program does not have 'run_search' function") + + error_artifacts = { + "error_type": "MissingFunction", + "error_message": "Stage 1: Program is missing required 'run_search' function", + "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)" + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "Missing run_search function" + }, + artifacts=error_artifacts + ) + + try: + # Run a single trial with timeout + result = run_with_timeout(program.run_search, timeout_seconds=5) + + # Handle different result formats + if isinstance(result, tuple): + if len(result) == 3: + x, y, value = result + elif len(result) == 2: + # Assume it's (x, y) and calculate value + x, y = result + # Calculate the function value since it wasn't returned + value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20 + print(f"Stage 1: Got 2 values, calculated function value: {value}") + else: + print( + f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}" + ) + + error_artifacts = { + "error_type": "InvalidReturnFormat", + "error_message": f"Stage 1: Function returned tuple with {len(result)} values, expected 2 or 3", + "suggestion": "run_search() must return (x, y) or (x, y, value) - check your return statement" + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "Invalid result format" + }, + artifacts=error_artifacts + ) + else: + print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}") + + error_artifacts = { + "error_type": "InvalidReturnType", + "error_message": f"Stage 1: Function returned {type(result)}, expected tuple", + "suggestion": "run_search() must return a tuple like (x, y) or (x, y, value), not a single value or other type" + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "Invalid result format" + }, + artifacts=error_artifacts + ) + + # Ensure all values are float + x = safe_float(x) + y = safe_float(y) + value = safe_float(value) + + # Check if the result is valid + if ( + np.isnan(x) + or np.isnan(y) + or np.isnan(value) + or np.isinf(x) + or np.isinf(y) + or np.isinf(value) + ): + print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}") + + error_artifacts = { + "error_type": "InvalidResultValues", + "error_message": f"Stage 1: Got invalid values - x={x}, y={y}, value={value}", + "suggestion": "Function returned NaN or infinite values. 
Check for division by zero, invalid math operations, or uninitialized variables" + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.5, + "combined_score": 0.0, + "error": "Invalid result values" + }, + artifacts=error_artifacts + ) + + # Calculate distance safely + x_diff = float(x) - GLOBAL_MIN_X + y_diff = float(y) - GLOBAL_MIN_Y + distance = float(np.sqrt(x_diff**2 + y_diff**2)) + + # Calculate value-based score + value_score = float(1.0 / (1.0 + abs(value - GLOBAL_MIN_VALUE))) + distance_score = float(1.0 / (1.0 + distance)) + + # Calculate solution quality based on distance to global minimum + if distance < 0.5: # Very close to the correct solution + solution_quality_multiplier = 1.4 # 40% bonus + elif distance < 1.5: # In the right region + solution_quality_multiplier = 1.15 # 15% bonus + elif distance < 3.0: # Getting closer + solution_quality_multiplier = 1.0 # No adjustment + else: # Not finding the right region + solution_quality_multiplier = 0.8 # 20% penalty + + # Calculate combined score for stage 1 + base_score = 0.6 * value_score + 0.4 * distance_score + combined_score = float(base_score * solution_quality_multiplier) + + # Add artifacts for successful stage 1 + stage1_artifacts = { + "stage1_result": f"Found solution at x={x:.4f}, y={y:.4f} with value={value:.4f}", + "distance_to_global": f"{distance:.4f}", + "solution_quality": f"Distance < 0.5: Very close" if distance < 0.5 else f"Distance < 1.5: Good region" if distance < 1.5 else "Could be improved" + } + + return EvaluationResult( + metrics={ + "runs_successfully": 1.0, + "value_score": value_score, + "distance_score": distance_score, + "combined_score": combined_score, + }, + artifacts=stage1_artifacts + ) + except TimeoutError as e: + print(f"Stage 1 evaluation timed out: {e}") + + error_artifacts = { + "error_type": "TimeoutError", + "error_message": "Stage 1: Function execution exceeded 5 second timeout", + "suggestion": "Function is likely stuck in infinite loop or doing too much computation. Try reducing iterations or adding early termination conditions" + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "Timeout" + }, + artifacts=error_artifacts + ) + except IndexError as e: + # Specifically handle IndexError which often happens with early termination checks + print(f"Stage 1 evaluation failed with IndexError: {e}") + print("This is likely due to a list index check before the list is fully populated.") + + error_artifacts = { + "error_type": "IndexError", + "error_message": f"Stage 1: {str(e)}", + "suggestion": "List index out of range - likely accessing empty list or wrong index. Check list initialization and bounds" + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": f"IndexError: {str(e)}" + }, + artifacts=error_artifacts + ) + except Exception as e: + print(f"Stage 1 evaluation failed: {e}") + print(traceback.format_exc()) + + error_artifacts = { + "error_type": type(e).__name__, + "error_message": f"Stage 1: {str(e)}", + "full_traceback": traceback.format_exc(), + "suggestion": "Unexpected error occurred. 
Check the traceback for specific issue" + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": str(e) + }, + artifacts=error_artifacts + ) + + except Exception as e: + print(f"Stage 1 evaluation failed: {e}") + print(traceback.format_exc()) + + error_artifacts = { + "error_type": type(e).__name__, + "error_message": f"Stage 1 outer exception: {str(e)}", + "full_traceback": traceback.format_exc(), + "suggestion": "Critical error during stage 1 evaluation. Check program syntax and imports" + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": str(e) + }, + artifacts=error_artifacts + ) + + +def evaluate_stage2(program_path): + """Second stage evaluation with more thorough testing""" + # Full evaluation as in the main evaluate function + return evaluate(program_path) diff --git a/examples/claude_code_quickstart/initial_program.py b/examples/claude_code_quickstart/initial_program.py new file mode 100644 index 0000000000..670c02cc45 --- /dev/null +++ b/examples/claude_code_quickstart/initial_program.py @@ -0,0 +1,51 @@ +# EVOLVE-BLOCK-START +"""Function minimization example for OpenEvolve""" +import numpy as np + + +def search_algorithm(iterations=1000, bounds=(-5, 5)): + """ + A simple random search algorithm that often gets stuck in local minima. + + Args: + iterations: Number of iterations to run + bounds: Bounds for the search space (min, max) + + Returns: + Tuple of (best_x, best_y, best_value) + """ + # Initialize with a random point + best_x = np.random.uniform(bounds[0], bounds[1]) + best_y = np.random.uniform(bounds[0], bounds[1]) + best_value = evaluate_function(best_x, best_y) + + for _ in range(iterations): + # Simple random search + x = np.random.uniform(bounds[0], bounds[1]) + y = np.random.uniform(bounds[0], bounds[1]) + value = evaluate_function(x, y) + + if value < best_value: + best_value = value + best_x, best_y = x, y + + return best_x, best_y, best_value + + +# EVOLVE-BLOCK-END + + +# This part remains fixed (not evolved) +def evaluate_function(x, y): + """The complex function we're trying to minimize""" + return np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20 + + +def run_search(): + x, y, value = search_algorithm() + return x, y, value + + +if __name__ == "__main__": + x, y, value = run_search() + print(f"Found minimum at ({x}, {y}) with value {value}") diff --git a/openevolve/config.py b/openevolve/config.py index bef193da21..fd03ed7a98 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -56,6 +56,9 @@ class LLMModelConfig: api_key: Optional[str] = None name: str = None + # LLM provider: "openai" (default), "claude_code" (Claude Code CLI) + provider: Optional[str] = None + # Custom LLM client init_client: Optional[Callable] = None @@ -79,6 +82,9 @@ class LLMModelConfig: # Reasoning parameters reasoning_effort: Optional[str] = None + # Claude Code CLI budget per call (USD) + max_budget_usd: Optional[float] = None + # Manual mode (human-in-the-loop) manual_mode: Optional[bool] = None _manual_queue_dir: Optional[str] = None diff --git a/openevolve/llm/__init__.py b/openevolve/llm/__init__.py index 26bbef5676..856843d4b0 100644 --- a/openevolve/llm/__init__.py +++ b/openevolve/llm/__init__.py @@ -5,5 +5,12 @@ from openevolve.llm.base import LLMInterface from openevolve.llm.ensemble import LLMEnsemble from openevolve.llm.openai import OpenAILLM +from openevolve.llm.claude_code import ClaudeCodeLLM, init_claude_code_client -__all__ = 
["LLMInterface", "OpenAILLM", "LLMEnsemble"] +__all__ = [ + "LLMInterface", + "OpenAILLM", + "ClaudeCodeLLM", + "init_claude_code_client", + "LLMEnsemble", +] diff --git a/openevolve/llm/claude_code.py b/openevolve/llm/claude_code.py new file mode 100644 index 0000000000..ddbe2e3b7f --- /dev/null +++ b/openevolve/llm/claude_code.py @@ -0,0 +1,138 @@ +""" +Claude Code CLI interface for LLMs. + +Uses the Claude Code CLI (`claude -p`) as a non-interactive LLM backend, +enabling OpenEvolve to run with Anthropic's Claude models without requiring +direct API keys — authentication is handled by the CLI's OAuth session. + +Usage in config.yaml: + llm: + provider: "claude_code" + models: + - name: "sonnet" + weight: 1.0 + max_tokens: 16000 + timeout: 300 + +Or inject programmatically: + from openevolve.llm.claude_code import init_claude_code_client + for model_cfg in config.llm.models: + model_cfg.init_client = init_claude_code_client +""" + +import asyncio +import logging +import subprocess +from typing import Dict, List, Optional + +from openevolve.llm.base import LLMInterface + +logger = logging.getLogger(__name__) + + +class ClaudeCodeLLM(LLMInterface): + """LLM interface that uses the Claude Code CLI for generation. + + Requires `claude` CLI to be installed and authenticated + (run `claude login` first). + """ + + def __init__(self, model_cfg=None): + self.model = getattr(model_cfg, "name", "sonnet") + self.system_message = getattr(model_cfg, "system_message", None) + self.max_tokens = getattr(model_cfg, "max_tokens", 16000) + self.timeout = getattr(model_cfg, "timeout", 300) + self.weight = getattr(model_cfg, "weight", 1.0) + self.retries = getattr(model_cfg, "retries", 3) + self.retry_delay = getattr(model_cfg, "retry_delay", 5) + self.max_budget_usd = getattr(model_cfg, "max_budget_usd", 1.0) + self.cwd = getattr(model_cfg, "cwd", None) + logger.info(f"Initialized ClaudeCodeLLM with model: {self.model}") + + async def generate(self, prompt: str, **kwargs) -> str: + sys_msg = kwargs.pop("system_message", self.system_message) or "" + return await self.generate_with_context( + system_message=sys_msg, + messages=[{"role": "user", "content": prompt}], + **kwargs, + ) + + async def generate_with_context( + self, system_message: str, messages: List[Dict[str, str]], **kwargs + ) -> str: + user_content = "\n\n".join( + m.get("content", "") for m in messages if m.get("role") == "user" + ) + + cmd = [ + "claude", + "-p", + "--model", + self.model, + "--no-session-persistence", + "--output-format", + "text", + ] + if system_message: + cmd.extend(["--system-prompt", system_message]) + + budget = kwargs.get("max_budget_usd", self.max_budget_usd) + cmd.extend(["--max-budget-usd", str(budget)]) + + cmd.append(user_content) + + timeout = kwargs.get("timeout", self.timeout) + retries = kwargs.get("retries", self.retries) + retry_delay = kwargs.get("retry_delay", self.retry_delay) + + loop = asyncio.get_event_loop() + for attempt in range(retries + 1): + try: + result = await asyncio.wait_for( + loop.run_in_executor(None, lambda: self._run_cli(cmd, timeout)), + timeout=timeout + 30, + ) + return result + except asyncio.TimeoutError: + if attempt < retries: + logger.warning( + f"Claude Code CLI timeout on attempt {attempt + 1}/{retries + 1}. Retrying..." + ) + await asyncio.sleep(retry_delay) + else: + logger.error(f"All {retries + 1} attempts failed with timeout") + raise + except Exception as e: + if attempt < retries: + logger.warning( + f"Claude Code CLI error on attempt {attempt + 1}/{retries + 1}: {e}. 
Retrying..." + ) + await asyncio.sleep(retry_delay) + else: + logger.error(f"All {retries + 1} attempts failed with error: {e}") + raise + + def _run_cli(self, cmd: list, timeout: int) -> str: + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + cwd=self.cwd, + ) + if result.returncode != 0: + stderr = result.stderr.strip() + if stderr: + logger.warning(f"Claude CLI stderr: {stderr[:500]}") + output = result.stdout.strip() + if not output: + raise RuntimeError(f"Empty response from Claude CLI. stderr: {result.stderr[:500]}") + return output + except subprocess.TimeoutExpired: + raise asyncio.TimeoutError("Claude CLI subprocess timed out") + + +def init_claude_code_client(model_cfg): + """Factory function compatible with OpenEvolve's init_client config hook.""" + return ClaudeCodeLLM(model_cfg) diff --git a/openevolve/llm/ensemble.py b/openevolve/llm/ensemble.py index e3c4716735..b9161382a1 100644 --- a/openevolve/llm/ensemble.py +++ b/openevolve/llm/ensemble.py @@ -13,6 +13,25 @@ logger = logging.getLogger(__name__) +_PROVIDER_REGISTRY = { + "openai": lambda cfg: OpenAILLM(cfg), +} + +try: + from openevolve.llm.claude_code import ClaudeCodeLLM + _PROVIDER_REGISTRY["claude_code"] = lambda cfg: ClaudeCodeLLM(cfg) +except ImportError: + pass + + +def _create_model(model_cfg: LLMModelConfig) -> LLMInterface: + if model_cfg.init_client: + return model_cfg.init_client(model_cfg) + provider = getattr(model_cfg, "provider", None) + if provider and provider in _PROVIDER_REGISTRY: + return _PROVIDER_REGISTRY[provider](model_cfg) + return OpenAILLM(model_cfg) + class LLMEnsemble: """Ensemble of LLMs""" @@ -21,10 +40,7 @@ def __init__(self, models_cfg: List[LLMModelConfig]): self.models_cfg = models_cfg # Initialize models from the configuration - self.models = [ - model_cfg.init_client(model_cfg) if model_cfg.init_client else OpenAILLM(model_cfg) - for model_cfg in models_cfg - ] + self.models = [_create_model(model_cfg) for model_cfg in models_cfg] # Extract and normalize model weights self.weights = [model.weight for model in models_cfg] diff --git a/tests/test_claude_code_llm.py b/tests/test_claude_code_llm.py new file mode 100644 index 0000000000..b33a9d62e8 --- /dev/null +++ b/tests/test_claude_code_llm.py @@ -0,0 +1,160 @@ +"""Tests for the Claude Code CLI LLM backend.""" + +import asyncio +import unittest +from unittest.mock import MagicMock, patch + +from openevolve.llm.claude_code import ClaudeCodeLLM, init_claude_code_client + + +class TestClaudeCodeLLM(unittest.TestCase): + def _make_cfg(self, **overrides): + cfg = MagicMock() + cfg.name = overrides.get("name", "sonnet") + cfg.system_message = overrides.get("system_message", None) + cfg.max_tokens = overrides.get("max_tokens", 16000) + cfg.timeout = overrides.get("timeout", 60) + cfg.weight = overrides.get("weight", 1.0) + cfg.retries = overrides.get("retries", 3) + cfg.retry_delay = overrides.get("retry_delay", 5) + cfg.max_budget_usd = overrides.get("max_budget_usd", 1.0) + cfg.cwd = overrides.get("cwd", None) + return cfg + + def test_init_defaults(self): + llm = ClaudeCodeLLM(self._make_cfg()) + self.assertEqual(llm.model, "sonnet") + self.assertEqual(llm.max_tokens, 16000) + self.assertEqual(llm.timeout, 60) + self.assertEqual(llm.weight, 1.0) + + def test_init_with_custom_model(self): + llm = ClaudeCodeLLM(self._make_cfg(name="opus")) + self.assertEqual(llm.model, "opus") + + def test_factory_function(self): + cfg = self._make_cfg() + llm = init_claude_code_client(cfg) + 
self.assertIsInstance(llm, ClaudeCodeLLM) + self.assertEqual(llm.model, "sonnet") + + @patch("openevolve.llm.claude_code.subprocess.run") + def test_generate_calls_cli(self, mock_run): + mock_run.return_value = MagicMock(returncode=0, stdout="Generated response text", stderr="") + llm = ClaudeCodeLLM(self._make_cfg(timeout=10)) + result = asyncio.run(llm.generate("test prompt")) + self.assertEqual(result, "Generated response text") + mock_run.assert_called_once() + cmd = mock_run.call_args[0][0] + self.assertEqual(cmd[0], "claude") + self.assertIn("-p", cmd) + self.assertIn("--model", cmd) + self.assertIn("sonnet", cmd) + self.assertIn("test prompt", cmd) + + @patch("openevolve.llm.claude_code.subprocess.run") + def test_system_message_passed(self, mock_run): + mock_run.return_value = MagicMock(returncode=0, stdout="response", stderr="") + llm = ClaudeCodeLLM(self._make_cfg(timeout=10)) + asyncio.run(llm.generate("prompt", system_message="You are an expert.")) + cmd = mock_run.call_args[0][0] + idx = cmd.index("--system-prompt") + self.assertEqual(cmd[idx + 1], "You are an expert.") + + @patch("openevolve.llm.claude_code.subprocess.run") + def test_empty_response_raises(self, mock_run): + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="error msg") + llm = ClaudeCodeLLM(self._make_cfg(timeout=10, retries=0)) + with self.assertRaises(RuntimeError): + asyncio.run(llm.generate("test prompt")) + + @patch("openevolve.llm.claude_code.subprocess.run") + def test_retry_on_failure(self, mock_run): + mock_run.side_effect = [ + MagicMock(returncode=1, stdout="", stderr="transient error"), + MagicMock(returncode=0, stdout="success after retry", stderr=""), + ] + llm = ClaudeCodeLLM(self._make_cfg(timeout=10, retries=1, retry_delay=0)) + result = asyncio.run(llm.generate("test prompt", retry_delay=0)) + self.assertEqual(result, "success after retry") + self.assertEqual(mock_run.call_count, 2) + + @patch("openevolve.llm.claude_code.subprocess.run") + def test_retries_exhausted_raises(self, mock_run): + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="persistent error") + llm = ClaudeCodeLLM(self._make_cfg(timeout=10, retries=2, retry_delay=0)) + with self.assertRaises(RuntimeError): + asyncio.run(llm.generate("test prompt", retry_delay=0)) + self.assertEqual(mock_run.call_count, 3) + + @patch("openevolve.llm.claude_code.subprocess.run") + def test_generate_with_context(self, mock_run): + mock_run.return_value = MagicMock(returncode=0, stdout="ctx response", stderr="") + llm = ClaudeCodeLLM(self._make_cfg(timeout=10)) + result = asyncio.run( + llm.generate_with_context( + system_message="sys", + messages=[ + {"role": "user", "content": "first"}, + {"role": "assistant", "content": "ignored"}, + {"role": "user", "content": "second"}, + ], + ) + ) + self.assertEqual(result, "ctx response") + cmd = mock_run.call_args[0][0] + self.assertIn("first\n\nsecond", cmd[-1]) + + +class TestMaxBudgetConfig(unittest.TestCase): + def test_max_budget_usd_in_model_config(self): + from openevolve.config import LLMModelConfig + + cfg = LLMModelConfig(max_budget_usd=2.5) + self.assertEqual(cfg.max_budget_usd, 2.5) + + def test_max_budget_usd_default_none(self): + from openevolve.config import LLMModelConfig + + cfg = LLMModelConfig() + self.assertIsNone(cfg.max_budget_usd) + + def test_max_budget_usd_from_dict(self): + from openevolve.config import Config + + config = Config.from_dict( + { + "llm": { + "provider": "claude_code", + "models": [{"name": "sonnet", "max_budget_usd": 3.0, 
"weight": 1.0}], + } + } + ) + self.assertEqual(config.llm.models[0].max_budget_usd, 3.0) + + +class TestProviderRegistry(unittest.TestCase): + def test_claude_code_in_registry(self): + from openevolve.llm.ensemble import _PROVIDER_REGISTRY + + self.assertIn("claude_code", _PROVIDER_REGISTRY) + + def test_ensemble_creates_claude_code(self): + from openevolve.llm.ensemble import _create_model + + cfg = MagicMock() + cfg.init_client = None + cfg.provider = "claude_code" + cfg.name = "sonnet" + cfg.system_message = None + cfg.max_tokens = 4096 + cfg.timeout = 60 + cfg.weight = 1.0 + cfg.max_budget_usd = 1.0 + cfg.cwd = None + model = _create_model(cfg) + self.assertIsInstance(model, ClaudeCodeLLM) + + +if __name__ == "__main__": + unittest.main()