From a01f07eb4ad6d404f978640fcb899fae4b3a78c9 Mon Sep 17 00:00:00 2001
From: echobt <mathis.massimino+echo@cortex.foundation>
Date: Wed, 18 Feb 2026 13:33:10 +0000
Subject: [PATCH 1/8] fix(llm+swe): add tool_calls support to LiteLlmClient and
 fix pipeline bugs

LiteLlmClient tool_calls support (src/llm/litellm.rs):
- Add tools and tool_choice fields (as serde_json::Value) to ApiRequest with
  skip_serializing_if to avoid sending null to providers
- Add ApiToolCall and ApiToolCallFunction deserialization structs for parsing
  function call responses from the API
- Add reasoning and reasoning_content fields to ApiMessage for reasoning models
- Make finish_reason optional in ApiChoice (some providers omit it)
- Implement full tool_calls parsing in generate() response conversion, mapping
  API tool calls to ToolCallInfo/ToolCallFunction types matching OpenRouterProvider
- Add content extraction priority: content > reasoning_content > reasoning,
  with fallback to tool call arguments when content is empty
- Update test to verify tools/tool_choice are properly excluded when None

Patch extractor fix (src/swe/extractor.rs):
- Replace git show with git diff for extracting PR patches, which produces
  cleaner unified diffs without commit metadata that confused downstream parsing
- Handle all three cases: base..merge, HEAD..merge, and HEAD~1..HEAD

Test generator clippy fix (src/swe/test_generator.rs):
- Remove unnecessary reference on result.stdout.trim() call

Workspace validator runtime install (src/swe/workspace_validator.rs):
- Add language runtime installation step before running validation tests
  in Docker containers (Go, Node.js, Rust, Java) since the base image
  may not include them, causing false setup_error failures

Formatting (docker_sandbox.rs, harness.rs, test_generator.rs, extractor.rs):
- Apply cargo fmt to fix formatting across all modified SWE pipeline files

Gitignore (.gitignore):
- Add mine_test.log and test-easy-output/ to prevent committing test artifacts
---
 .gitignore                     |   2 +
 src/llm/litellm.rs             | 104 +++++++++++++++++++++++++++++----
 src/swe/docker_sandbox.rs      |  80 +++++++++++++++----------
 src/swe/extractor.rs           |  15 ++---
 src/swe/harness.rs             |   5 +-
 src/swe/test_generator.rs      |  50 ++++++++++++----
 src/swe/workspace_validator.rs |  20 +++++++
 7 files changed, 208 insertions(+), 68 deletions(-)
diff --git a/.gitignore b/.gitignore
index 3d29c38..5f04bf9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,3 +42,5 @@ test-verify/
 *.db
 *.db-shm
 *.db-wal
+mine_test.log
+test-easy-output/
diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs
index 4e82dc6..e0be9b4 100644
--- a/src/llm/litellm.rs
+++ b/src/llm/litellm.rs
@@ -490,6 +490,10 @@ struct ApiRequest {
     top_p: Option<f64>,
     #[serde(skip_serializing_if = "Option::is_none")]
     response_format: Option<ResponseFormat>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    tools: Option<serde_json::Value>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    tool_choice: Option<serde_json::Value>,
 }
 
 /// Internal response structure from the OpenAI-compatible API.
@@ -506,14 +510,44 @@ struct ApiResponse {
 struct ApiChoice {
     index: u32,
     message: ApiMessage,
-    finish_reason: String,
+    #[serde(default)]
+    finish_reason: Option<String>,
 }
 
 /// Internal message structure from the API response.
+/// Supports reasoning models and tool calls.
 #[derive(Debug, Deserialize)]
 struct ApiMessage {
     role: String,
+    #[serde(default)]
     content: String,
+    #[serde(default)]
+    reasoning: Option<String>,
+    #[serde(default)]
+    reasoning_content: Option<String>,
+    #[serde(default)]
+    tool_calls: Option<Vec<ApiToolCall>>,
+}
+
+/// A tool call returned by the model.
+#[derive(Debug, Deserialize)]
+#[allow(dead_code)]
+struct ApiToolCall {
+    #[serde(default)]
+    id: String,
+    #[serde(rename = "type", default)]
+    _tool_type: String,
+    function: ApiToolCallFunction,
+}
+
+/// Function details within a tool call response.
+#[derive(Debug, Deserialize)]
+#[allow(dead_code)]
+struct ApiToolCallFunction {
+    #[serde(default)]
+    name: String,
+    #[serde(default)]
+    arguments: String,
 }
 
 /// Internal usage structure from the API response.
@@ -549,6 +583,13 @@ impl LlmProvider for LiteLlmClient {
             request.model.clone()
         };
 
+        let tools = request
+            .tools
+            .map(|t| serde_json::to_value(&t).unwrap_or(serde_json::Value::Null));
+        let tool_choice = request
+            .tool_choice
+            .map(|tc| serde_json::to_value(&tc).unwrap_or(serde_json::Value::Null));
+
         let api_request = ApiRequest {
             model: model.clone(),
             messages: request.messages,
@@ -556,6 +597,8 @@ impl LlmProvider for LiteLlmClient {
             max_tokens: request.max_tokens,
             top_p: request.top_p,
             response_format: request.response_format,
+            tools,
+            tool_choice,
         };
 
         let url = format!("{}/chat/completions", self.api_base);
@@ -617,15 +660,52 @@ impl LlmProvider for LiteLlmClient {
         let choices = api_response
             .choices
             .into_iter()
-            .map(|choice| Choice {
-                index: choice.index,
-                message: Message {
-                    role: choice.message.role,
-                    content: choice.message.content,
-                    tool_calls: None,
-                    tool_call_id: None,
-                },
-                finish_reason: choice.finish_reason,
+            .map(|choice| {
+                let tool_calls_info = choice.message.tool_calls.as_ref().map(|tcs| {
+                    tcs.iter()
+                        .map(|tc| ToolCallInfo {
+                            id: tc.id.clone(),
+                            call_type: "function".to_string(),
+                            function: ToolCallFunction {
+                                name: tc.function.name.clone(),
+                                arguments: tc.function.arguments.clone(),
+                            },
+                        })
+                        .collect::<Vec<_>>()
+                });
+
+                let content = if let Some(ref tool_calls) = choice.message.tool_calls {
+                    if let Some(first_call) = tool_calls.first() {
+                        if !first_call.function.arguments.is_empty() {
+                            first_call.function.arguments.clone()
+                        } else {
+                            choice.message.content.clone()
+                        }
+                    } else {
+                        choice.message.content.clone()
+                    }
+                } else if !choice.message.content.trim().is_empty() {
+                    choice.message.content
+                } else if let Some(rc) = choice.message.reasoning_content {
+                    if !rc.trim().is_empty() {
+                        rc
+                    } else {
+                        choice.message.reasoning.unwrap_or_default()
+                    }
+                } else {
+                    choice.message.reasoning.unwrap_or_default()
+                };
+
+                Choice {
+                    index: choice.index,
+                    message: Message {
+                        role: choice.message.role,
+                        content,
+                        tool_calls: tool_calls_info,
+                        tool_call_id: None,
+                    },
+                    finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".to_string()),
+                }
             })
             .collect();
 
@@ -903,6 +983,8 @@ mod tests {
             max_tokens: Some(1000),
             top_p: None,
             response_format: None,
+            tools: None,
+            tool_choice: None,
         };
 
         let json = serde_json::to_string(&request).expect("serialization should succeed");
@@ -910,6 +992,8 @@ mod tests {
         assert!(json.contains("\"temperature\":0.7"));
         assert!(json.contains("\"max_tokens\":1000"));
         assert!(!json.contains("top_p")); // Should be skipped because None
+        assert!(!json.contains("tools")); // Should be skipped because None
+        assert!(!json.contains("tool_choice")); // Should be skipped because None
     }
 
     #[tokio::test]
diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs
index 6e9899a..13b9104 100644
--- a/src/swe/docker_sandbox.rs
+++ b/src/swe/docker_sandbox.rs
@@ -81,7 +81,10 @@ impl DockerSandbox {
             );
         }
 
-        let sandbox = Self { container_name, tool_port };
+        let sandbox = Self {
+            container_name,
+            tool_port,
+        };
 
         // Install git (only hard dependency; agent installs everything else)
         let install = sandbox
@@ -99,10 +102,7 @@ impl DockerSandbox {
         }
 
         // Clone the repository (full clone for reliable checkout)
-        let clone_cmd = format!(
-            "git clone https://github.com/{}.git /repo 2>&1",
-            repo
-        );
+        let clone_cmd = format!("git clone https://github.com/{}.git /repo 2>&1", repo);
         let clone = sandbox.exec(&clone_cmd, 600_000).await;
         if clone.exit_code != 0 {
             sandbox.destroy().await;
@@ -154,7 +154,10 @@ impl DockerSandbox {
         }
 
         // Use write_file to inject the server script
-        if let Err(e) = self.write_file_abs("/tools/server.py", TOOL_SERVER_PY).await {
+        if let Err(e) = self
+            .write_file_abs("/tools/server.py", TOOL_SERVER_PY)
+            .await
+        {
             tracing::warn!(container = %self.container_name, error = %e, "Failed to write tool server");
             return;
         }
@@ -201,8 +204,13 @@ impl DockerSandbox {
             std::time::Duration::from_millis(2_000),
             Command::new("docker")
                 .args([
-                    "exec", "-w", "/repo", &self.container_name,
-                    "python3", "-c", &check_cmd,
+                    "exec",
+                    "-w",
+                    "/repo",
+                    &self.container_name,
+                    "python3",
+                    "-c",
+                    &check_cmd,
                 ])
                 .stdout(Stdio::null())
                 .stderr(Stdio::null())
@@ -230,29 +238,31 @@ impl DockerSandbox {
             self.tool_port, tool_name
         );
 
-        let result = tokio::time::timeout(
-            std::time::Duration::from_millis(65_000),
-            async {
-                let mut child = Command::new("docker")
-                    .args([
-                        "exec", "-i", "-w", "/repo",
-                        &self.container_name,
-                        "python3", "-c", &script,
-                    ])
-                    .stdin(Stdio::piped())
-                    .stdout(Stdio::piped())
-                    .stderr(Stdio::piped())
-                    .spawn()?;
-
-                if let Some(ref mut stdin) = child.stdin {
-                    use tokio::io::AsyncWriteExt;
-                    stdin.write_all(args_json.as_bytes()).await?;
-                    stdin.shutdown().await?;
-                }
-
-                child.wait_with_output().await
+        let result = tokio::time::timeout(std::time::Duration::from_millis(65_000), async {
+            let mut child = Command::new("docker")
+                .args([
+                    "exec",
+                    "-i",
+                    "-w",
+                    "/repo",
+                    &self.container_name,
+                    "python3",
+                    "-c",
+                    &script,
+                ])
+                .stdin(Stdio::piped())
+                .stdout(Stdio::piped())
+                .stderr(Stdio::piped())
+                .spawn()?;
+
+            if let Some(ref mut stdin) = child.stdin {
+                use tokio::io::AsyncWriteExt;
+                stdin.write_all(args_json.as_bytes()).await?;
+                stdin.shutdown().await?;
             }
-        )
+
+            child.wait_with_output().await
+        })
         .await;
 
         match result {
@@ -322,7 +332,14 @@ impl DockerSandbox {
         let tee_cmd = format!("cat > '{}'", abs_path);
         let mut child = Command::new("docker")
             .args([
-                "exec", "-i", "-w", "/repo", &self.container_name, "bash", "-c", &tee_cmd,
+                "exec",
+                "-i",
+                "-w",
+                "/repo",
+                &self.container_name,
+                "bash",
+                "-c",
+                &tee_cmd,
             ])
             .stdin(Stdio::piped())
             .stdout(Stdio::null())
@@ -416,7 +433,6 @@ impl DockerSandbox {
     pub fn name(&self) -> &str {
         &self.container_name
     }
-
 }
 
 /// Ensure the sandbox is destroyed when dropped (best-effort sync cleanup).
diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs
index ab84631..4059b73 100644
--- a/src/swe/extractor.rs
+++ b/src/swe/extractor.rs
@@ -192,7 +192,7 @@ impl PatchExtractor {
         let sandbox =
             DockerSandbox::start(input.repository, base_commit, input.language, None).await?;
 
-        let diff_ref = match (input.base_commit, input.merge_commit) {
+        let diff_cmd = match (input.base_commit, input.merge_commit) {
             (Some(base), Some(merge)) if !base.is_empty() && !merge.is_empty() => {
                 // Fetch the merge commit (shallow clone may not have it)
                 sandbox
@@ -201,7 +201,7 @@ impl PatchExtractor {
                         60_000,
                     )
                     .await;
-                format!("{base}..{merge}")
+                format!("git diff --no-color --unified=3 {} {} 2>&1", base, merge)
             }
             (_, Some(merge)) if !merge.is_empty() => {
                 sandbox
@@ -210,17 +210,12 @@ impl PatchExtractor {
                         60_000,
                     )
                     .await;
-                merge.to_string()
+                format!("git diff --no-color --unified=3 HEAD {} 2>&1", merge)
             }
-            _ => "HEAD".to_string(),
+            _ => "git diff --no-color --unified=3 HEAD~1 HEAD 2>&1".to_string(),
         };
 
-        let result = sandbox
-            .exec(
-                &format!("git show --no-color --unified=3 {} 2>&1", diff_ref),
-                60_000,
-            )
-            .await;
+        let result = sandbox.exec(&diff_cmd, 60_000).await;
 
         sandbox.destroy().await;
 
diff --git a/src/swe/harness.rs b/src/swe/harness.rs
index 900f751..84ddea8 100644
--- a/src/swe/harness.rs
+++ b/src/swe/harness.rs
@@ -273,10 +273,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
     }
 
     // Clone repo (full clone for reliable checkout)
-    let clone_cmd = format!(
-        "git clone https://github.com/{}.git /repo 2>&1",
-        task.repo
-    );
+    let clone_cmd = format!("git clone https://github.com/{}.git /repo 2>&1", task.repo);
     let (code, _, err) = docker_exec(&cname, &clone_cmd, 600).await;
     if code != 0 {
         result.error = Some(format!("Clone failed: {}", truncate(&err, 500)));
diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs
index 61a5dd8..bc86573 100644
--- a/src/swe/test_generator.rs
+++ b/src/swe/test_generator.rs
@@ -701,7 +701,10 @@ impl TestGenerator {
 
         // Reset repo to clean state before applying patch (agent may have modified files)
         sandbox
-            .exec("cd /repo && git checkout -- . 2>/dev/null && git clean -fd 2>/dev/null", 30_000)
+            .exec(
+                "cd /repo && git checkout -- . 2>/dev/null && git clean -fd 2>/dev/null",
+                30_000,
+            )
             .await;
 
         // Re-write test files (they were cleaned by git checkout)
@@ -718,12 +721,18 @@ impl TestGenerator {
         }
 
         let apply_result = sandbox
-            .exec("cd /repo && git apply --allow-empty .swe_forge_pr.patch 2>&1", 30_000)
+            .exec(
+                "cd /repo && git apply --allow-empty .swe_forge_pr.patch 2>&1",
+                30_000,
+            )
             .await;
 
         if apply_result.exit_code != 0 {
             let apply_3way = sandbox
-                .exec("cd /repo && git apply --3way .swe_forge_pr.patch 2>&1", 30_000)
+                .exec(
+                    "cd /repo && git apply --3way .swe_forge_pr.patch 2>&1",
+                    30_000,
+                )
                 .await;
             if apply_3way.exit_code != 0 {
                 tracing::warn!(
@@ -732,7 +741,9 @@ impl TestGenerator {
                     exit = apply_3way.exit_code,
                     "Patch apply failed, rejecting task"
                 );
-                sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
+                sandbox
+                    .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+                    .await;
                 return ValidationResult::Rejected(
                     "PR patch could not be applied to the base commit. The test cannot be validated.".to_string()
                 );
@@ -743,8 +754,12 @@ impl TestGenerator {
         for cmd in &submit.fail_to_pass {
             let result = sandbox.exec(cmd, 60_000).await;
             if result.exit_code != 0 {
-                sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
-                sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await;
+                sandbox
+                    .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+                    .await;
+                sandbox
+                    .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
+                    .await;
                 for tf in test_files {
                     let _ = sandbox.write_file(&tf.path, &tf.content).await;
                 }
@@ -762,8 +777,12 @@ impl TestGenerator {
         for cmd in &submit.pass_to_pass {
             let result = sandbox.exec(cmd, 60_000).await;
             if result.exit_code != 0 {
-                sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
-                sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await;
+                sandbox
+                    .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+                    .await;
+                sandbox
+                    .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
+                    .await;
                 for tf in test_files {
                     let _ = sandbox.write_file(&tf.path, &tf.content).await;
                 }
@@ -778,8 +797,12 @@ impl TestGenerator {
         }
 
         // Revert to base commit for cleanliness
-        sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
-        sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await;
+        sandbox
+            .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+            .await;
+        sandbox
+            .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
+            .await;
         for tf in test_files {
             let _ = sandbox.write_file(&tf.path, &tf.content).await;
         }
@@ -847,14 +870,17 @@ impl TestGenerator {
                     .await;
                 let output = if !result.stdout.is_empty() {
                     // Parse JSON response from tool server
-                    match serde_json::from_str::<serde_json::Value>(&result.stdout.trim()) {
+                    match serde_json::from_str::<serde_json::Value>(result.stdout.trim()) {
                         Ok(v) => {
                             if let Some(err) = v.get("error").and_then(|e| e.as_str()) {
                                 err.to_string()
                             } else if let Some(out) = v.get("output").and_then(|o| o.as_str()) {
                                 let mut s = out.to_string();
                                 if let Some(total) = v.get("total_lines").and_then(|t| t.as_u64()) {
-                                    if v.get("truncated").and_then(|t| t.as_bool()).unwrap_or(false) {
+                                    if v.get("truncated")
+                                        .and_then(|t| t.as_bool())
+                                        .unwrap_or(false)
+                                    {
                                         s.push_str(&format!("\n\n[Showing {}/{} lines. Use offset/limit to see more.]",
                                             v.get("shown_lines").and_then(|s| s.as_u64()).unwrap_or(0), total));
                                     } else {
diff --git a/src/swe/workspace_validator.rs b/src/swe/workspace_validator.rs
index b4619c8..8345297 100644
--- a/src/swe/workspace_validator.rs
+++ b/src/swe/workspace_validator.rs
@@ -84,6 +84,26 @@ impl WorkspaceValidator {
         sandbox: &DockerSandbox,
         task: &SweTask,
     ) -> Result<ValidationOutcome, anyhow::Error> {
+        // --- Install language runtime if needed ---
+        let runtime_install = match task.language.to_lowercase().as_str() {
+            "go" | "golang" => Some("apt-get update -qq && apt-get install -y -qq golang > /dev/null 2>&1"),
+            "javascript" | "typescript" | "js" | "ts" => Some("apt-get update -qq && apt-get install -y -qq nodejs npm > /dev/null 2>&1"),
+            "rust" => Some("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y > /dev/null 2>&1 && . $HOME/.cargo/env"),
+            "java" => Some("apt-get update -qq && apt-get install -y -qq default-jdk maven > /dev/null 2>&1"),
+            _ => None,
+        };
+        if let Some(cmd) = runtime_install {
+            let rt_result = sandbox.exec(&format!("{} 2>&1", cmd), 300_000).await;
+            if rt_result.exit_code != 0 {
+                tracing::warn!(
+                    task_id = %task.id,
+                    language = %task.language,
+                    exit = rt_result.exit_code,
+                    "Runtime install failed during validation (continuing)"
+                );
+            }
+        }
+
         // --- Install ---
         if let Some(install_cmd) = task.install_config.get("install") {
             if !install_cmd.is_empty() && !install_cmd.starts_with('#') {

From e777e92099b00dd9a4d5921afbdb3e2ad7a2a23c Mon Sep 17 00:00:00 2001
From: echobt <mathis.massimino+echo@cortex.foundation>
Date: Wed, 18 Feb 2026 13:43:29 +0000
Subject: [PATCH 2/8] security: add input validation for shell commands and fix
 expect() in library code

- Add validate_git_ref() and validate_repo_name() to prevent command
  injection via shell metacharacters in user-controlled inputs
- Validate commit hashes in extractor.rs before interpolating into
  git fetch/diff commands
- Validate repo name and base_commit in docker_sandbox.rs before
  interpolating into git clone/checkout commands
- Validate repo name and base_commit in harness.rs before
  interpolating into git clone/checkout commands
- Replace expect() with Result return types in LiteLlmClient::new()
  and LiteLlmClient::new_with_defaults() per AGENTS.md rules
- Add comprehensive unit tests for validation functions
---
 src/llm/litellm.rs        |  50 +++++++++++----
 src/swe/docker_sandbox.rs |   4 ++
 src/swe/extractor.rs      |   9 ++-
 src/swe/harness.rs        |  14 +++-
 src/swe/mod.rs            | 130 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 191 insertions(+), 16 deletions(-)

diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs
index e0be9b4..c153270 100644
--- a/src/llm/litellm.rs
+++ b/src/llm/litellm.rs
@@ -324,16 +324,26 @@ impl LiteLlmClient {
     /// * `api_base` - Base URL for the LiteLLM API (e.g., "http://localhost:4000")
     /// * `api_key` - Optional API key for authentication
     /// * `default_model` - Default model to use when none is specified
-    pub fn new(api_base: String, api_key: Option<String>, default_model: String) -> Self {
-        Self {
+    ///
+    /// # Errors
+    ///
+    /// Returns `LlmError::RequestFailed` if the HTTP client cannot be built.
+    pub fn new(
+        api_base: String,
+        api_key: Option<String>,
+        default_model: String,
+    ) -> Result<Self, LlmError> {
+        Ok(Self {
             api_base,
             api_key,
             default_model,
             http_client: Client::builder()
                 .timeout(Duration::from_secs(300))
                 .build()
-                .expect("Failed to build HTTP client"),
-        }
+                .map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
+                })?,
+        })
     }
 
     /// Create a new LiteLLM client pre-configured for OpenRouter.
@@ -347,16 +357,22 @@ impl LiteLlmClient {
     /// A client configured with:
     /// - api_base: "https://openrouter.ai/api/v1"
     /// - default_model: "anthropic/claude-opus-4.5"
-    pub fn new_with_defaults(api_key: String) -> Self {
-        Self {
+    ///
+    /// # Errors
+    ///
+    /// Returns `LlmError::RequestFailed` if the HTTP client cannot be built.
+    pub fn new_with_defaults(api_key: String) -> Result<Self, LlmError> {
+        Ok(Self {
             api_base: "https://openrouter.ai/api/v1".to_string(),
             api_key: Some(api_key),
             default_model: "anthropic/claude-opus-4.5".to_string(),
             http_client: Client::builder()
                 .timeout(Duration::from_secs(300))
                 .build()
-                .expect("Failed to build HTTP client"),
-        }
+                .map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
+                })?,
+        })
     }
 
     /// Create a new LiteLLM client from environment variables.
@@ -382,7 +398,9 @@ impl LiteLlmClient {
             http_client: Client::builder()
                 .timeout(Duration::from_secs(300))
                 .build()
-                .expect("Failed to build HTTP client"),
+                .map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
+                })?,
         })
     }
 
@@ -929,7 +947,8 @@ mod tests {
             "http://localhost:4000".to_string(),
             Some("test-key".to_string()),
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();
 
         assert_eq!(client.api_base(), "http://localhost:4000");
         assert_eq!(client.default_model(), "gpt-4");
@@ -942,14 +961,15 @@ mod tests {
             "http://localhost:4000".to_string(),
             None,
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();
 
         assert!(!client.has_api_key());
     }
 
     #[test]
     fn test_litellm_client_new_with_defaults() {
-        let client = LiteLlmClient::new_with_defaults("test-api-key".to_string());
+        let client = LiteLlmClient::new_with_defaults("test-api-key".to_string()).unwrap();
 
         assert_eq!(client.api_base(), "https://openrouter.ai/api/v1");
         assert_eq!(client.default_model(), "anthropic/claude-opus-4.5");
@@ -963,7 +983,8 @@ mod tests {
             "http://localhost:65535".to_string(), // Use a port that's unlikely to have a server
             None,
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();
 
         let request = GenerationRequest::new("gpt-4", vec![Message::user("test")]);
         let result = client.generate(request).await;
@@ -1002,7 +1023,8 @@ mod tests {
             "http://localhost:65535".to_string(),
             None,
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();
         let cache = PromptCache::new(100);
 
         // Create a request with a system prompt
diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs
index 13b9104..2288b81 100644
--- a/src/swe/docker_sandbox.rs
+++ b/src/swe/docker_sandbox.rs
@@ -8,6 +8,7 @@ use std::process::Stdio;
 use tokio::process::Command;
 
 use crate::swe::tool_server::TOOL_SERVER_PY;
+use crate::swe::{validate_git_ref, validate_repo_name};
 
 /// Shell command output from inside the container.
 pub struct SandboxOutput {
@@ -37,6 +38,9 @@ impl DockerSandbox {
         language: &str,
         image_override: Option<&str>,
     ) -> Result<Self> {
+        validate_repo_name(repo)?;
+        validate_git_ref(base_commit)?;
+
         let image = image_override.unwrap_or_else(|| image_for_language(language));
         let safe_name = repo.replace('/', "-").replace(' ', "_");
         let ts_suffix = std::time::SystemTime::now()
diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs
index 4059b73..b13c47f 100644
--- a/src/swe/extractor.rs
+++ b/src/swe/extractor.rs
@@ -4,7 +4,7 @@
 use anyhow::Result;
 
 use crate::swe::docker_sandbox::DockerSandbox;
-use crate::swe::SweTask;
+use crate::swe::{validate_git_ref, SweTask};
 
 fn github_token() -> Option<String> {
     std::env::var("GITHUB_TOKEN")
@@ -188,6 +188,13 @@ impl PatchExtractor {
 
     /// Clone the repo inside a Docker container and extract the diff.
     async fn fetch_diff_from_docker(&self, input: &PatchExtractionInput<'_>) -> Result<String> {
+        if let Some(base) = input.base_commit {
+            validate_git_ref(base)?;
+        }
+        if let Some(merge) = input.merge_commit {
+            validate_git_ref(merge)?;
+        }
+
         let base_commit = input.base_commit.unwrap_or("");
         let sandbox =
             DockerSandbox::start(input.repository, base_commit, input.language, None).await?;
diff --git a/src/swe/harness.rs b/src/swe/harness.rs
index 84ddea8..0658f10 100644
--- a/src/swe/harness.rs
+++ b/src/swe/harness.rs
@@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize};
 use tokio::process::Command;
 use tracing::{info, warn};
 
-use super::SweTask;
+use super::{validate_git_ref, validate_repo_name, SweTask};
 
 // ---------------------------------------------------------------------------
 // Config
@@ -203,6 +203,18 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
         container_id: Some(cname.clone()),
     };
 
+    // 0. INPUT VALIDATION: reject shell-unsafe repo names and commit refs
+    if let Err(e) = validate_repo_name(&task.repo) {
+        result.error = Some(format!("Invalid repo name: {e}"));
+        return result;
+    }
+    if !task.base_commit.is_empty() {
+        if let Err(e) = validate_git_ref(&task.base_commit) {
+            result.error = Some(format!("Invalid base commit: {e}"));
+            return result;
+        }
+    }
+
     // 1. SETUP: start container
     info!(task_id = %task.id, "Starting container {}", cname);
     let agent_dir_abs = if let Ok(docker_dir) = std::env::var("DOCKER_AGENT_DIR") {
diff --git a/src/swe/mod.rs b/src/swe/mod.rs
index 0e4f42c..db34ff8 100644
--- a/src/swe/mod.rs
+++ b/src/swe/mod.rs
@@ -46,6 +46,67 @@ pub use workspace_validator::{ValidationOutcome, WorkspaceValidator};
 /// Default output directory for generated SWE workspaces.
 pub const DEFAULT_SWE_OUTPUT_DIR: &str = "./generated-swe";
 
+/// Validate a git ref (commit SHA, branch name) to prevent shell injection.
+///
+/// Accepts hex-only SHAs (short or full) and standard git ref names
+/// (alphanumeric, `/`, `.`, `-`, `_`). Rejects shell metacharacters.
+pub fn validate_git_ref(s: &str) -> Result<(), anyhow::Error> {
+    if s.is_empty() {
+        return Ok(());
+    }
+    if s.len() > 256 {
+        anyhow::bail!("git ref too long ({} chars, max 256)", s.len());
+    }
+    for ch in s.chars() {
+        if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '.' | '-' | '_' | '~' | '^') {
+            anyhow::bail!(
+                "invalid character '{}' in git ref '{}': only alphanumeric, /, ., -, _, ~, ^ allowed",
+                ch,
+                s
+            );
+        }
+    }
+    Ok(())
+}
+
+/// Validate a GitHub repository name (`owner/repo`) to prevent shell injection.
+///
+/// Accepts the standard GitHub `owner/repo` format where both parts contain
+/// only alphanumeric characters, hyphens, underscores, and dots.
+pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> {
+    if s.is_empty() {
+        anyhow::bail!("repository name is empty");
+    }
+    if s.len() > 256 {
+        anyhow::bail!("repository name too long ({} chars, max 256)", s.len());
+    }
+    let parts: Vec<&str> = s.split('/').collect();
+    if parts.len() != 2 {
+        anyhow::bail!(
+            "invalid repository name '{}': expected 'owner/repo' format",
+            s
+        );
+    }
+    for part in &parts {
+        if part.is_empty() {
+            anyhow::bail!(
+                "invalid repository name '{}': owner and repo must be non-empty",
+                s
+            );
+        }
+        for ch in part.chars() {
+            if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.') {
+                anyhow::bail!(
+                    "invalid character '{}' in repository name '{}': only alphanumeric, -, _, . allowed",
+                    ch,
+                    s
+                );
+            }
+        }
+    }
+    Ok(())
+}
+
 /// DataForge-compatible task format for SWE mined items.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SweTask {
@@ -213,3 +274,72 @@ impl SweTask {
         map
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn validate_git_ref_accepts_hex_sha() {
+        assert!(validate_git_ref("abc123def456").is_ok());
+        assert!(validate_git_ref("a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2").is_ok());
+    }
+
+    #[test]
+    fn validate_git_ref_accepts_empty() {
+        assert!(validate_git_ref("").is_ok());
+    }
+
+    #[test]
+    fn validate_git_ref_accepts_branch_names() {
+        assert!(validate_git_ref("main").is_ok());
+        assert!(validate_git_ref("feature/my-branch").is_ok());
+        assert!(validate_git_ref("HEAD~1").is_ok());
+        assert!(validate_git_ref("v1.2.3").is_ok());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_shell_injection() {
+        assert!(validate_git_ref("abc123; rm -rf /").is_err());
+        assert!(validate_git_ref("$(whoami)").is_err());
+        assert!(validate_git_ref("`id`").is_err());
+        assert!(validate_git_ref("abc | cat /etc/passwd").is_err());
+        assert!(validate_git_ref("abc && echo pwned").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_too_long() {
+        let long_ref = "a".repeat(257);
+        assert!(validate_git_ref(&long_ref).is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_accepts_valid() {
+        assert!(validate_repo_name("owner/repo").is_ok());
+        assert!(validate_repo_name("my-org/my-repo").is_ok());
+        assert!(validate_repo_name("user123/project.js").is_ok());
+        assert!(validate_repo_name("Org_Name/Repo_Name").is_ok());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_shell_injection() {
+        assert!(validate_repo_name("owner/repo; rm -rf /").is_err());
+        assert!(validate_repo_name("$(whoami)/repo").is_err());
+        assert!(validate_repo_name("owner/repo && echo pwned").is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_invalid_format() {
+        assert!(validate_repo_name("").is_err());
+        assert!(validate_repo_name("noslash").is_err());
+        assert!(validate_repo_name("too/many/slashes").is_err());
+        assert!(validate_repo_name("/repo").is_err());
+        assert!(validate_repo_name("owner/").is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_too_long() {
+        let long_name = format!("{}/{}", "a".repeat(128), "b".repeat(128));
+        assert!(validate_repo_name(&long_name).is_err());
+    }
+}

From 3e673e82b5d2fcad4bf8eb51b34afdd7d65d57fb Mon Sep 17 00:00:00 2001
From: echobt <mathis.massimino+echo@cortex.foundation>
Date: Wed, 18 Feb 2026 13:51:44 +0000
Subject: [PATCH 3/8] fix(dead-code): remove unnecessary #[allow(dead_code)]
 and unused PatchBlock.lines field

- Remove #[allow(dead_code)] from ApiToolCall: all fields are either
  accessed in generate() or suppressed by underscore prefix (_tool_type)
- Remove #[allow(dead_code)] from ApiToolCallFunction: both fields
  (name, arguments) are accessed in generate()
- Remove unused PatchBlock.lines field in extractor.rs: accumulated
  but never read (split_solution_and_tests only returns .patch)
---
 src/llm/litellm.rs   | 2 --
 src/swe/extractor.rs | 3 ---
 2 files changed, 5 deletions(-)

diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs
index c153270..a1b4cc2 100644
--- a/src/llm/litellm.rs
+++ b/src/llm/litellm.rs
@@ -549,7 +549,6 @@ struct ApiMessage {
 
 /// A tool call returned by the model.
 #[derive(Debug, Deserialize)]
-#[allow(dead_code)]
 struct ApiToolCall {
     #[serde(default)]
     id: String,
@@ -560,7 +559,6 @@ struct ApiToolCall {
 
 /// Function details within a tool call response.
 #[derive(Debug, Deserialize)]
-#[allow(dead_code)]
 struct ApiToolCallFunction {
     #[serde(default)]
     name: String,
diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs
index b13c47f..a2b9c19 100644
--- a/src/swe/extractor.rs
+++ b/src/swe/extractor.rs
@@ -265,7 +265,6 @@ impl PatchExtractor {
 #[derive(Default)]
 struct PatchBlock {
     patch: String,
-    lines: usize,
 }
 
 fn count_line_delta(raw: &str) -> (usize, usize) {
@@ -369,10 +368,8 @@ fn append_to_partition(
 ) {
     if is_test_file(file_path) {
         tests.patch.push_str(block);
-        tests.lines = tests.lines.saturating_add(block.lines().count());
     } else {
         solution.patch.push_str(block);
-        solution.lines = solution.lines.saturating_add(block.lines().count());
     }
 }
 

From 1a6bfc3c57703ef5844f7a5b39242673535169e5 Mon Sep 17 00:00:00 2001
From: echobt <mathis.massimino+echo@cortex.foundation>
Date: Wed, 18 Feb 2026 13:56:43 +0000
Subject: [PATCH 4/8] fix(swe): log errors instead of silently dropping sandbox
 write_file results

Replace 'let _ = sandbox.write_file(...)' with proper error logging
using tracing::warn in test_generator.rs and workspace_validator.rs.
These were silently swallowing errors during test file restoration
in validation cleanup paths.
---
 src/swe/test_generator.rs      | 12 +++++++++---
 src/swe/workspace_validator.rs |  4 +++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs
index bc86573..653be13 100644
--- a/src/swe/test_generator.rs
+++ b/src/swe/test_generator.rs
@@ -761,7 +761,9 @@ impl TestGenerator {
                     .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
                     .await;
                 for tf in test_files {
-                    let _ = sandbox.write_file(&tf.path, &tf.content).await;
+                    if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                        tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after f2p rejection");
+                    }
                 }
                 return ValidationResult::Rejected(format!(
                     "fail_to_pass test '{}' still FAILS after the PR patch is applied (exit={}, stderr={}). \
@@ -784,7 +786,9 @@ impl TestGenerator {
                     .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
                     .await;
                 for tf in test_files {
-                    let _ = sandbox.write_file(&tf.path, &tf.content).await;
+                    if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                        tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after p2p rejection");
+                    }
                 }
                 return ValidationResult::Rejected(format!(
                     "pass_to_pass test '{}' FAILS after the PR patch (exit={}, stderr={}). \
@@ -804,7 +808,9 @@ impl TestGenerator {
             .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
             .await;
         for tf in test_files {
-            let _ = sandbox.write_file(&tf.path, &tf.content).await;
+            if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after validation");
+            }
         }
 
         ValidationResult::Accepted
diff --git a/src/swe/workspace_validator.rs b/src/swe/workspace_validator.rs
index 8345297..190f280 100644
--- a/src/swe/workspace_validator.rs
+++ b/src/swe/workspace_validator.rs
@@ -211,7 +211,9 @@ impl WorkspaceValidator {
         if let Some(test_files_json) = task.meta.get("test_files") {
             if let Ok(files) = serde_json::from_str::<Vec<TestFile>>(test_files_json) {
                 for tf in &files {
-                    let _ = sandbox.write_file(&tf.path, &tf.content).await;
+                    if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                        tracing::warn!(path = %tf.path, error = %e, "Failed to re-write test file after patch");
+                    }
                 }
             }
         }

From 0382ea935233063e006f1b386be19759e77acb75 Mon Sep 17 00:00:00 2001
From: echobt <mathis.massimino+echo@cortex.foundation>
Date: Wed, 18 Feb 2026 14:09:52 +0000
Subject: [PATCH 5/8] fix(security): harden input validation against shell
 injection and path traversal

- validate_git_ref: reject empty refs, '..' sequences, and leading '-' (flag injection)
- validate_repo_name: reject parts starting with '.' or '-' (path traversal, flag injection)
- Add validate_file_path: reject shell metacharacters, null bytes, '..', absolute paths
- docker_sandbox: validate file paths in write_file, write_file_abs, read_file
- docker_sandbox: restrict write_file_abs to /tools/ prefix, validate tool_name
- harness: validate file paths in docker_write_file
- litellm: replace unwrap_or(Null) with proper error propagation for tools serialization
- extractor: guard validate_git_ref calls with empty checks for optional refs
---
 src/llm/litellm.rs        |  19 ++++--
 src/swe/docker_sandbox.rs |  62 ++++++++++++++++-
 src/swe/extractor.rs      |   8 ++-
 src/swe/harness.rs        |   4 +-
 src/swe/mod.rs            | 138 ++++++++++++++++++++++++++++++++++++--
 5 files changed, 215 insertions(+), 16 deletions(-)

diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs
index a1b4cc2..820cda7 100644
--- a/src/llm/litellm.rs
+++ b/src/llm/litellm.rs
@@ -599,12 +599,19 @@ impl LlmProvider for LiteLlmClient {
             request.model.clone()
         };
 
-        let tools = request
-            .tools
-            .map(|t| serde_json::to_value(&t).unwrap_or(serde_json::Value::Null));
-        let tool_choice = request
-            .tool_choice
-            .map(|tc| serde_json::to_value(&tc).unwrap_or(serde_json::Value::Null));
+        let tools =
+            match request.tools {
+                Some(t) => Some(serde_json::to_value(&t).map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to serialize tools: {e}"))
+                })?),
+                None => None,
+            };
+        let tool_choice = match request.tool_choice {
+            Some(tc) => Some(serde_json::to_value(&tc).map_err(|e| {
+                LlmError::RequestFailed(format!("Failed to serialize tool_choice: {e}"))
+            })?),
+            None => None,
+        };
 
         let api_request = ApiRequest {
             model: model.clone(),
diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs
index 2288b81..66518c6 100644
--- a/src/swe/docker_sandbox.rs
+++ b/src/swe/docker_sandbox.rs
@@ -8,7 +8,7 @@ use std::process::Stdio;
 use tokio::process::Command;
 
 use crate::swe::tool_server::TOOL_SERVER_PY;
-use crate::swe::{validate_git_ref, validate_repo_name};
+use crate::swe::{validate_file_path, validate_git_ref, validate_repo_name};
 
 /// Shell command output from inside the container.
 pub struct SandboxOutput {
@@ -39,7 +39,9 @@ impl DockerSandbox {
         image_override: Option<&str>,
     ) -> Result<Self> {
         validate_repo_name(repo)?;
-        validate_git_ref(base_commit)?;
+        if !base_commit.is_empty() {
+            validate_git_ref(base_commit)?;
+        }
 
         let image = image_override.unwrap_or_else(|| image_for_language(language));
         let safe_name = repo.replace('/', "-").replace(' ', "_");
@@ -227,6 +229,16 @@ impl DockerSandbox {
     /// Call a tool on the HTTP tool server running inside the container.
     /// Pipes the JSON args via stdin to avoid shell escaping issues.
     pub async fn tool_request(&self, tool_name: &str, args_json: &str) -> SandboxOutput {
+        for ch in tool_name.chars() {
+            if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
+                return SandboxOutput {
+                    stdout: String::new(),
+                    stderr: format!("Invalid tool name: {}", tool_name),
+                    exit_code: -1,
+                };
+            }
+        }
+
         let script = format!(
             "import sys, urllib.request, json\n\
              data = sys.stdin.read()\n\
@@ -329,7 +341,49 @@ impl DockerSandbox {
     }
 
     /// Write a file to an absolute path inside the container.
+    ///
+    /// Only allows paths under known safe prefixes (`/tools/`).
     async fn write_file_abs(&self, abs_path: &str, content: &str) -> Result<()> {
+        if !abs_path.starts_with("/tools/") {
+            anyhow::bail!(
+                "write_file_abs only allows paths under /tools/, got '{}'",
+                abs_path
+            );
+        }
+        for ch in abs_path.chars() {
+            if matches!(
+                ch,
+                '\'' | '"'
+                    | '`'
+                    | '$'
+                    | '!'
+                    | '&'
+                    | '|'
+                    | ';'
+                    | '('
+                    | ')'
+                    | '{'
+                    | '}'
+                    | '<'
+                    | '>'
+                    | '\\'
+                    | '\0'
+                    | '\n'
+                    | '\r'
+            ) {
+                anyhow::bail!(
+                    "invalid character in absolute path '{}': shell metacharacters not allowed",
+                    abs_path
+                );
+            }
+        }
+        if abs_path.contains("..") {
+            anyhow::bail!(
+                "absolute path '{}' contains '..' (path traversal not allowed)",
+                abs_path
+            );
+        }
+
         let mkdir_cmd = format!("mkdir -p \"$(dirname '{}')\"", abs_path);
         self.exec(&mkdir_cmd, 10_000).await;
 
@@ -369,6 +423,8 @@ impl DockerSandbox {
 
     /// Write a file inside the container by piping content via stdin.
     pub async fn write_file(&self, path: &str, content: &str) -> Result<()> {
+        validate_file_path(path)?;
+
         // First ensure the parent directory exists
         let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", path);
         self.exec(&mkdir_cmd, 10_000).await;
@@ -410,6 +466,8 @@ impl DockerSandbox {
 
     /// Read a file from inside the container.
     pub async fn read_file(&self, path: &str) -> Result<String> {
+        validate_file_path(path)?;
+
         let cmd = format!("cat '/repo/{}'", path);
         let result = self.exec(&cmd, 10_000).await;
         if result.exit_code != 0 {
diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs
index a2b9c19..871e77d 100644
--- a/src/swe/extractor.rs
+++ b/src/swe/extractor.rs
@@ -189,10 +189,14 @@ impl PatchExtractor {
     /// Clone the repo inside a Docker container and extract the diff.
     async fn fetch_diff_from_docker(&self, input: &PatchExtractionInput<'_>) -> Result<String> {
         if let Some(base) = input.base_commit {
-            validate_git_ref(base)?;
+            if !base.is_empty() {
+                validate_git_ref(base)?;
+            }
         }
         if let Some(merge) = input.merge_commit {
-            validate_git_ref(merge)?;
+            if !merge.is_empty() {
+                validate_git_ref(merge)?;
+            }
         }
 
         let base_commit = input.base_commit.unwrap_or("");
diff --git a/src/swe/harness.rs b/src/swe/harness.rs
index 0658f10..47b4930 100644
--- a/src/swe/harness.rs
+++ b/src/swe/harness.rs
@@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize};
 use tokio::process::Command;
 use tracing::{info, warn};
 
-use super::{validate_git_ref, validate_repo_name, SweTask};
+use super::{validate_file_path, validate_git_ref, validate_repo_name, SweTask};
 
 // ---------------------------------------------------------------------------
 // Config
@@ -144,6 +144,8 @@ async fn docker_rm(container: &str) {
 }
 
 async fn docker_write_file(container: &str, path: &str, content: &str) -> Result<()> {
+    validate_file_path(path)?;
+
     use tokio::io::AsyncWriteExt;
     let tee_cmd = format!("cat > '/repo/{}'", path);
     let mut child = Command::new("docker")
diff --git a/src/swe/mod.rs b/src/swe/mod.rs
index db34ff8..4a25962 100644
--- a/src/swe/mod.rs
+++ b/src/swe/mod.rs
@@ -49,14 +49,24 @@ pub const DEFAULT_SWE_OUTPUT_DIR: &str = "./generated-swe";
 /// Validate a git ref (commit SHA, branch name) to prevent shell injection.
 ///
 /// Accepts hex-only SHAs (short or full) and standard git ref names
-/// (alphanumeric, `/`, `.`, `-`, `_`). Rejects shell metacharacters.
+/// (alphanumeric, `/`, `.`, `-`, `_`). Rejects shell metacharacters,
+/// `..` sequences (path traversal), and refs starting with `-` (flag injection).
 pub fn validate_git_ref(s: &str) -> Result<(), anyhow::Error> {
     if s.is_empty() {
-        return Ok(());
+        anyhow::bail!("git ref is empty");
     }
     if s.len() > 256 {
         anyhow::bail!("git ref too long ({} chars, max 256)", s.len());
     }
+    if s.starts_with('-') {
+        anyhow::bail!(
+            "git ref '{}' must not start with '-' (could be interpreted as a flag)",
+            s
+        );
+    }
+    if s.contains("..") {
+        anyhow::bail!("git ref '{}' must not contain '..' (path traversal)", s);
+    }
     for ch in s.chars() {
         if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '.' | '-' | '_' | '~' | '^') {
             anyhow::bail!(
@@ -72,7 +82,8 @@ pub fn validate_git_ref(s: &str) -> Result<(), anyhow::Error> {
 /// Validate a GitHub repository name (`owner/repo`) to prevent shell injection.
 ///
 /// Accepts the standard GitHub `owner/repo` format where both parts contain
-/// only alphanumeric characters, hyphens, underscores, and dots.
+/// only alphanumeric characters, hyphens, underscores, and dots. Parts must
+/// not start with `.` or `-` to prevent path traversal and flag injection.
 pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> {
     if s.is_empty() {
         anyhow::bail!("repository name is empty");
@@ -94,6 +105,12 @@ pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> {
                 s
             );
         }
+        if part.starts_with('.') || part.starts_with('-') {
+            anyhow::bail!(
+                "invalid repository name '{}': parts must not start with '.' or '-'",
+                s
+            );
+        }
         for ch in part.chars() {
             if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.') {
                 anyhow::bail!(
@@ -107,6 +124,60 @@ pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> {
     Ok(())
 }
 
+/// Validate a file path used in shell commands inside Docker containers.
+///
+/// Rejects paths containing shell metacharacters, single/double quotes,
+/// null bytes, and `..` traversal sequences. This prevents shell injection
+/// when paths are interpolated into commands like `cat > '/repo/{path}'`.
+pub fn validate_file_path(path: &str) -> Result<(), anyhow::Error> {
+    if path.is_empty() {
+        anyhow::bail!("file path is empty");
+    }
+    if path.len() > 4096 {
+        anyhow::bail!("file path too long ({} chars, max 4096)", path.len());
+    }
+    if path.contains('\0') {
+        anyhow::bail!("file path contains null byte");
+    }
+    if path.contains("..") {
+        anyhow::bail!(
+            "file path '{}' contains '..' (path traversal not allowed)",
+            path
+        );
+    }
+    if path.starts_with('/') {
+        anyhow::bail!("file path '{}' must be relative (no leading '/')", path);
+    }
+    for ch in path.chars() {
+        if matches!(
+            ch,
+            '\'' | '"'
+                | '`'
+                | '$'
+                | '!'
+                | '&'
+                | '|'
+                | ';'
+                | '('
+                | ')'
+                | '{'
+                | '}'
+                | '<'
+                | '>'
+                | '\\'
+                | '\n'
+                | '\r'
+        ) {
+            anyhow::bail!(
+                "invalid character '{}' in file path '{}': shell metacharacters not allowed",
+                ch,
+                path
+            );
+        }
+    }
+    Ok(())
+}
+
 /// DataForge-compatible task format for SWE mined items.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SweTask {
@@ -286,8 +357,8 @@ mod tests {
     }
 
     #[test]
-    fn validate_git_ref_accepts_empty() {
-        assert!(validate_git_ref("").is_ok());
+    fn validate_git_ref_rejects_empty() {
+        assert!(validate_git_ref("").is_err());
     }
 
     #[test]
@@ -307,6 +378,18 @@ mod tests {
         assert!(validate_git_ref("abc && echo pwned").is_err());
     }
 
+    #[test]
+    fn validate_git_ref_rejects_double_dot() {
+        assert!(validate_git_ref("main..HEAD").is_err());
+        assert!(validate_git_ref("../../etc/passwd").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_leading_dash() {
+        assert!(validate_git_ref("--exec=whoami").is_err());
+        assert!(validate_git_ref("-n").is_err());
+    }
+
     #[test]
     fn validate_git_ref_rejects_too_long() {
         let long_ref = "a".repeat(257);
@@ -337,9 +420,54 @@ mod tests {
         assert!(validate_repo_name("owner/").is_err());
     }
 
+    #[test]
+    fn validate_repo_name_rejects_leading_dot_or_dash() {
+        assert!(validate_repo_name(".hidden/repo").is_err());
+        assert!(validate_repo_name("owner/.repo").is_err());
+        assert!(validate_repo_name("-flag/repo").is_err());
+        assert!(validate_repo_name("owner/-repo").is_err());
+        assert!(validate_repo_name("..traversal/repo").is_err());
+    }
+
     #[test]
     fn validate_repo_name_rejects_too_long() {
         let long_name = format!("{}/{}", "a".repeat(128), "b".repeat(128));
         assert!(validate_repo_name(&long_name).is_err());
     }
+
+    #[test]
+    fn validate_file_path_accepts_valid() {
+        assert!(validate_file_path("tests/test_foo.py").is_ok());
+        assert!(validate_file_path("src/main.rs").is_ok());
+        assert!(validate_file_path("a/b/c/d.txt").is_ok());
+        assert!(validate_file_path("file.txt").is_ok());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_traversal() {
+        assert!(validate_file_path("../etc/passwd").is_err());
+        assert!(validate_file_path("a/../../etc/passwd").is_err());
+        assert!(validate_file_path("..").is_err());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_shell_metacharacters() {
+        assert!(validate_file_path("file'; rm -rf /; echo '").is_err());
+        assert!(validate_file_path("file$(whoami)").is_err());
+        assert!(validate_file_path("file`id`").is_err());
+        assert!(validate_file_path("file|cat").is_err());
+        assert!(validate_file_path("file;ls").is_err());
+        assert!(validate_file_path("file&echo").is_err());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_empty_and_absolute() {
+        assert!(validate_file_path("").is_err());
+        assert!(validate_file_path("/etc/passwd").is_err());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_null_byte() {
+        assert!(validate_file_path("file\0.txt").is_err());
+    }
 }

From 50c8cad9d97164056bfc73034b004fad599746c2 Mon Sep 17 00:00:00 2001
From: echobt <mathis.massimino+echo@cortex.foundation>
Date: Wed, 18 Feb 2026 14:27:09 +0000
Subject: [PATCH 6/8] fix(harness): validate test file paths before mkdir shell
 command

Add validate_file_path() check before interpolating tf.path into the
mkdir shell command in evaluate_task(). Previously, the mkdir command
at line 340 used the unvalidated path from deserialized test files,
while docker_write_file() on the next line did validate. This created
a window where a malicious path could inject shell commands via the
mkdir step before validation occurred.
---
 src/swe/harness.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/swe/harness.rs b/src/swe/harness.rs
index 47b4930..5cbe5ed 100644
--- a/src/swe/harness.rs
+++ b/src/swe/harness.rs
@@ -337,6 +337,10 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
             serde_json::from_str::<Vec<super::test_generator::TestFile>>(test_files_json)
         {
             for tf in &files {
+                if let Err(e) = validate_file_path(&tf.path) {
+                    warn!(task_id = %task.id, path = %tf.path, error = %e, "Skipping test file with invalid path");
+                    continue;
+                }
                 let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", tf.path);
                 docker_exec(&cname, &mkdir_cmd, 10).await;
                 let write_result = docker_write_file(&cname, &tf.path, &tf.content).await;

From e28910d386ce16d76bc794c9d3fda006aca1d07a Mon Sep 17 00:00:00 2001
From: echobt <mathis.massimino+echo@cortex.foundation>
Date: Wed, 18 Feb 2026 14:37:32 +0000
Subject: [PATCH 7/8] fix(swe): replace error-swallowing let _ = patterns with
 proper logging

Replace 3 instances of 'let _ =' that silently discard Docker command
errors with 'if let Err(e)' + tracing::debug! for proper error
observability while maintaining best-effort cleanup semantics.

Files changed:
- src/swe/docker_sandbox.rs: start() stale container removal, destroy()
- src/swe/harness.rs: docker_rm() helper
---
 src/swe/docker_sandbox.rs | 14 ++++++++++----
 src/swe/harness.rs        |  7 +++++--
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs
index 66518c6..2adf200 100644
--- a/src/swe/docker_sandbox.rs
+++ b/src/swe/docker_sandbox.rs
@@ -55,12 +55,15 @@ impl DockerSandbox {
         let tool_port = 10_000 + (ts_suffix as u16 % 50_000);
 
         // Remove stale container if it exists
-        let _ = Command::new("docker")
+        if let Err(e) = Command::new("docker")
             .args(["rm", "-f", &container_name])
             .stdout(Stdio::null())
             .stderr(Stdio::null())
             .status()
-            .await;
+            .await
+        {
+            tracing::debug!(container = %container_name, error = %e, "Failed to remove stale container (may not exist)");
+        }
 
         let run_output = Command::new("docker")
             .args([
@@ -482,12 +485,15 @@ impl DockerSandbox {
 
     /// Destroy the container.
     pub async fn destroy(&self) {
-        let _ = Command::new("docker")
+        if let Err(e) = Command::new("docker")
             .args(["rm", "-f", &self.container_name])
             .stdout(Stdio::null())
             .stderr(Stdio::null())
             .status()
-            .await;
+            .await
+        {
+            tracing::debug!(container = %self.container_name, error = %e, "Failed to destroy container");
+        }
         tracing::debug!(container = %self.container_name, "Docker sandbox destroyed");
     }
 
diff --git a/src/swe/harness.rs b/src/swe/harness.rs
index 5cbe5ed..90146ed 100644
--- a/src/swe/harness.rs
+++ b/src/swe/harness.rs
@@ -137,10 +137,13 @@ async fn docker_exec(container: &str, cmd: &str, timeout_secs: u64) -> (i32, Str
 }
 
 async fn docker_rm(container: &str) {
-    let _ = Command::new("docker")
+    if let Err(e) = Command::new("docker")
         .args(["rm", "-f", container])
         .output()
-        .await;
+        .await
+    {
+        tracing::debug!(container = container, error = %e, "Failed to remove container (may not exist)");
+    }
 }
 
 async fn docker_write_file(container: &str, path: &str, content: &str) -> Result<()> {

From 8908f667da4eb8e82e90a7cc986b30f1d9a4f0a1 Mon Sep 17 00:00:00 2001
From: echobt <mathis.massimino+echo@cortex.foundation>
Date: Wed, 18 Feb 2026 14:52:45 +0000
Subject: [PATCH 8/8] docs: refresh AGENTS.md

---
 AGENTS.md         | 7 ++++---
 src/swe/AGENTS.md | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 2cf3ea6..66b8063 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -28,6 +28,7 @@ src/
 │   ├── pr_cache.rs          # SQLite-backed PR deduplication cache
 │   ├── progress.rs          # Background progress monitor for long-running pipeline runs
 │   ├── github_search.rs     # GitHub Search API client (alternative PR source)
+│   ├── tool_server.rs       # Embedded Python HTTP tool server injected into Docker containers
 │   └── workspace_validator.rs # Pre-export workspace validation (Docker-based)
 ├── llm/                     # LLM integration layer
 │   ├── litellm.rs           # OpenAI-compatible API client (function calling, tools)
@@ -41,7 +42,7 @@ src/
 ├── export/                  # Parquet dataset export + HuggingFace Hub upload
 ├── difficulty/              # Difficulty levels, resource limits, scoring
 ├── anti_hardcoding/         # Canary strings, sealed parameters, contamination detection
-├── runner/                  # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs]
+├── runner/                  # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs; exists as scaffolding]
 ├── utils/                   # JSON extraction from LLM responses
 └── error.rs                 # Typed error hierarchy (thiserror)
 ```
@@ -135,13 +136,13 @@ Git hooks are in `.githooks/` and activated via `git config core.hooksPath .gith
 
 3. **Never leak test plans into agent prompts** — The `prompt_rewriter.rs` module strips test-specific information from PR bodies before generating `prompt.md`. Any new prompt generation code must ensure `fail_to_pass` and `pass_to_pass` test commands are never visible to the agent being evaluated.
 
-4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: memory (512MB–2GB), CPU (1–4 cores), PIDs (100–500). Never create containers without limits.
+4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: PIDs (100–500), storage (1–5 GB), network mode (none/internal). Never create containers without limits.
 
 5. **Respect GitHub API rate limits (5000 req/h)** — The pipeline uses semaphore-based concurrency (no chunk barriers). Each candidate needs ~2 API calls for enrichment. Never add unbounded concurrent GitHub API calls. Use the existing concurrency limits (enrichment: 10x, pre-classification: 25x, deep processing: 8x).
 
 6. **All async code must be `Send + Sync` compatible** — The codebase uses `Arc<dyn LlmProvider>` extensively. Trait objects must be `Send + Sync`. Never introduce `Rc`, `RefCell`, or non-Send types in async contexts.
 
-7. **Serde rename conventions must be `snake_case`** — All serializable enums use `#[serde(rename_all = "snake_case")]`. Task status, difficulty levels, and all API-facing types must follow this convention for YAML/JSON compatibility.
+7. **Serde rename conventions** — Most serializable enums use `#[serde(rename_all = "snake_case")]` (e.g., `SweTaskStatus`, `HarnessStatus`, `CheckType`). `DifficultyLevel` and `NetworkMode` in `src/difficulty/` use `#[serde(rename_all = "lowercase")]`. Follow the existing convention for each type for YAML/JSON compatibility.
 
 8. **Anti-hardcoding mechanisms must be preserved** — The `anti_hardcoding/` module provides canary strings, sealed parameters, and process validation. Never bypass contamination detection. Any new task generation must embed canary strings via `CanaryConfig::generate()`.
 
diff --git a/src/swe/AGENTS.md b/src/swe/AGENTS.md
index 3ee4bc2..ac52fef 100644
--- a/src/swe/AGENTS.md
+++ b/src/swe/AGENTS.md
@@ -22,6 +22,7 @@ Core SWE mining pipeline. Fetches merged pull requests from GH Archive, enriches
 | `pipeline.rs` | Streaming pipeline with semaphore-based concurrency |
 | `github_search.rs` | `GitHubSearchClient` — GitHub Search API as alternative PR source (30 req/min) |
 | `workspace_validator.rs` | `WorkspaceValidator` — pre-export Docker-based validation (install, tests, patch application) |
+| `tool_server.rs` | Embedded Python HTTP tool server injected into Docker containers (read_file, list_dir, grep_files, search_files, apply_patch) |
 | `pr_cache.rs` | SQLite-backed PR deduplication cache |
 | `progress.rs` | `ProgressMonitor` — background progress logging for long-running pipeline runs |