From a01f07eb4ad6d404f978640fcb899fae4b3a78c9 Mon Sep 17 00:00:00 2001 From: echobt Date: Wed, 18 Feb 2026 13:33:10 +0000 Subject: [PATCH 1/8] fix(llm+swe): add tool_calls support to LiteLlmClient and fix pipeline bugs LiteLlmClient tool_calls support (src/llm/litellm.rs): - Add tools and tool_choice fields (as serde_json::Value) to ApiRequest with skip_serializing_if to avoid sending null to providers - Add ApiToolCall and ApiToolCallFunction deserialization structs for parsing function call responses from the API - Add reasoning and reasoning_content fields to ApiMessage for reasoning models - Make finish_reason optional in ApiChoice (some providers omit it) - Implement full tool_calls parsing in generate() response conversion, mapping API tool calls to ToolCallInfo/ToolCallFunction types matching OpenRouterProvider - Add content extraction priority: content > reasoning_content > reasoning, with fallback to tool call arguments when content is empty - Update test to verify tools/tool_choice are properly excluded when None Patch extractor fix (src/swe/extractor.rs): - Replace git show with git diff for extracting PR patches, which produces cleaner unified diffs without commit metadata that confused downstream parsing - Handle all three cases: base..merge, HEAD..merge, and HEAD~1..HEAD Test generator clippy fix (src/swe/test_generator.rs): - Remove unnecessary reference on result.stdout.trim() call Workspace validator runtime install (src/swe/workspace_validator.rs): - Add language runtime installation step before running validation tests in Docker containers (Go, Node.js, Rust, Java) since the base image may not include them, causing false setup_error failures Formatting (docker_sandbox.rs, harness.rs, test_generator.rs, extractor.rs): - Apply cargo fmt to fix formatting across all modified SWE pipeline files Gitignore (.gitignore): - Add mine_test.log and test-easy-output/ to prevent committing test artifacts --- .gitignore | 2 + src/llm/litellm.rs | 104 +++++++++++++++++++++++++++++---- src/swe/docker_sandbox.rs | 80 +++++++++++++++---------- src/swe/extractor.rs | 15 ++--- src/swe/harness.rs | 5 +- src/swe/test_generator.rs | 50 ++++++++++++---- src/swe/workspace_validator.rs | 20 +++++++ 7 files changed, 208 insertions(+), 68 deletions(-) diff --git a/.gitignore b/.gitignore index 3d29c38..5f04bf9 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,5 @@ test-verify/ *.db *.db-shm *.db-wal +mine_test.log +test-easy-output/ diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs index 4e82dc6..e0be9b4 100644 --- a/src/llm/litellm.rs +++ b/src/llm/litellm.rs @@ -490,6 +490,10 @@ struct ApiRequest { top_p: Option, #[serde(skip_serializing_if = "Option::is_none")] response_format: Option, + #[serde(skip_serializing_if = "Option::is_none")] + tools: Option, + #[serde(skip_serializing_if = "Option::is_none")] + tool_choice: Option, } /// Internal response structure from the OpenAI-compatible API. @@ -506,14 +510,44 @@ struct ApiResponse { struct ApiChoice { index: u32, message: ApiMessage, - finish_reason: String, + #[serde(default)] + finish_reason: Option, } /// Internal message structure from the API response. +/// Supports reasoning models and tool calls. #[derive(Debug, Deserialize)] struct ApiMessage { role: String, + #[serde(default)] content: String, + #[serde(default)] + reasoning: Option, + #[serde(default)] + reasoning_content: Option, + #[serde(default)] + tool_calls: Option>, +} + +/// A tool call returned by the model. +#[derive(Debug, Deserialize)] +#[allow(dead_code)] +struct ApiToolCall { + #[serde(default)] + id: String, + #[serde(rename = "type", default)] + _tool_type: String, + function: ApiToolCallFunction, +} + +/// Function details within a tool call response. +#[derive(Debug, Deserialize)] +#[allow(dead_code)] +struct ApiToolCallFunction { + #[serde(default)] + name: String, + #[serde(default)] + arguments: String, } /// Internal usage structure from the API response. @@ -549,6 +583,13 @@ impl LlmProvider for LiteLlmClient { request.model.clone() }; + let tools = request + .tools + .map(|t| serde_json::to_value(&t).unwrap_or(serde_json::Value::Null)); + let tool_choice = request + .tool_choice + .map(|tc| serde_json::to_value(&tc).unwrap_or(serde_json::Value::Null)); + let api_request = ApiRequest { model: model.clone(), messages: request.messages, @@ -556,6 +597,8 @@ impl LlmProvider for LiteLlmClient { max_tokens: request.max_tokens, top_p: request.top_p, response_format: request.response_format, + tools, + tool_choice, }; let url = format!("{}/chat/completions", self.api_base); @@ -617,15 +660,52 @@ impl LlmProvider for LiteLlmClient { let choices = api_response .choices .into_iter() - .map(|choice| Choice { - index: choice.index, - message: Message { - role: choice.message.role, - content: choice.message.content, - tool_calls: None, - tool_call_id: None, - }, - finish_reason: choice.finish_reason, + .map(|choice| { + let tool_calls_info = choice.message.tool_calls.as_ref().map(|tcs| { + tcs.iter() + .map(|tc| ToolCallInfo { + id: tc.id.clone(), + call_type: "function".to_string(), + function: ToolCallFunction { + name: tc.function.name.clone(), + arguments: tc.function.arguments.clone(), + }, + }) + .collect::>() + }); + + let content = if let Some(ref tool_calls) = choice.message.tool_calls { + if let Some(first_call) = tool_calls.first() { + if !first_call.function.arguments.is_empty() { + first_call.function.arguments.clone() + } else { + choice.message.content.clone() + } + } else { + choice.message.content.clone() + } + } else if !choice.message.content.trim().is_empty() { + choice.message.content + } else if let Some(rc) = choice.message.reasoning_content { + if !rc.trim().is_empty() { + rc + } else { + choice.message.reasoning.unwrap_or_default() + } + } else { + choice.message.reasoning.unwrap_or_default() + }; + + Choice { + index: choice.index, + message: Message { + role: choice.message.role, + content, + tool_calls: tool_calls_info, + tool_call_id: None, + }, + finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".to_string()), + } }) .collect(); @@ -903,6 +983,8 @@ mod tests { max_tokens: Some(1000), top_p: None, response_format: None, + tools: None, + tool_choice: None, }; let json = serde_json::to_string(&request).expect("serialization should succeed"); @@ -910,6 +992,8 @@ mod tests { assert!(json.contains("\"temperature\":0.7")); assert!(json.contains("\"max_tokens\":1000")); assert!(!json.contains("top_p")); // Should be skipped because None + assert!(!json.contains("tools")); // Should be skipped because None + assert!(!json.contains("tool_choice")); // Should be skipped because None } #[tokio::test] diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs index 6e9899a..13b9104 100644 --- a/src/swe/docker_sandbox.rs +++ b/src/swe/docker_sandbox.rs @@ -81,7 +81,10 @@ impl DockerSandbox { ); } - let sandbox = Self { container_name, tool_port }; + let sandbox = Self { + container_name, + tool_port, + }; // Install git (only hard dependency; agent installs everything else) let install = sandbox @@ -99,10 +102,7 @@ impl DockerSandbox { } // Clone the repository (full clone for reliable checkout) - let clone_cmd = format!( - "git clone https://github.com/{}.git /repo 2>&1", - repo - ); + let clone_cmd = format!("git clone https://github.com/{}.git /repo 2>&1", repo); let clone = sandbox.exec(&clone_cmd, 600_000).await; if clone.exit_code != 0 { sandbox.destroy().await; @@ -154,7 +154,10 @@ impl DockerSandbox { } // Use write_file to inject the server script - if let Err(e) = self.write_file_abs("/tools/server.py", TOOL_SERVER_PY).await { + if let Err(e) = self + .write_file_abs("/tools/server.py", TOOL_SERVER_PY) + .await + { tracing::warn!(container = %self.container_name, error = %e, "Failed to write tool server"); return; } @@ -201,8 +204,13 @@ impl DockerSandbox { std::time::Duration::from_millis(2_000), Command::new("docker") .args([ - "exec", "-w", "/repo", &self.container_name, - "python3", "-c", &check_cmd, + "exec", + "-w", + "/repo", + &self.container_name, + "python3", + "-c", + &check_cmd, ]) .stdout(Stdio::null()) .stderr(Stdio::null()) @@ -230,29 +238,31 @@ impl DockerSandbox { self.tool_port, tool_name ); - let result = tokio::time::timeout( - std::time::Duration::from_millis(65_000), - async { - let mut child = Command::new("docker") - .args([ - "exec", "-i", "-w", "/repo", - &self.container_name, - "python3", "-c", &script, - ]) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn()?; - - if let Some(ref mut stdin) = child.stdin { - use tokio::io::AsyncWriteExt; - stdin.write_all(args_json.as_bytes()).await?; - stdin.shutdown().await?; - } - - child.wait_with_output().await + let result = tokio::time::timeout(std::time::Duration::from_millis(65_000), async { + let mut child = Command::new("docker") + .args([ + "exec", + "-i", + "-w", + "/repo", + &self.container_name, + "python3", + "-c", + &script, + ]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + + if let Some(ref mut stdin) = child.stdin { + use tokio::io::AsyncWriteExt; + stdin.write_all(args_json.as_bytes()).await?; + stdin.shutdown().await?; } - ) + + child.wait_with_output().await + }) .await; match result { @@ -322,7 +332,14 @@ impl DockerSandbox { let tee_cmd = format!("cat > '{}'", abs_path); let mut child = Command::new("docker") .args([ - "exec", "-i", "-w", "/repo", &self.container_name, "bash", "-c", &tee_cmd, + "exec", + "-i", + "-w", + "/repo", + &self.container_name, + "bash", + "-c", + &tee_cmd, ]) .stdin(Stdio::piped()) .stdout(Stdio::null()) @@ -416,7 +433,6 @@ impl DockerSandbox { pub fn name(&self) -> &str { &self.container_name } - } /// Ensure the sandbox is destroyed when dropped (best-effort sync cleanup). diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs index ab84631..4059b73 100644 --- a/src/swe/extractor.rs +++ b/src/swe/extractor.rs @@ -192,7 +192,7 @@ impl PatchExtractor { let sandbox = DockerSandbox::start(input.repository, base_commit, input.language, None).await?; - let diff_ref = match (input.base_commit, input.merge_commit) { + let diff_cmd = match (input.base_commit, input.merge_commit) { (Some(base), Some(merge)) if !base.is_empty() && !merge.is_empty() => { // Fetch the merge commit (shallow clone may not have it) sandbox @@ -201,7 +201,7 @@ impl PatchExtractor { 60_000, ) .await; - format!("{base}..{merge}") + format!("git diff --no-color --unified=3 {} {} 2>&1", base, merge) } (_, Some(merge)) if !merge.is_empty() => { sandbox @@ -210,17 +210,12 @@ impl PatchExtractor { 60_000, ) .await; - merge.to_string() + format!("git diff --no-color --unified=3 HEAD {} 2>&1", merge) } - _ => "HEAD".to_string(), + _ => "git diff --no-color --unified=3 HEAD~1 HEAD 2>&1".to_string(), }; - let result = sandbox - .exec( - &format!("git show --no-color --unified=3 {} 2>&1", diff_ref), - 60_000, - ) - .await; + let result = sandbox.exec(&diff_cmd, 60_000).await; sandbox.destroy().await; diff --git a/src/swe/harness.rs b/src/swe/harness.rs index 900f751..84ddea8 100644 --- a/src/swe/harness.rs +++ b/src/swe/harness.rs @@ -273,10 +273,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult } // Clone repo (full clone for reliable checkout) - let clone_cmd = format!( - "git clone https://github.com/{}.git /repo 2>&1", - task.repo - ); + let clone_cmd = format!("git clone https://github.com/{}.git /repo 2>&1", task.repo); let (code, _, err) = docker_exec(&cname, &clone_cmd, 600).await; if code != 0 { result.error = Some(format!("Clone failed: {}", truncate(&err, 500))); diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs index 61a5dd8..bc86573 100644 --- a/src/swe/test_generator.rs +++ b/src/swe/test_generator.rs @@ -701,7 +701,10 @@ impl TestGenerator { // Reset repo to clean state before applying patch (agent may have modified files) sandbox - .exec("cd /repo && git checkout -- . 2>/dev/null && git clean -fd 2>/dev/null", 30_000) + .exec( + "cd /repo && git checkout -- . 2>/dev/null && git clean -fd 2>/dev/null", + 30_000, + ) .await; // Re-write test files (they were cleaned by git checkout) @@ -718,12 +721,18 @@ impl TestGenerator { } let apply_result = sandbox - .exec("cd /repo && git apply --allow-empty .swe_forge_pr.patch 2>&1", 30_000) + .exec( + "cd /repo && git apply --allow-empty .swe_forge_pr.patch 2>&1", + 30_000, + ) .await; if apply_result.exit_code != 0 { let apply_3way = sandbox - .exec("cd /repo && git apply --3way .swe_forge_pr.patch 2>&1", 30_000) + .exec( + "cd /repo && git apply --3way .swe_forge_pr.patch 2>&1", + 30_000, + ) .await; if apply_3way.exit_code != 0 { tracing::warn!( @@ -732,7 +741,9 @@ impl TestGenerator { exit = apply_3way.exit_code, "Patch apply failed, rejecting task" ); - sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await; + sandbox + .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000) + .await; return ValidationResult::Rejected( "PR patch could not be applied to the base commit. The test cannot be validated.".to_string() ); @@ -743,8 +754,12 @@ impl TestGenerator { for cmd in &submit.fail_to_pass { let result = sandbox.exec(cmd, 60_000).await; if result.exit_code != 0 { - sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await; - sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await; + sandbox + .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000) + .await; + sandbox + .exec("cd /repo && git clean -fd 2>/dev/null", 10_000) + .await; for tf in test_files { let _ = sandbox.write_file(&tf.path, &tf.content).await; } @@ -762,8 +777,12 @@ impl TestGenerator { for cmd in &submit.pass_to_pass { let result = sandbox.exec(cmd, 60_000).await; if result.exit_code != 0 { - sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await; - sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await; + sandbox + .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000) + .await; + sandbox + .exec("cd /repo && git clean -fd 2>/dev/null", 10_000) + .await; for tf in test_files { let _ = sandbox.write_file(&tf.path, &tf.content).await; } @@ -778,8 +797,12 @@ impl TestGenerator { } // Revert to base commit for cleanliness - sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await; - sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await; + sandbox + .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000) + .await; + sandbox + .exec("cd /repo && git clean -fd 2>/dev/null", 10_000) + .await; for tf in test_files { let _ = sandbox.write_file(&tf.path, &tf.content).await; } @@ -847,14 +870,17 @@ impl TestGenerator { .await; let output = if !result.stdout.is_empty() { // Parse JSON response from tool server - match serde_json::from_str::(&result.stdout.trim()) { + match serde_json::from_str::(result.stdout.trim()) { Ok(v) => { if let Some(err) = v.get("error").and_then(|e| e.as_str()) { err.to_string() } else if let Some(out) = v.get("output").and_then(|o| o.as_str()) { let mut s = out.to_string(); if let Some(total) = v.get("total_lines").and_then(|t| t.as_u64()) { - if v.get("truncated").and_then(|t| t.as_bool()).unwrap_or(false) { + if v.get("truncated") + .and_then(|t| t.as_bool()) + .unwrap_or(false) + { s.push_str(&format!("\n\n[Showing {}/{} lines. Use offset/limit to see more.]", v.get("shown_lines").and_then(|s| s.as_u64()).unwrap_or(0), total)); } else { diff --git a/src/swe/workspace_validator.rs b/src/swe/workspace_validator.rs index b4619c8..8345297 100644 --- a/src/swe/workspace_validator.rs +++ b/src/swe/workspace_validator.rs @@ -84,6 +84,26 @@ impl WorkspaceValidator { sandbox: &DockerSandbox, task: &SweTask, ) -> Result { + // --- Install language runtime if needed --- + let runtime_install = match task.language.to_lowercase().as_str() { + "go" | "golang" => Some("apt-get update -qq && apt-get install -y -qq golang > /dev/null 2>&1"), + "javascript" | "typescript" | "js" | "ts" => Some("apt-get update -qq && apt-get install -y -qq nodejs npm > /dev/null 2>&1"), + "rust" => Some("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y > /dev/null 2>&1 && . $HOME/.cargo/env"), + "java" => Some("apt-get update -qq && apt-get install -y -qq default-jdk maven > /dev/null 2>&1"), + _ => None, + }; + if let Some(cmd) = runtime_install { + let rt_result = sandbox.exec(&format!("{} 2>&1", cmd), 300_000).await; + if rt_result.exit_code != 0 { + tracing::warn!( + task_id = %task.id, + language = %task.language, + exit = rt_result.exit_code, + "Runtime install failed during validation (continuing)" + ); + } + } + // --- Install --- if let Some(install_cmd) = task.install_config.get("install") { if !install_cmd.is_empty() && !install_cmd.starts_with('#') { From e777e92099b00dd9a4d5921afbdb3e2ad7a2a23c Mon Sep 17 00:00:00 2001 From: echobt Date: Wed, 18 Feb 2026 13:43:29 +0000 Subject: [PATCH 2/8] security: add input validation for shell commands and fix expect() in library code - Add validate_git_ref() and validate_repo_name() to prevent command injection via shell metacharacters in user-controlled inputs - Validate commit hashes in extractor.rs before interpolating into git fetch/diff commands - Validate repo name and base_commit in docker_sandbox.rs before interpolating into git clone/checkout commands - Validate repo name and base_commit in harness.rs before interpolating into git clone/checkout commands - Replace expect() with Result return types in LiteLlmClient::new() and LiteLlmClient::new_with_defaults() per AGENTS.md rules - Add comprehensive unit tests for validation functions --- src/llm/litellm.rs | 50 +++++++++++---- src/swe/docker_sandbox.rs | 4 ++ src/swe/extractor.rs | 9 ++- src/swe/harness.rs | 14 +++- src/swe/mod.rs | 130 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 191 insertions(+), 16 deletions(-) diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs index e0be9b4..c153270 100644 --- a/src/llm/litellm.rs +++ b/src/llm/litellm.rs @@ -324,16 +324,26 @@ impl LiteLlmClient { /// * `api_base` - Base URL for the LiteLLM API (e.g., "http://localhost:4000") /// * `api_key` - Optional API key for authentication /// * `default_model` - Default model to use when none is specified - pub fn new(api_base: String, api_key: Option, default_model: String) -> Self { - Self { + /// + /// # Errors + /// + /// Returns `LlmError::RequestFailed` if the HTTP client cannot be built. + pub fn new( + api_base: String, + api_key: Option, + default_model: String, + ) -> Result { + Ok(Self { api_base, api_key, default_model, http_client: Client::builder() .timeout(Duration::from_secs(300)) .build() - .expect("Failed to build HTTP client"), - } + .map_err(|e| { + LlmError::RequestFailed(format!("Failed to build HTTP client: {e}")) + })?, + }) } /// Create a new LiteLLM client pre-configured for OpenRouter. @@ -347,16 +357,22 @@ impl LiteLlmClient { /// A client configured with: /// - api_base: "https://openrouter.ai/api/v1" /// - default_model: "anthropic/claude-opus-4.5" - pub fn new_with_defaults(api_key: String) -> Self { - Self { + /// + /// # Errors + /// + /// Returns `LlmError::RequestFailed` if the HTTP client cannot be built. + pub fn new_with_defaults(api_key: String) -> Result { + Ok(Self { api_base: "https://openrouter.ai/api/v1".to_string(), api_key: Some(api_key), default_model: "anthropic/claude-opus-4.5".to_string(), http_client: Client::builder() .timeout(Duration::from_secs(300)) .build() - .expect("Failed to build HTTP client"), - } + .map_err(|e| { + LlmError::RequestFailed(format!("Failed to build HTTP client: {e}")) + })?, + }) } /// Create a new LiteLLM client from environment variables. @@ -382,7 +398,9 @@ impl LiteLlmClient { http_client: Client::builder() .timeout(Duration::from_secs(300)) .build() - .expect("Failed to build HTTP client"), + .map_err(|e| { + LlmError::RequestFailed(format!("Failed to build HTTP client: {e}")) + })?, }) } @@ -929,7 +947,8 @@ mod tests { "http://localhost:4000".to_string(), Some("test-key".to_string()), "gpt-4".to_string(), - ); + ) + .unwrap(); assert_eq!(client.api_base(), "http://localhost:4000"); assert_eq!(client.default_model(), "gpt-4"); @@ -942,14 +961,15 @@ mod tests { "http://localhost:4000".to_string(), None, "gpt-4".to_string(), - ); + ) + .unwrap(); assert!(!client.has_api_key()); } #[test] fn test_litellm_client_new_with_defaults() { - let client = LiteLlmClient::new_with_defaults("test-api-key".to_string()); + let client = LiteLlmClient::new_with_defaults("test-api-key".to_string()).unwrap(); assert_eq!(client.api_base(), "https://openrouter.ai/api/v1"); assert_eq!(client.default_model(), "anthropic/claude-opus-4.5"); @@ -963,7 +983,8 @@ mod tests { "http://localhost:65535".to_string(), // Use a port that's unlikely to have a server None, "gpt-4".to_string(), - ); + ) + .unwrap(); let request = GenerationRequest::new("gpt-4", vec![Message::user("test")]); let result = client.generate(request).await; @@ -1002,7 +1023,8 @@ mod tests { "http://localhost:65535".to_string(), None, "gpt-4".to_string(), - ); + ) + .unwrap(); let cache = PromptCache::new(100); // Create a request with a system prompt diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs index 13b9104..2288b81 100644 --- a/src/swe/docker_sandbox.rs +++ b/src/swe/docker_sandbox.rs @@ -8,6 +8,7 @@ use std::process::Stdio; use tokio::process::Command; use crate::swe::tool_server::TOOL_SERVER_PY; +use crate::swe::{validate_git_ref, validate_repo_name}; /// Shell command output from inside the container. pub struct SandboxOutput { @@ -37,6 +38,9 @@ impl DockerSandbox { language: &str, image_override: Option<&str>, ) -> Result { + validate_repo_name(repo)?; + validate_git_ref(base_commit)?; + let image = image_override.unwrap_or_else(|| image_for_language(language)); let safe_name = repo.replace('/', "-").replace(' ', "_"); let ts_suffix = std::time::SystemTime::now() diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs index 4059b73..b13c47f 100644 --- a/src/swe/extractor.rs +++ b/src/swe/extractor.rs @@ -4,7 +4,7 @@ use anyhow::Result; use crate::swe::docker_sandbox::DockerSandbox; -use crate::swe::SweTask; +use crate::swe::{validate_git_ref, SweTask}; fn github_token() -> Option { std::env::var("GITHUB_TOKEN") @@ -188,6 +188,13 @@ impl PatchExtractor { /// Clone the repo inside a Docker container and extract the diff. async fn fetch_diff_from_docker(&self, input: &PatchExtractionInput<'_>) -> Result { + if let Some(base) = input.base_commit { + validate_git_ref(base)?; + } + if let Some(merge) = input.merge_commit { + validate_git_ref(merge)?; + } + let base_commit = input.base_commit.unwrap_or(""); let sandbox = DockerSandbox::start(input.repository, base_commit, input.language, None).await?; diff --git a/src/swe/harness.rs b/src/swe/harness.rs index 84ddea8..0658f10 100644 --- a/src/swe/harness.rs +++ b/src/swe/harness.rs @@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize}; use tokio::process::Command; use tracing::{info, warn}; -use super::SweTask; +use super::{validate_git_ref, validate_repo_name, SweTask}; // --------------------------------------------------------------------------- // Config @@ -203,6 +203,18 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult container_id: Some(cname.clone()), }; + // 0. INPUT VALIDATION: reject shell-unsafe repo names and commit refs + if let Err(e) = validate_repo_name(&task.repo) { + result.error = Some(format!("Invalid repo name: {e}")); + return result; + } + if !task.base_commit.is_empty() { + if let Err(e) = validate_git_ref(&task.base_commit) { + result.error = Some(format!("Invalid base commit: {e}")); + return result; + } + } + // 1. SETUP: start container info!(task_id = %task.id, "Starting container {}", cname); let agent_dir_abs = if let Ok(docker_dir) = std::env::var("DOCKER_AGENT_DIR") { diff --git a/src/swe/mod.rs b/src/swe/mod.rs index 0e4f42c..db34ff8 100644 --- a/src/swe/mod.rs +++ b/src/swe/mod.rs @@ -46,6 +46,67 @@ pub use workspace_validator::{ValidationOutcome, WorkspaceValidator}; /// Default output directory for generated SWE workspaces. pub const DEFAULT_SWE_OUTPUT_DIR: &str = "./generated-swe"; +/// Validate a git ref (commit SHA, branch name) to prevent shell injection. +/// +/// Accepts hex-only SHAs (short or full) and standard git ref names +/// (alphanumeric, `/`, `.`, `-`, `_`). Rejects shell metacharacters. +pub fn validate_git_ref(s: &str) -> Result<(), anyhow::Error> { + if s.is_empty() { + return Ok(()); + } + if s.len() > 256 { + anyhow::bail!("git ref too long ({} chars, max 256)", s.len()); + } + for ch in s.chars() { + if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '.' | '-' | '_' | '~' | '^') { + anyhow::bail!( + "invalid character '{}' in git ref '{}': only alphanumeric, /, ., -, _, ~, ^ allowed", + ch, + s + ); + } + } + Ok(()) +} + +/// Validate a GitHub repository name (`owner/repo`) to prevent shell injection. +/// +/// Accepts the standard GitHub `owner/repo` format where both parts contain +/// only alphanumeric characters, hyphens, underscores, and dots. +pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> { + if s.is_empty() { + anyhow::bail!("repository name is empty"); + } + if s.len() > 256 { + anyhow::bail!("repository name too long ({} chars, max 256)", s.len()); + } + let parts: Vec<&str> = s.split('/').collect(); + if parts.len() != 2 { + anyhow::bail!( + "invalid repository name '{}': expected 'owner/repo' format", + s + ); + } + for part in &parts { + if part.is_empty() { + anyhow::bail!( + "invalid repository name '{}': owner and repo must be non-empty", + s + ); + } + for ch in part.chars() { + if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.') { + anyhow::bail!( + "invalid character '{}' in repository name '{}': only alphanumeric, -, _, . allowed", + ch, + s + ); + } + } + } + Ok(()) +} + /// DataForge-compatible task format for SWE mined items. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SweTask { @@ -213,3 +274,72 @@ impl SweTask { map } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn validate_git_ref_accepts_hex_sha() { + assert!(validate_git_ref("abc123def456").is_ok()); + assert!(validate_git_ref("a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2").is_ok()); + } + + #[test] + fn validate_git_ref_accepts_empty() { + assert!(validate_git_ref("").is_ok()); + } + + #[test] + fn validate_git_ref_accepts_branch_names() { + assert!(validate_git_ref("main").is_ok()); + assert!(validate_git_ref("feature/my-branch").is_ok()); + assert!(validate_git_ref("HEAD~1").is_ok()); + assert!(validate_git_ref("v1.2.3").is_ok()); + } + + #[test] + fn validate_git_ref_rejects_shell_injection() { + assert!(validate_git_ref("abc123; rm -rf /").is_err()); + assert!(validate_git_ref("$(whoami)").is_err()); + assert!(validate_git_ref("`id`").is_err()); + assert!(validate_git_ref("abc | cat /etc/passwd").is_err()); + assert!(validate_git_ref("abc && echo pwned").is_err()); + } + + #[test] + fn validate_git_ref_rejects_too_long() { + let long_ref = "a".repeat(257); + assert!(validate_git_ref(&long_ref).is_err()); + } + + #[test] + fn validate_repo_name_accepts_valid() { + assert!(validate_repo_name("owner/repo").is_ok()); + assert!(validate_repo_name("my-org/my-repo").is_ok()); + assert!(validate_repo_name("user123/project.js").is_ok()); + assert!(validate_repo_name("Org_Name/Repo_Name").is_ok()); + } + + #[test] + fn validate_repo_name_rejects_shell_injection() { + assert!(validate_repo_name("owner/repo; rm -rf /").is_err()); + assert!(validate_repo_name("$(whoami)/repo").is_err()); + assert!(validate_repo_name("owner/repo && echo pwned").is_err()); + } + + #[test] + fn validate_repo_name_rejects_invalid_format() { + assert!(validate_repo_name("").is_err()); + assert!(validate_repo_name("noslash").is_err()); + assert!(validate_repo_name("too/many/slashes").is_err()); + assert!(validate_repo_name("/repo").is_err()); + assert!(validate_repo_name("owner/").is_err()); + } + + #[test] + fn validate_repo_name_rejects_too_long() { + let long_name = format!("{}/{}", "a".repeat(128), "b".repeat(128)); + assert!(validate_repo_name(&long_name).is_err()); + } +} From 3e673e82b5d2fcad4bf8eb51b34afdd7d65d57fb Mon Sep 17 00:00:00 2001 From: echobt Date: Wed, 18 Feb 2026 13:51:44 +0000 Subject: [PATCH 3/8] fix(dead-code): remove unnecessary #[allow(dead_code)] and unused PatchBlock.lines field - Remove #[allow(dead_code)] from ApiToolCall: all fields are either accessed in generate() or suppressed by underscore prefix (_tool_type) - Remove #[allow(dead_code)] from ApiToolCallFunction: both fields (name, arguments) are accessed in generate() - Remove unused PatchBlock.lines field in extractor.rs: accumulated but never read (split_solution_and_tests only returns .patch) --- src/llm/litellm.rs | 2 -- src/swe/extractor.rs | 3 --- 2 files changed, 5 deletions(-) diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs index c153270..a1b4cc2 100644 --- a/src/llm/litellm.rs +++ b/src/llm/litellm.rs @@ -549,7 +549,6 @@ struct ApiMessage { /// A tool call returned by the model. #[derive(Debug, Deserialize)] -#[allow(dead_code)] struct ApiToolCall { #[serde(default)] id: String, @@ -560,7 +559,6 @@ struct ApiToolCall { /// Function details within a tool call response. #[derive(Debug, Deserialize)] -#[allow(dead_code)] struct ApiToolCallFunction { #[serde(default)] name: String, diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs index b13c47f..a2b9c19 100644 --- a/src/swe/extractor.rs +++ b/src/swe/extractor.rs @@ -265,7 +265,6 @@ impl PatchExtractor { #[derive(Default)] struct PatchBlock { patch: String, - lines: usize, } fn count_line_delta(raw: &str) -> (usize, usize) { @@ -369,10 +368,8 @@ fn append_to_partition( ) { if is_test_file(file_path) { tests.patch.push_str(block); - tests.lines = tests.lines.saturating_add(block.lines().count()); } else { solution.patch.push_str(block); - solution.lines = solution.lines.saturating_add(block.lines().count()); } } From 1a6bfc3c57703ef5844f7a5b39242673535169e5 Mon Sep 17 00:00:00 2001 From: echobt Date: Wed, 18 Feb 2026 13:56:43 +0000 Subject: [PATCH 4/8] fix(swe): log errors instead of silently dropping sandbox write_file results Replace 'let _ = sandbox.write_file(...)' with proper error logging using tracing::warn in test_generator.rs and workspace_validator.rs. These were silently swallowing errors during test file restoration in validation cleanup paths. --- src/swe/test_generator.rs | 12 +++++++++--- src/swe/workspace_validator.rs | 4 +++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs index bc86573..653be13 100644 --- a/src/swe/test_generator.rs +++ b/src/swe/test_generator.rs @@ -761,7 +761,9 @@ impl TestGenerator { .exec("cd /repo && git clean -fd 2>/dev/null", 10_000) .await; for tf in test_files { - let _ = sandbox.write_file(&tf.path, &tf.content).await; + if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await { + tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after f2p rejection"); + } } return ValidationResult::Rejected(format!( "fail_to_pass test '{}' still FAILS after the PR patch is applied (exit={}, stderr={}). \ @@ -784,7 +786,9 @@ impl TestGenerator { .exec("cd /repo && git clean -fd 2>/dev/null", 10_000) .await; for tf in test_files { - let _ = sandbox.write_file(&tf.path, &tf.content).await; + if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await { + tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after p2p rejection"); + } } return ValidationResult::Rejected(format!( "pass_to_pass test '{}' FAILS after the PR patch (exit={}, stderr={}). \ @@ -804,7 +808,9 @@ impl TestGenerator { .exec("cd /repo && git clean -fd 2>/dev/null", 10_000) .await; for tf in test_files { - let _ = sandbox.write_file(&tf.path, &tf.content).await; + if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await { + tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after validation"); + } } ValidationResult::Accepted diff --git a/src/swe/workspace_validator.rs b/src/swe/workspace_validator.rs index 8345297..190f280 100644 --- a/src/swe/workspace_validator.rs +++ b/src/swe/workspace_validator.rs @@ -211,7 +211,9 @@ impl WorkspaceValidator { if let Some(test_files_json) = task.meta.get("test_files") { if let Ok(files) = serde_json::from_str::>(test_files_json) { for tf in &files { - let _ = sandbox.write_file(&tf.path, &tf.content).await; + if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await { + tracing::warn!(path = %tf.path, error = %e, "Failed to re-write test file after patch"); + } } } } From 0382ea935233063e006f1b386be19759e77acb75 Mon Sep 17 00:00:00 2001 From: echobt Date: Wed, 18 Feb 2026 14:09:52 +0000 Subject: [PATCH 5/8] fix(security): harden input validation against shell injection and path traversal - validate_git_ref: reject empty refs, '..' sequences, and leading '-' (flag injection) - validate_repo_name: reject parts starting with '.' or '-' (path traversal, flag injection) - Add validate_file_path: reject shell metacharacters, null bytes, '..', absolute paths - docker_sandbox: validate file paths in write_file, write_file_abs, read_file - docker_sandbox: restrict write_file_abs to /tools/ prefix, validate tool_name - harness: validate file paths in docker_write_file - litellm: replace unwrap_or(Null) with proper error propagation for tools serialization - extractor: guard validate_git_ref calls with empty checks for optional refs --- src/llm/litellm.rs | 19 ++++-- src/swe/docker_sandbox.rs | 62 ++++++++++++++++- src/swe/extractor.rs | 8 ++- src/swe/harness.rs | 4 +- src/swe/mod.rs | 138 ++++++++++++++++++++++++++++++++++++-- 5 files changed, 215 insertions(+), 16 deletions(-) diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs index a1b4cc2..820cda7 100644 --- a/src/llm/litellm.rs +++ b/src/llm/litellm.rs @@ -599,12 +599,19 @@ impl LlmProvider for LiteLlmClient { request.model.clone() }; - let tools = request - .tools - .map(|t| serde_json::to_value(&t).unwrap_or(serde_json::Value::Null)); - let tool_choice = request - .tool_choice - .map(|tc| serde_json::to_value(&tc).unwrap_or(serde_json::Value::Null)); + let tools = + match request.tools { + Some(t) => Some(serde_json::to_value(&t).map_err(|e| { + LlmError::RequestFailed(format!("Failed to serialize tools: {e}")) + })?), + None => None, + }; + let tool_choice = match request.tool_choice { + Some(tc) => Some(serde_json::to_value(&tc).map_err(|e| { + LlmError::RequestFailed(format!("Failed to serialize tool_choice: {e}")) + })?), + None => None, + }; let api_request = ApiRequest { model: model.clone(), diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs index 2288b81..66518c6 100644 --- a/src/swe/docker_sandbox.rs +++ b/src/swe/docker_sandbox.rs @@ -8,7 +8,7 @@ use std::process::Stdio; use tokio::process::Command; use crate::swe::tool_server::TOOL_SERVER_PY; -use crate::swe::{validate_git_ref, validate_repo_name}; +use crate::swe::{validate_file_path, validate_git_ref, validate_repo_name}; /// Shell command output from inside the container. pub struct SandboxOutput { @@ -39,7 +39,9 @@ impl DockerSandbox { image_override: Option<&str>, ) -> Result { validate_repo_name(repo)?; - validate_git_ref(base_commit)?; + if !base_commit.is_empty() { + validate_git_ref(base_commit)?; + } let image = image_override.unwrap_or_else(|| image_for_language(language)); let safe_name = repo.replace('/', "-").replace(' ', "_"); @@ -227,6 +229,16 @@ impl DockerSandbox { /// Call a tool on the HTTP tool server running inside the container. /// Pipes the JSON args via stdin to avoid shell escaping issues. pub async fn tool_request(&self, tool_name: &str, args_json: &str) -> SandboxOutput { + for ch in tool_name.chars() { + if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') { + return SandboxOutput { + stdout: String::new(), + stderr: format!("Invalid tool name: {}", tool_name), + exit_code: -1, + }; + } + } + let script = format!( "import sys, urllib.request, json\n\ data = sys.stdin.read()\n\ @@ -329,7 +341,49 @@ impl DockerSandbox { } /// Write a file to an absolute path inside the container. + /// + /// Only allows paths under known safe prefixes (`/tools/`). async fn write_file_abs(&self, abs_path: &str, content: &str) -> Result<()> { + if !abs_path.starts_with("/tools/") { + anyhow::bail!( + "write_file_abs only allows paths under /tools/, got '{}'", + abs_path + ); + } + for ch in abs_path.chars() { + if matches!( + ch, + '\'' | '"' + | '`' + | '$' + | '!' + | '&' + | '|' + | ';' + | '(' + | ')' + | '{' + | '}' + | '<' + | '>' + | '\\' + | '\0' + | '\n' + | '\r' + ) { + anyhow::bail!( + "invalid character in absolute path '{}': shell metacharacters not allowed", + abs_path + ); + } + } + if abs_path.contains("..") { + anyhow::bail!( + "absolute path '{}' contains '..' (path traversal not allowed)", + abs_path + ); + } + let mkdir_cmd = format!("mkdir -p \"$(dirname '{}')\"", abs_path); self.exec(&mkdir_cmd, 10_000).await; @@ -369,6 +423,8 @@ impl DockerSandbox { /// Write a file inside the container by piping content via stdin. pub async fn write_file(&self, path: &str, content: &str) -> Result<()> { + validate_file_path(path)?; + // First ensure the parent directory exists let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", path); self.exec(&mkdir_cmd, 10_000).await; @@ -410,6 +466,8 @@ impl DockerSandbox { /// Read a file from inside the container. pub async fn read_file(&self, path: &str) -> Result { + validate_file_path(path)?; + let cmd = format!("cat '/repo/{}'", path); let result = self.exec(&cmd, 10_000).await; if result.exit_code != 0 { diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs index a2b9c19..871e77d 100644 --- a/src/swe/extractor.rs +++ b/src/swe/extractor.rs @@ -189,10 +189,14 @@ impl PatchExtractor { /// Clone the repo inside a Docker container and extract the diff. async fn fetch_diff_from_docker(&self, input: &PatchExtractionInput<'_>) -> Result { if let Some(base) = input.base_commit { - validate_git_ref(base)?; + if !base.is_empty() { + validate_git_ref(base)?; + } } if let Some(merge) = input.merge_commit { - validate_git_ref(merge)?; + if !merge.is_empty() { + validate_git_ref(merge)?; + } } let base_commit = input.base_commit.unwrap_or(""); diff --git a/src/swe/harness.rs b/src/swe/harness.rs index 0658f10..47b4930 100644 --- a/src/swe/harness.rs +++ b/src/swe/harness.rs @@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize}; use tokio::process::Command; use tracing::{info, warn}; -use super::{validate_git_ref, validate_repo_name, SweTask}; +use super::{validate_file_path, validate_git_ref, validate_repo_name, SweTask}; // --------------------------------------------------------------------------- // Config @@ -144,6 +144,8 @@ async fn docker_rm(container: &str) { } async fn docker_write_file(container: &str, path: &str, content: &str) -> Result<()> { + validate_file_path(path)?; + use tokio::io::AsyncWriteExt; let tee_cmd = format!("cat > '/repo/{}'", path); let mut child = Command::new("docker") diff --git a/src/swe/mod.rs b/src/swe/mod.rs index db34ff8..4a25962 100644 --- a/src/swe/mod.rs +++ b/src/swe/mod.rs @@ -49,14 +49,24 @@ pub const DEFAULT_SWE_OUTPUT_DIR: &str = "./generated-swe"; /// Validate a git ref (commit SHA, branch name) to prevent shell injection. /// /// Accepts hex-only SHAs (short or full) and standard git ref names -/// (alphanumeric, `/`, `.`, `-`, `_`). Rejects shell metacharacters. +/// (alphanumeric, `/`, `.`, `-`, `_`). Rejects shell metacharacters, +/// `..` sequences (path traversal), and refs starting with `-` (flag injection). pub fn validate_git_ref(s: &str) -> Result<(), anyhow::Error> { if s.is_empty() { - return Ok(()); + anyhow::bail!("git ref is empty"); } if s.len() > 256 { anyhow::bail!("git ref too long ({} chars, max 256)", s.len()); } + if s.starts_with('-') { + anyhow::bail!( + "git ref '{}' must not start with '-' (could be interpreted as a flag)", + s + ); + } + if s.contains("..") { + anyhow::bail!("git ref '{}' must not contain '..' (path traversal)", s); + } for ch in s.chars() { if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '.' | '-' | '_' | '~' | '^') { anyhow::bail!( @@ -72,7 +82,8 @@ pub fn validate_git_ref(s: &str) -> Result<(), anyhow::Error> { /// Validate a GitHub repository name (`owner/repo`) to prevent shell injection. /// /// Accepts the standard GitHub `owner/repo` format where both parts contain -/// only alphanumeric characters, hyphens, underscores, and dots. +/// only alphanumeric characters, hyphens, underscores, and dots. Parts must +/// not start with `.` or `-` to prevent path traversal and flag injection. pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> { if s.is_empty() { anyhow::bail!("repository name is empty"); @@ -94,6 +105,12 @@ pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> { s ); } + if part.starts_with('.') || part.starts_with('-') { + anyhow::bail!( + "invalid repository name '{}': parts must not start with '.' or '-'", + s + ); + } for ch in part.chars() { if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.') { anyhow::bail!( @@ -107,6 +124,60 @@ pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> { Ok(()) } +/// Validate a file path used in shell commands inside Docker containers. +/// +/// Rejects paths containing shell metacharacters, single/double quotes, +/// null bytes, and `..` traversal sequences. This prevents shell injection +/// when paths are interpolated into commands like `cat > '/repo/{path}'`. +pub fn validate_file_path(path: &str) -> Result<(), anyhow::Error> { + if path.is_empty() { + anyhow::bail!("file path is empty"); + } + if path.len() > 4096 { + anyhow::bail!("file path too long ({} chars, max 4096)", path.len()); + } + if path.contains('\0') { + anyhow::bail!("file path contains null byte"); + } + if path.contains("..") { + anyhow::bail!( + "file path '{}' contains '..' (path traversal not allowed)", + path + ); + } + if path.starts_with('/') { + anyhow::bail!("file path '{}' must be relative (no leading '/')", path); + } + for ch in path.chars() { + if matches!( + ch, + '\'' | '"' + | '`' + | '$' + | '!' + | '&' + | '|' + | ';' + | '(' + | ')' + | '{' + | '}' + | '<' + | '>' + | '\\' + | '\n' + | '\r' + ) { + anyhow::bail!( + "invalid character '{}' in file path '{}': shell metacharacters not allowed", + ch, + path + ); + } + } + Ok(()) +} + /// DataForge-compatible task format for SWE mined items. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SweTask { @@ -286,8 +357,8 @@ mod tests { } #[test] - fn validate_git_ref_accepts_empty() { - assert!(validate_git_ref("").is_ok()); + fn validate_git_ref_rejects_empty() { + assert!(validate_git_ref("").is_err()); } #[test] @@ -307,6 +378,18 @@ mod tests { assert!(validate_git_ref("abc && echo pwned").is_err()); } + #[test] + fn validate_git_ref_rejects_double_dot() { + assert!(validate_git_ref("main..HEAD").is_err()); + assert!(validate_git_ref("../../etc/passwd").is_err()); + } + + #[test] + fn validate_git_ref_rejects_leading_dash() { + assert!(validate_git_ref("--exec=whoami").is_err()); + assert!(validate_git_ref("-n").is_err()); + } + #[test] fn validate_git_ref_rejects_too_long() { let long_ref = "a".repeat(257); @@ -337,9 +420,54 @@ mod tests { assert!(validate_repo_name("owner/").is_err()); } + #[test] + fn validate_repo_name_rejects_leading_dot_or_dash() { + assert!(validate_repo_name(".hidden/repo").is_err()); + assert!(validate_repo_name("owner/.repo").is_err()); + assert!(validate_repo_name("-flag/repo").is_err()); + assert!(validate_repo_name("owner/-repo").is_err()); + assert!(validate_repo_name("..traversal/repo").is_err()); + } + #[test] fn validate_repo_name_rejects_too_long() { let long_name = format!("{}/{}", "a".repeat(128), "b".repeat(128)); assert!(validate_repo_name(&long_name).is_err()); } + + #[test] + fn validate_file_path_accepts_valid() { + assert!(validate_file_path("tests/test_foo.py").is_ok()); + assert!(validate_file_path("src/main.rs").is_ok()); + assert!(validate_file_path("a/b/c/d.txt").is_ok()); + assert!(validate_file_path("file.txt").is_ok()); + } + + #[test] + fn validate_file_path_rejects_traversal() { + assert!(validate_file_path("../etc/passwd").is_err()); + assert!(validate_file_path("a/../../etc/passwd").is_err()); + assert!(validate_file_path("..").is_err()); + } + + #[test] + fn validate_file_path_rejects_shell_metacharacters() { + assert!(validate_file_path("file'; rm -rf /; echo '").is_err()); + assert!(validate_file_path("file$(whoami)").is_err()); + assert!(validate_file_path("file`id`").is_err()); + assert!(validate_file_path("file|cat").is_err()); + assert!(validate_file_path("file;ls").is_err()); + assert!(validate_file_path("file&echo").is_err()); + } + + #[test] + fn validate_file_path_rejects_empty_and_absolute() { + assert!(validate_file_path("").is_err()); + assert!(validate_file_path("/etc/passwd").is_err()); + } + + #[test] + fn validate_file_path_rejects_null_byte() { + assert!(validate_file_path("file\0.txt").is_err()); + } } From 50c8cad9d97164056bfc73034b004fad599746c2 Mon Sep 17 00:00:00 2001 From: echobt Date: Wed, 18 Feb 2026 14:27:09 +0000 Subject: [PATCH 6/8] fix(harness): validate test file paths before mkdir shell command Add validate_file_path() check before interpolating tf.path into the mkdir shell command in evaluate_task(). Previously, the mkdir command at line 340 used the unvalidated path from deserialized test files, while docker_write_file() on the next line did validate. This created a window where a malicious path could inject shell commands via the mkdir step before validation occurred. --- src/swe/harness.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/swe/harness.rs b/src/swe/harness.rs index 47b4930..5cbe5ed 100644 --- a/src/swe/harness.rs +++ b/src/swe/harness.rs @@ -337,6 +337,10 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult serde_json::from_str::>(test_files_json) { for tf in &files { + if let Err(e) = validate_file_path(&tf.path) { + warn!(task_id = %task.id, path = %tf.path, error = %e, "Skipping test file with invalid path"); + continue; + } let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", tf.path); docker_exec(&cname, &mkdir_cmd, 10).await; let write_result = docker_write_file(&cname, &tf.path, &tf.content).await; From e28910d386ce16d76bc794c9d3fda006aca1d07a Mon Sep 17 00:00:00 2001 From: echobt Date: Wed, 18 Feb 2026 14:37:32 +0000 Subject: [PATCH 7/8] fix(swe): replace error-swallowing let _ = patterns with proper logging Replace 3 instances of 'let _ =' that silently discard Docker command errors with 'if let Err(e)' + tracing::debug! for proper error observability while maintaining best-effort cleanup semantics. Files changed: - src/swe/docker_sandbox.rs: start() stale container removal, destroy() - src/swe/harness.rs: docker_rm() helper --- src/swe/docker_sandbox.rs | 14 ++++++++++---- src/swe/harness.rs | 7 +++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs index 66518c6..2adf200 100644 --- a/src/swe/docker_sandbox.rs +++ b/src/swe/docker_sandbox.rs @@ -55,12 +55,15 @@ impl DockerSandbox { let tool_port = 10_000 + (ts_suffix as u16 % 50_000); // Remove stale container if it exists - let _ = Command::new("docker") + if let Err(e) = Command::new("docker") .args(["rm", "-f", &container_name]) .stdout(Stdio::null()) .stderr(Stdio::null()) .status() - .await; + .await + { + tracing::debug!(container = %container_name, error = %e, "Failed to remove stale container (may not exist)"); + } let run_output = Command::new("docker") .args([ @@ -482,12 +485,15 @@ impl DockerSandbox { /// Destroy the container. pub async fn destroy(&self) { - let _ = Command::new("docker") + if let Err(e) = Command::new("docker") .args(["rm", "-f", &self.container_name]) .stdout(Stdio::null()) .stderr(Stdio::null()) .status() - .await; + .await + { + tracing::debug!(container = %self.container_name, error = %e, "Failed to destroy container"); + } tracing::debug!(container = %self.container_name, "Docker sandbox destroyed"); } diff --git a/src/swe/harness.rs b/src/swe/harness.rs index 5cbe5ed..90146ed 100644 --- a/src/swe/harness.rs +++ b/src/swe/harness.rs @@ -137,10 +137,13 @@ async fn docker_exec(container: &str, cmd: &str, timeout_secs: u64) -> (i32, Str } async fn docker_rm(container: &str) { - let _ = Command::new("docker") + if let Err(e) = Command::new("docker") .args(["rm", "-f", container]) .output() - .await; + .await + { + tracing::debug!(container = container, error = %e, "Failed to remove container (may not exist)"); + } } async fn docker_write_file(container: &str, path: &str, content: &str) -> Result<()> { From 8908f667da4eb8e82e90a7cc986b30f1d9a4f0a1 Mon Sep 17 00:00:00 2001 From: echobt Date: Wed, 18 Feb 2026 14:52:45 +0000 Subject: [PATCH 8/8] docs: refresh AGENTS.md --- AGENTS.md | 7 ++++--- src/swe/AGENTS.md | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 2cf3ea6..66b8063 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -28,6 +28,7 @@ src/ │ ├── pr_cache.rs # SQLite-backed PR deduplication cache │ ├── progress.rs # Background progress monitor for long-running pipeline runs │ ├── github_search.rs # GitHub Search API client (alternative PR source) +│ ├── tool_server.rs # Embedded Python HTTP tool server injected into Docker containers │ └── workspace_validator.rs # Pre-export workspace validation (Docker-based) ├── llm/ # LLM integration layer │ ├── litellm.rs # OpenAI-compatible API client (function calling, tools) @@ -41,7 +42,7 @@ src/ ├── export/ # Parquet dataset export + HuggingFace Hub upload ├── difficulty/ # Difficulty levels, resource limits, scoring ├── anti_hardcoding/ # Canary strings, sealed parameters, contamination detection -├── runner/ # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs] +├── runner/ # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs; exists as scaffolding] ├── utils/ # JSON extraction from LLM responses └── error.rs # Typed error hierarchy (thiserror) ``` @@ -135,13 +136,13 @@ Git hooks are in `.githooks/` and activated via `git config core.hooksPath .gith 3. **Never leak test plans into agent prompts** — The `prompt_rewriter.rs` module strips test-specific information from PR bodies before generating `prompt.md`. Any new prompt generation code must ensure `fail_to_pass` and `pass_to_pass` test commands are never visible to the agent being evaluated. -4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: memory (512MB–2GB), CPU (1–4 cores), PIDs (100–500). Never create containers without limits. +4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: PIDs (100–500), storage (1–5 GB), network mode (none/internal). Never create containers without limits. 5. **Respect GitHub API rate limits (5000 req/h)** — The pipeline uses semaphore-based concurrency (no chunk barriers). Each candidate needs ~2 API calls for enrichment. Never add unbounded concurrent GitHub API calls. Use the existing concurrency limits (enrichment: 10x, pre-classification: 25x, deep processing: 8x). 6. **All async code must be `Send + Sync` compatible** — The codebase uses `Arc` extensively. Trait objects must be `Send + Sync`. Never introduce `Rc`, `RefCell`, or non-Send types in async contexts. -7. **Serde rename conventions must be `snake_case`** — All serializable enums use `#[serde(rename_all = "snake_case")]`. Task status, difficulty levels, and all API-facing types must follow this convention for YAML/JSON compatibility. +7. **Serde rename conventions** — Most serializable enums use `#[serde(rename_all = "snake_case")]` (e.g., `SweTaskStatus`, `HarnessStatus`, `CheckType`). `DifficultyLevel` and `NetworkMode` in `src/difficulty/` use `#[serde(rename_all = "lowercase")]`. Follow the existing convention for each type for YAML/JSON compatibility. 8. **Anti-hardcoding mechanisms must be preserved** — The `anti_hardcoding/` module provides canary strings, sealed parameters, and process validation. Never bypass contamination detection. Any new task generation must embed canary strings via `CanaryConfig::generate()`. diff --git a/src/swe/AGENTS.md b/src/swe/AGENTS.md index 3ee4bc2..ac52fef 100644 --- a/src/swe/AGENTS.md +++ b/src/swe/AGENTS.md @@ -22,6 +22,7 @@ Core SWE mining pipeline. Fetches merged pull requests from GH Archive, enriches | `pipeline.rs` | Streaming pipeline with semaphore-based concurrency | | `github_search.rs` | `GitHubSearchClient` — GitHub Search API as alternative PR source (30 req/min) | | `workspace_validator.rs` | `WorkspaceValidator` — pre-export Docker-based validation (install, tests, patch application) | +| `tool_server.rs` | Embedded Python HTTP tool server injected into Docker containers (read_file, list_dir, grep_files, search_files, apply_patch) | | `pr_cache.rs` | SQLite-backed PR deduplication cache | | `progress.rs` | `ProgressMonitor` — background progress logging for long-running pipeline runs |