diff --git a/.gitignore b/.gitignore
index 3d29c38..5f04bf9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,3 +42,5 @@ test-verify/
 *.db
 *.db-shm
 *.db-wal
+mine_test.log
+test-easy-output/
diff --git a/AGENTS.md b/AGENTS.md
index 2cf3ea6..66b8063 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -28,6 +28,7 @@ src/
 │   ├── pr_cache.rs          # SQLite-backed PR deduplication cache
 │   ├── progress.rs          # Background progress monitor for long-running pipeline runs
 │   ├── github_search.rs     # GitHub Search API client (alternative PR source)
+│   ├── tool_server.rs       # Embedded Python HTTP tool server injected into Docker containers
 │   └── workspace_validator.rs # Pre-export workspace validation (Docker-based)
 ├── llm/                     # LLM integration layer
 │   ├── litellm.rs           # OpenAI-compatible API client (function calling, tools)
@@ -41,7 +42,7 @@ src/
 ├── export/                  # Parquet dataset export + HuggingFace Hub upload
 ├── difficulty/              # Difficulty levels, resource limits, scoring
 ├── anti_hardcoding/         # Canary strings, sealed parameters, contamination detection
-├── runner/                  # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs]
+├── runner/                  # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs; exists as scaffolding]
 ├── utils/                   # JSON extraction from LLM responses
 └── error.rs                 # Typed error hierarchy (thiserror)
 ```
@@ -135,13 +136,13 @@ Git hooks are in `.githooks/` and activated via `git config core.hooksPath .gith

 3. **Never leak test plans into agent prompts** — The `prompt_rewriter.rs` module strips test-specific information from PR bodies before generating `prompt.md`. Any new prompt generation code must ensure `fail_to_pass` and `pass_to_pass` test commands are never visible to the agent being evaluated.

-4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: memory (512MB–2GB), CPU (1–4 cores), PIDs (100–500). Never create containers without limits.
+4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: PIDs (100–500), storage (1–5 GB), network mode (none/internal). Never create containers without limits.

 5. **Respect GitHub API rate limits (5000 req/h)** — The pipeline uses semaphore-based concurrency (no chunk barriers). Each candidate needs ~2 API calls for enrichment. Never add unbounded concurrent GitHub API calls. Use the existing concurrency limits (enrichment: 10x, pre-classification: 25x, deep processing: 8x).

 6. **All async code must be `Send + Sync` compatible** — The codebase uses `Arc` extensively. Trait objects must be `Send + Sync`. Never introduce `Rc`, `RefCell`, or non-Send types in async contexts.

-7. **Serde rename conventions must be `snake_case`** — All serializable enums use `#[serde(rename_all = "snake_case")]`. Task status, difficulty levels, and all API-facing types must follow this convention for YAML/JSON compatibility.
+7. **Serde rename conventions** — Most serializable enums use `#[serde(rename_all = "snake_case")]` (e.g., `SweTaskStatus`, `HarnessStatus`, `CheckType`). `DifficultyLevel` and `NetworkMode` in `src/difficulty/` use `#[serde(rename_all = "lowercase")]`. Follow the existing convention for each type for YAML/JSON compatibility.

 8. **Anti-hardcoding mechanisms must be preserved** — The `anti_hardcoding/` module provides canary strings, sealed parameters, and process validation. Never bypass contamination detection. Any new task generation must embed canary strings via `CanaryConfig::generate()`.
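For quick reference, this is how the two rename conventions described in rule 7 behave. The enum names come from the rule itself; the variants and the standalone setup are illustrative, not the crate's actual definitions:

```rust
use serde::Serialize;

// Hedged sketch of AGENTS.md rule 7 — variant names are hypothetical.
#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
enum SweTaskStatus {
    InProgress, // serializes as "in_progress"
    Completed,  // serializes as "completed"
}

#[derive(Serialize)]
#[serde(rename_all = "lowercase")]
enum DifficultyLevel {
    Easy,   // serializes as "easy"
    Medium, // serializes as "medium"
}

fn main() {
    // Prints "in_progress" then "medium" (as quoted JSON strings).
    println!("{}", serde_json::to_string(&SweTaskStatus::InProgress).unwrap());
    println!("{}", serde_json::to_string(&DifficultyLevel::Medium).unwrap());
}
```
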
diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs
index 4e82dc6..820cda7 100644
--- a/src/llm/litellm.rs
+++ b/src/llm/litellm.rs
@@ -324,16 +324,26 @@ impl LiteLlmClient {
     /// * `api_base` - Base URL for the LiteLLM API (e.g., "http://localhost:4000")
     /// * `api_key` - Optional API key for authentication
     /// * `default_model` - Default model to use when none is specified
-    pub fn new(api_base: String, api_key: Option<String>, default_model: String) -> Self {
-        Self {
+    ///
+    /// # Errors
+    ///
+    /// Returns `LlmError::RequestFailed` if the HTTP client cannot be built.
+    pub fn new(
+        api_base: String,
+        api_key: Option<String>,
+        default_model: String,
+    ) -> Result<Self, LlmError> {
+        Ok(Self {
             api_base,
             api_key,
             default_model,
             http_client: Client::builder()
                 .timeout(Duration::from_secs(300))
                 .build()
-                .expect("Failed to build HTTP client"),
-        }
+                .map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
+                })?,
+        })
     }

     /// Create a new LiteLLM client pre-configured for OpenRouter.
@@ -347,16 +357,22 @@ impl LiteLlmClient {
     /// A client configured with:
     /// - api_base: "https://openrouter.ai/api/v1"
     /// - default_model: "anthropic/claude-opus-4.5"
-    pub fn new_with_defaults(api_key: String) -> Self {
-        Self {
+    ///
+    /// # Errors
+    ///
+    /// Returns `LlmError::RequestFailed` if the HTTP client cannot be built.
+    pub fn new_with_defaults(api_key: String) -> Result<Self, LlmError> {
+        Ok(Self {
             api_base: "https://openrouter.ai/api/v1".to_string(),
             api_key: Some(api_key),
             default_model: "anthropic/claude-opus-4.5".to_string(),
             http_client: Client::builder()
                 .timeout(Duration::from_secs(300))
                 .build()
-                .expect("Failed to build HTTP client"),
-        }
+                .map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
+                })?,
+        })
     }

     /// Create a new LiteLLM client from environment variables.
@@ -382,7 +398,9 @@ impl LiteLlmClient {
             http_client: Client::builder()
                 .timeout(Duration::from_secs(300))
                 .build()
-                .expect("Failed to build HTTP client"),
+                .map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
+                })?,
         })
     }

@@ -490,6 +508,10 @@ struct ApiRequest {
     top_p: Option<f32>,
     #[serde(skip_serializing_if = "Option::is_none")]
     response_format: Option<ResponseFormat>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    tools: Option<serde_json::Value>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    tool_choice: Option<serde_json::Value>,
 }

 /// Internal response structure from the OpenAI-compatible API.
@@ -506,14 +528,42 @@ struct ApiResponse {
 struct ApiChoice {
     index: u32,
     message: ApiMessage,
-    finish_reason: String,
+    #[serde(default)]
+    finish_reason: Option<String>,
 }

 /// Internal message structure from the API response.
+/// Supports reasoning models and tool calls.
 #[derive(Debug, Deserialize)]
 struct ApiMessage {
     role: String,
+    #[serde(default)]
     content: String,
+    #[serde(default)]
+    reasoning: Option<String>,
+    #[serde(default)]
+    reasoning_content: Option<String>,
+    #[serde(default)]
+    tool_calls: Option<Vec<ApiToolCall>>,
+}
+
+/// A tool call returned by the model.
+#[derive(Debug, Deserialize)]
+struct ApiToolCall {
+    #[serde(default)]
+    id: String,
+    #[serde(rename = "type", default)]
+    _tool_type: String,
+    function: ApiToolCallFunction,
+}
+
+/// Function details within a tool call response.
+#[derive(Debug, Deserialize)]
+struct ApiToolCallFunction {
+    #[serde(default)]
+    name: String,
+    #[serde(default)]
+    arguments: String,
 }

 /// Internal usage structure from the API response.
@@ -549,6 +599,20 @@ impl LlmProvider for LiteLlmClient {
             request.model.clone()
         };

+        let tools =
+            match request.tools {
+                Some(t) => Some(serde_json::to_value(&t).map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to serialize tools: {e}"))
+                })?),
+                None => None,
+            };
+        let tool_choice = match request.tool_choice {
+            Some(tc) => Some(serde_json::to_value(&tc).map_err(|e| {
+                LlmError::RequestFailed(format!("Failed to serialize tool_choice: {e}"))
+            })?),
+            None => None,
+        };
+
         let api_request = ApiRequest {
             model: model.clone(),
             messages: request.messages,
@@ -556,6 +620,8 @@ impl LlmProvider for LiteLlmClient {
             max_tokens: request.max_tokens,
             top_p: request.top_p,
             response_format: request.response_format,
+            tools,
+            tool_choice,
         };

         let url = format!("{}/chat/completions", self.api_base);
@@ -617,15 +683,52 @@ impl LlmProvider for LiteLlmClient {
         let choices = api_response
             .choices
             .into_iter()
-            .map(|choice| Choice {
-                index: choice.index,
-                message: Message {
-                    role: choice.message.role,
-                    content: choice.message.content,
-                    tool_calls: None,
-                    tool_call_id: None,
-                },
-                finish_reason: choice.finish_reason,
+            .map(|choice| {
+                let tool_calls_info = choice.message.tool_calls.as_ref().map(|tcs| {
+                    tcs.iter()
+                        .map(|tc| ToolCallInfo {
+                            id: tc.id.clone(),
+                            call_type: "function".to_string(),
+                            function: ToolCallFunction {
+                                name: tc.function.name.clone(),
+                                arguments: tc.function.arguments.clone(),
+                            },
+                        })
+                        .collect::<Vec<_>>()
+                });
+
+                let content = if let Some(ref tool_calls) = choice.message.tool_calls {
+                    if let Some(first_call) = tool_calls.first() {
+                        if !first_call.function.arguments.is_empty() {
+                            first_call.function.arguments.clone()
+                        } else {
+                            choice.message.content.clone()
+                        }
+                    } else {
+                        choice.message.content.clone()
+                    }
+                } else if !choice.message.content.trim().is_empty() {
+                    choice.message.content
+                } else if let Some(rc) = choice.message.reasoning_content {
+                    if !rc.trim().is_empty() {
+                        rc
+                    } else {
+                        choice.message.reasoning.unwrap_or_default()
+                    }
+                } else {
+                    choice.message.reasoning.unwrap_or_default()
+                };
+
+                Choice {
+                    index: choice.index,
+                    message: Message {
+                        role: choice.message.role,
+                        content,
+                        tool_calls: tool_calls_info,
+                        tool_call_id: None,
+                    },
+                    finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".to_string()),
+                }
             })
             .collect();

@@ -849,7 +952,8 @@ mod tests {
             "http://localhost:4000".to_string(),
             Some("test-key".to_string()),
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();

         assert_eq!(client.api_base(), "http://localhost:4000");
         assert_eq!(client.default_model(), "gpt-4");
@@ -862,14 +966,15 @@ mod tests {
             "http://localhost:4000".to_string(),
             None,
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();

         assert!(!client.has_api_key());
     }

     #[test]
     fn test_litellm_client_new_with_defaults() {
-        let client = LiteLlmClient::new_with_defaults("test-api-key".to_string());
+        let client = LiteLlmClient::new_with_defaults("test-api-key".to_string()).unwrap();

         assert_eq!(client.api_base(), "https://openrouter.ai/api/v1");
         assert_eq!(client.default_model(), "anthropic/claude-opus-4.5");
@@ -883,7 +988,8 @@ mod tests {
             "http://localhost:65535".to_string(), // Use a port that's unlikely to have a server
             None,
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();

         let request = GenerationRequest::new("gpt-4", vec![Message::user("test")]);
         let result = client.generate(request).await;
@@ -903,6 +1009,8 @@ mod tests {
             max_tokens: Some(1000),
             top_p: None,
             response_format: None,
+            tools: None,
+            tool_choice: None,
         };

         let json = serde_json::to_string(&request).expect("serialization should succeed");
@@ -910,6 +1018,8 @@ mod tests {
         assert!(json.contains("\"temperature\":0.7"));
         assert!(json.contains("\"max_tokens\":1000"));
         assert!(!json.contains("top_p")); // Should be skipped because None
+        assert!(!json.contains("tools")); // Should be skipped because None
+        assert!(!json.contains("tool_choice")); // Should be skipped because None
     }

     #[tokio::test]
@@ -918,7 +1028,8 @@ mod tests {
             "http://localhost:65535".to_string(),
             None,
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();

         let cache = PromptCache::new(100);

         // Create a request with a system prompt
diff --git a/src/swe/AGENTS.md b/src/swe/AGENTS.md
index 3ee4bc2..ac52fef 100644
--- a/src/swe/AGENTS.md
+++ b/src/swe/AGENTS.md
@@ -22,6 +22,7 @@ Core SWE mining pipeline. Fetches merged pull requests from GH Archive, enriches
 | `pipeline.rs` | Streaming pipeline with semaphore-based concurrency |
 | `github_search.rs` | `GitHubSearchClient` — GitHub Search API as alternative PR source (30 req/min) |
 | `workspace_validator.rs` | `WorkspaceValidator` — pre-export Docker-based validation (install, tests, patch application) |
+| `tool_server.rs` | Embedded Python HTTP tool server injected into Docker containers (read_file, list_dir, grep_files, search_files, apply_patch) |
 | `pr_cache.rs` | SQLite-backed PR deduplication cache |
 | `progress.rs` | `ProgressMonitor` — background progress logging for long-running pipeline runs |
diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs
index 6e9899a..2adf200 100644
--- a/src/swe/docker_sandbox.rs
+++ b/src/swe/docker_sandbox.rs
@@ -8,6 +8,7 @@ use std::process::Stdio;
 use tokio::process::Command;

 use crate::swe::tool_server::TOOL_SERVER_PY;
+use crate::swe::{validate_file_path, validate_git_ref, validate_repo_name};

 /// Shell command output from inside the container.
 pub struct SandboxOutput {
@@ -37,6 +38,11 @@ impl DockerSandbox {
         language: &str,
         image_override: Option<&str>,
     ) -> Result<Self> {
+        validate_repo_name(repo)?;
+        if !base_commit.is_empty() {
+            validate_git_ref(base_commit)?;
+        }
+
         let image = image_override.unwrap_or_else(|| image_for_language(language));
         let safe_name = repo.replace('/', "-").replace(' ', "_");
         let ts_suffix = std::time::SystemTime::now()
@@ -49,12 +55,15 @@ impl DockerSandbox {
         let tool_port = 10_000 + (ts_suffix as u16 % 50_000);

         // Remove stale container if it exists
-        let _ = Command::new("docker")
+        if let Err(e) = Command::new("docker")
             .args(["rm", "-f", &container_name])
             .stdout(Stdio::null())
             .stderr(Stdio::null())
             .status()
-            .await;
+            .await
+        {
+            tracing::debug!(container = %container_name, error = %e, "Failed to remove stale container (may not exist)");
+        }

         let run_output = Command::new("docker")
             .args([
@@ -81,7 +90,10 @@ impl DockerSandbox {
             );
         }

-        let sandbox = Self { container_name, tool_port };
+        let sandbox = Self {
+            container_name,
+            tool_port,
+        };

         // Install git (only hard dependency; agent installs everything else)
         let install = sandbox
@@ -99,10 +111,7 @@ impl DockerSandbox {
         }

         // Clone the repository (full clone for reliable checkout)
-        let clone_cmd = format!(
-            "git clone https://github.com/{}.git /repo 2>&1",
-            repo
-        );
+        let clone_cmd = format!("git clone https://github.com/{}.git /repo 2>&1", repo);
         let clone = sandbox.exec(&clone_cmd, 600_000).await;
         if clone.exit_code != 0 {
             sandbox.destroy().await;
@@ -154,7 +163,10 @@ impl DockerSandbox {
         }

         // Use write_file to inject the server script
-        if let Err(e) = self.write_file_abs("/tools/server.py", TOOL_SERVER_PY).await {
+        if let Err(e) = self
+            .write_file_abs("/tools/server.py", TOOL_SERVER_PY)
+            .await
+        {
             tracing::warn!(container = %self.container_name, error = %e, "Failed to write tool server");
             return;
         }
@@ -201,8 +213,13 @@ impl DockerSandbox {
             std::time::Duration::from_millis(2_000),
             Command::new("docker")
                 .args([
-                    "exec", "-w", "/repo", &self.container_name,
-                    "python3", "-c", &check_cmd,
+                    "exec",
+                    "-w",
+                    "/repo",
+                    &self.container_name,
+                    "python3",
+                    "-c",
+                    &check_cmd,
                 ])
                 .stdout(Stdio::null())
                 .stderr(Stdio::null())
@@ -215,6 +232,16 @@ impl DockerSandbox {

     /// Call a tool on the HTTP tool server running inside the container.
     /// Pipes the JSON args via stdin to avoid shell escaping issues.
     pub async fn tool_request(&self, tool_name: &str, args_json: &str) -> SandboxOutput {
+        for ch in tool_name.chars() {
+            if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
+                return SandboxOutput {
+                    stdout: String::new(),
+                    stderr: format!("Invalid tool name: {}", tool_name),
+                    exit_code: -1,
+                };
+            }
+        }
+
         let script = format!(
             "import sys, urllib.request, json\n\
              data = sys.stdin.read()\n\
@@ -230,29 +257,31 @@ impl DockerSandbox {
             self.tool_port, tool_name
         );

-        let result = tokio::time::timeout(
-            std::time::Duration::from_millis(65_000),
-            async {
-                let mut child = Command::new("docker")
-                    .args([
-                        "exec", "-i", "-w", "/repo",
-                        &self.container_name,
-                        "python3", "-c", &script,
-                    ])
-                    .stdin(Stdio::piped())
-                    .stdout(Stdio::piped())
-                    .stderr(Stdio::piped())
-                    .spawn()?;
-
-                if let Some(ref mut stdin) = child.stdin {
-                    use tokio::io::AsyncWriteExt;
-                    stdin.write_all(args_json.as_bytes()).await?;
-                    stdin.shutdown().await?;
-                }
-
-                child.wait_with_output().await
+        let result = tokio::time::timeout(std::time::Duration::from_millis(65_000), async {
+            let mut child = Command::new("docker")
+                .args([
+                    "exec",
+                    "-i",
+                    "-w",
+                    "/repo",
+                    &self.container_name,
+                    "python3",
+                    "-c",
+                    &script,
+                ])
+                .stdin(Stdio::piped())
+                .stdout(Stdio::piped())
+                .stderr(Stdio::piped())
+                .spawn()?;
+
+            if let Some(ref mut stdin) = child.stdin {
+                use tokio::io::AsyncWriteExt;
+                stdin.write_all(args_json.as_bytes()).await?;
+                stdin.shutdown().await?;
             }
-        )
+
+            child.wait_with_output().await
+        })
         .await;

         match result {
@@ -315,14 +344,63 @@ impl DockerSandbox {
     }

     /// Write a file to an absolute path inside the container.
+    ///
+    /// Only allows paths under known safe prefixes (`/tools/`).
     async fn write_file_abs(&self, abs_path: &str, content: &str) -> Result<()> {
+        if !abs_path.starts_with("/tools/") {
+            anyhow::bail!(
+                "write_file_abs only allows paths under /tools/, got '{}'",
+                abs_path
+            );
+        }
+        for ch in abs_path.chars() {
+            if matches!(
+                ch,
+                '\'' | '"'
+                    | '`'
+                    | '$'
+                    | '!'
+                    | '&'
+                    | '|'
+                    | ';'
+                    | '('
+                    | ')'
+                    | '{'
+                    | '}'
+                    | '<'
+                    | '>'
+                    | '\\'
+                    | '\0'
+                    | '\n'
+                    | '\r'
+            ) {
+                anyhow::bail!(
+                    "invalid character in absolute path '{}': shell metacharacters not allowed",
+                    abs_path
+                );
+            }
+        }
+        if abs_path.contains("..") {
+            anyhow::bail!(
+                "absolute path '{}' contains '..' (path traversal not allowed)",
+                abs_path
+            );
+        }
+
         let mkdir_cmd = format!("mkdir -p \"$(dirname '{}')\"", abs_path);
         self.exec(&mkdir_cmd, 10_000).await;

         let tee_cmd = format!("cat > '{}'", abs_path);
         let mut child = Command::new("docker")
             .args([
-                "exec", "-i", "-w", "/repo", &self.container_name, "bash", "-c", &tee_cmd,
+                "exec",
+                "-i",
+                "-w",
+                "/repo",
+                &self.container_name,
+                "bash",
+                "-c",
+                &tee_cmd,
             ])
             .stdin(Stdio::piped())
             .stdout(Stdio::null())
@@ -348,6 +426,8 @@ impl DockerSandbox {

     /// Write a file inside the container by piping content via stdin.
     pub async fn write_file(&self, path: &str, content: &str) -> Result<()> {
+        validate_file_path(path)?;
+
         // First ensure the parent directory exists
         let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", path);
         self.exec(&mkdir_cmd, 10_000).await;
@@ -389,6 +469,8 @@ impl DockerSandbox {

     /// Read a file from inside the container.
     pub async fn read_file(&self, path: &str) -> Result<String> {
+        validate_file_path(path)?;
+
         let cmd = format!("cat '/repo/{}'", path);
         let result = self.exec(&cmd, 10_000).await;
         if result.exit_code != 0 {
@@ -403,12 +485,15 @@ impl DockerSandbox {

     /// Destroy the container.
     pub async fn destroy(&self) {
-        let _ = Command::new("docker")
+        if let Err(e) = Command::new("docker")
             .args(["rm", "-f", &self.container_name])
             .stdout(Stdio::null())
             .stderr(Stdio::null())
             .status()
-            .await;
+            .await
+        {
+            tracing::debug!(container = %self.container_name, error = %e, "Failed to destroy container");
+        }

         tracing::debug!(container = %self.container_name, "Docker sandbox destroyed");
     }
@@ -416,7 +501,6 @@ impl DockerSandbox {
     pub fn name(&self) -> &str {
         &self.container_name
     }
-
 }

 /// Ensure the sandbox is destroyed when dropped (best-effort sync cleanup).
diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs
index ab84631..871e77d 100644
--- a/src/swe/extractor.rs
+++ b/src/swe/extractor.rs
@@ -4,7 +4,7 @@ use anyhow::Result;

 use crate::swe::docker_sandbox::DockerSandbox;
-use crate::swe::SweTask;
+use crate::swe::{validate_git_ref, SweTask};

 fn github_token() -> Option<String> {
     std::env::var("GITHUB_TOKEN")
@@ -188,11 +188,22 @@ impl PatchExtractor {

     /// Clone the repo inside a Docker container and extract the diff.
     async fn fetch_diff_from_docker(&self, input: &PatchExtractionInput<'_>) -> Result<String> {
+        if let Some(base) = input.base_commit {
+            if !base.is_empty() {
+                validate_git_ref(base)?;
+            }
+        }
+        if let Some(merge) = input.merge_commit {
+            if !merge.is_empty() {
+                validate_git_ref(merge)?;
+            }
+        }
+
         let base_commit = input.base_commit.unwrap_or("");
         let sandbox =
             DockerSandbox::start(input.repository, base_commit, input.language, None).await?;

-        let diff_ref = match (input.base_commit, input.merge_commit) {
+        let diff_cmd = match (input.base_commit, input.merge_commit) {
             (Some(base), Some(merge)) if !base.is_empty() && !merge.is_empty() => {
                 // Fetch the merge commit (shallow clone may not have it)
                 sandbox
@@ -201,7 +212,7 @@ impl PatchExtractor {
                         60_000,
                     )
                     .await;
-                format!("{base}..{merge}")
+                format!("git diff --no-color --unified=3 {} {} 2>&1", base, merge)
             }
             (_, Some(merge)) if !merge.is_empty() => {
                 sandbox
@@ -210,17 +221,12 @@ impl PatchExtractor {
                         60_000,
                     )
                     .await;
-                merge.to_string()
+                format!("git diff --no-color --unified=3 HEAD {} 2>&1", merge)
             }
-            _ => "HEAD".to_string(),
+            _ => "git diff --no-color --unified=3 HEAD~1 HEAD 2>&1".to_string(),
         };

-        let result = sandbox
-            .exec(
-                &format!("git show --no-color --unified=3 {} 2>&1", diff_ref),
-                60_000,
-            )
-            .await;
+        let result = sandbox.exec(&diff_cmd, 60_000).await;

         sandbox.destroy().await;

@@ -263,7 +269,6 @@ impl PatchExtractor {
 #[derive(Default)]
 struct PatchBlock {
     patch: String,
-    lines: usize,
 }

 fn count_line_delta(raw: &str) -> (usize, usize) {
@@ -367,10 +372,8 @@ fn append_to_partition(
 ) {
     if is_test_file(file_path) {
         tests.patch.push_str(block);
-        tests.lines = tests.lines.saturating_add(block.lines().count());
     } else {
         solution.patch.push_str(block);
-        solution.lines = solution.lines.saturating_add(block.lines().count());
     }
 }
diff --git a/src/swe/harness.rs b/src/swe/harness.rs
index 900f751..90146ed 100644
--- a/src/swe/harness.rs
+++ b/src/swe/harness.rs
@@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize};
 use tokio::process::Command;
 use tracing::{info, warn};

-use super::SweTask;
+use super::{validate_file_path, validate_git_ref, validate_repo_name, SweTask};

 // ---------------------------------------------------------------------------
 // Config
@@ -137,13 +137,18 @@ async fn docker_exec(container: &str, cmd: &str, timeout_secs: u64) -> (i32, Str
 }

 async fn docker_rm(container: &str) {
-    let _ = Command::new("docker")
+    if let Err(e) = Command::new("docker")
         .args(["rm", "-f", container])
         .output()
-        .await;
+        .await
+    {
+        tracing::debug!(container = container, error = %e, "Failed to remove container (may not exist)");
+    }
 }

 async fn docker_write_file(container: &str, path: &str, content: &str) -> Result<()> {
+    validate_file_path(path)?;
+
     use tokio::io::AsyncWriteExt;
     let tee_cmd = format!("cat > '/repo/{}'", path);
     let mut child = Command::new("docker")
@@ -203,6 +208,18 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
         container_id: Some(cname.clone()),
     };

+    // 0. INPUT VALIDATION: reject shell-unsafe repo names and commit refs
+    if let Err(e) = validate_repo_name(&task.repo) {
+        result.error = Some(format!("Invalid repo name: {e}"));
+        return result;
+    }
+    if !task.base_commit.is_empty() {
+        if let Err(e) = validate_git_ref(&task.base_commit) {
+            result.error = Some(format!("Invalid base commit: {e}"));
+            return result;
+        }
+    }
+
     // 1. SETUP: start container
     info!(task_id = %task.id, "Starting container {}", cname);
     let agent_dir_abs = if let Ok(docker_dir) = std::env::var("DOCKER_AGENT_DIR") {
@@ -273,10 +290,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
     }

     // Clone repo (full clone for reliable checkout)
-    let clone_cmd = format!(
-        "git clone https://github.com/{}.git /repo 2>&1",
-        task.repo
-    );
+    let clone_cmd = format!("git clone https://github.com/{}.git /repo 2>&1", task.repo);
     let (code, _, err) = docker_exec(&cname, &clone_cmd, 600).await;
     if code != 0 {
         result.error = Some(format!("Clone failed: {}", truncate(&err, 500)));
@@ -326,6 +340,10 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
         serde_json::from_str::<Vec<TestFile>>(test_files_json)
     {
         for tf in &files {
+            if let Err(e) = validate_file_path(&tf.path) {
+                warn!(task_id = %task.id, path = %tf.path, error = %e, "Skipping test file with invalid path");
+                continue;
+            }
             let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", tf.path);
             docker_exec(&cname, &mkdir_cmd, 10).await;
             let write_result = docker_write_file(&cname, &tf.path, &tf.content).await;
diff --git a/src/swe/mod.rs b/src/swe/mod.rs
index 0e4f42c..4a25962 100644
--- a/src/swe/mod.rs
+++ b/src/swe/mod.rs
@@ -46,6 +46,138 @@ pub use workspace_validator::{ValidationOutcome, WorkspaceValidator};

 /// Default output directory for generated SWE workspaces.
 pub const DEFAULT_SWE_OUTPUT_DIR: &str = "./generated-swe";
+
+/// Validate a git ref (commit SHA, branch name) to prevent shell injection.
+///
+/// Accepts hex-only SHAs (short or full) and standard git ref names
+/// (alphanumeric, `/`, `.`, `-`, `_`). Rejects shell metacharacters,
+/// `..` sequences (path traversal), and refs starting with `-` (flag injection).
+pub fn validate_git_ref(s: &str) -> Result<(), anyhow::Error> {
+    if s.is_empty() {
+        anyhow::bail!("git ref is empty");
+    }
+    if s.len() > 256 {
+        anyhow::bail!("git ref too long ({} chars, max 256)", s.len());
+    }
+    if s.starts_with('-') {
+        anyhow::bail!(
+            "git ref '{}' must not start with '-' (could be interpreted as a flag)",
+            s
+        );
+    }
+    if s.contains("..") {
+        anyhow::bail!("git ref '{}' must not contain '..' (path traversal)", s);
+    }
+    for ch in s.chars() {
+        if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '.' | '-' | '_' | '~' | '^') {
+            anyhow::bail!(
+                "invalid character '{}' in git ref '{}': only alphanumeric, /, ., -, _, ~, ^ allowed",
+                ch,
+                s
+            );
+        }
+    }
+    Ok(())
+}
+
+/// Validate a GitHub repository name (`owner/repo`) to prevent shell injection.
+///
+/// Accepts the standard GitHub `owner/repo` format where both parts contain
+/// only alphanumeric characters, hyphens, underscores, and dots. Parts must
+/// not start with `.` or `-` to prevent path traversal and flag injection.
+pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> {
+    if s.is_empty() {
+        anyhow::bail!("repository name is empty");
+    }
+    if s.len() > 256 {
+        anyhow::bail!("repository name too long ({} chars, max 256)", s.len());
+    }
+    let parts: Vec<&str> = s.split('/').collect();
+    if parts.len() != 2 {
+        anyhow::bail!(
+            "invalid repository name '{}': expected 'owner/repo' format",
+            s
+        );
+    }
+    for part in &parts {
+        if part.is_empty() {
+            anyhow::bail!(
+                "invalid repository name '{}': owner and repo must be non-empty",
+                s
+            );
+        }
+        if part.starts_with('.') || part.starts_with('-') {
+            anyhow::bail!(
+                "invalid repository name '{}': parts must not start with '.' or '-'",
+                s
+            );
+        }
+        for ch in part.chars() {
+            if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.') {
+                anyhow::bail!(
+                    "invalid character '{}' in repository name '{}': only alphanumeric, -, _, . allowed",
+                    ch,
+                    s
+                );
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Validate a file path used in shell commands inside Docker containers.
+///
+/// Rejects paths containing shell metacharacters, single/double quotes,
+/// null bytes, and `..` traversal sequences. This prevents shell injection
+/// when paths are interpolated into commands like `cat > '/repo/{path}'`.
+pub fn validate_file_path(path: &str) -> Result<(), anyhow::Error> {
+    if path.is_empty() {
+        anyhow::bail!("file path is empty");
+    }
+    if path.len() > 4096 {
+        anyhow::bail!("file path too long ({} chars, max 4096)", path.len());
+    }
+    if path.contains('\0') {
+        anyhow::bail!("file path contains null byte");
+    }
+    if path.contains("..") {
+        anyhow::bail!(
+            "file path '{}' contains '..' (path traversal not allowed)",
+            path
+        );
+    }
+    if path.starts_with('/') {
+        anyhow::bail!("file path '{}' must be relative (no leading '/')", path);
+    }
+    for ch in path.chars() {
+        if matches!(
+            ch,
+            '\'' | '"'
+                | '`'
+                | '$'
+                | '!'
+                | '&'
+                | '|'
+                | ';'
+                | '('
+                | ')'
+                | '{'
+                | '}'
+                | '<'
+                | '>'
+                | '\\'
+                | '\n'
+                | '\r'
+        ) {
+            anyhow::bail!(
+                "invalid character '{}' in file path '{}': shell metacharacters not allowed",
+                ch,
+                path
+            );
+        }
+    }
+    Ok(())
+}
+
 /// DataForge-compatible task format for SWE mined items.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SweTask {
@@ -213,3 +345,129 @@ impl SweTask {
         map
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn validate_git_ref_accepts_hex_sha() {
+        assert!(validate_git_ref("abc123def456").is_ok());
+        assert!(validate_git_ref("a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2").is_ok());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_empty() {
+        assert!(validate_git_ref("").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_accepts_branch_names() {
+        assert!(validate_git_ref("main").is_ok());
+        assert!(validate_git_ref("feature/my-branch").is_ok());
+        assert!(validate_git_ref("HEAD~1").is_ok());
+        assert!(validate_git_ref("v1.2.3").is_ok());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_shell_injection() {
+        assert!(validate_git_ref("abc123; rm -rf /").is_err());
+        assert!(validate_git_ref("$(whoami)").is_err());
+        assert!(validate_git_ref("`id`").is_err());
+        assert!(validate_git_ref("abc | cat /etc/passwd").is_err());
+        assert!(validate_git_ref("abc && echo pwned").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_double_dot() {
+        assert!(validate_git_ref("main..HEAD").is_err());
+        assert!(validate_git_ref("../../etc/passwd").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_leading_dash() {
+        assert!(validate_git_ref("--exec=whoami").is_err());
+        assert!(validate_git_ref("-n").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_too_long() {
+        let long_ref = "a".repeat(257);
+        assert!(validate_git_ref(&long_ref).is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_accepts_valid() {
+        assert!(validate_repo_name("owner/repo").is_ok());
+        assert!(validate_repo_name("my-org/my-repo").is_ok());
+        assert!(validate_repo_name("user123/project.js").is_ok());
+        assert!(validate_repo_name("Org_Name/Repo_Name").is_ok());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_shell_injection() {
+        assert!(validate_repo_name("owner/repo; rm -rf /").is_err());
+        assert!(validate_repo_name("$(whoami)/repo").is_err());
+        assert!(validate_repo_name("owner/repo && echo pwned").is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_invalid_format() {
+        assert!(validate_repo_name("").is_err());
+        assert!(validate_repo_name("noslash").is_err());
+        assert!(validate_repo_name("too/many/slashes").is_err());
+        assert!(validate_repo_name("/repo").is_err());
+        assert!(validate_repo_name("owner/").is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_leading_dot_or_dash() {
+        assert!(validate_repo_name(".hidden/repo").is_err());
+        assert!(validate_repo_name("owner/.repo").is_err());
+        assert!(validate_repo_name("-flag/repo").is_err());
+        assert!(validate_repo_name("owner/-repo").is_err());
+        assert!(validate_repo_name("..traversal/repo").is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_too_long() {
+        let long_name = format!("{}/{}", "a".repeat(128), "b".repeat(128));
+        assert!(validate_repo_name(&long_name).is_err());
+    }
+
+    #[test]
+    fn validate_file_path_accepts_valid() {
+        assert!(validate_file_path("tests/test_foo.py").is_ok());
+        assert!(validate_file_path("src/main.rs").is_ok());
+        assert!(validate_file_path("a/b/c/d.txt").is_ok());
+        assert!(validate_file_path("file.txt").is_ok());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_traversal() {
+        assert!(validate_file_path("../etc/passwd").is_err());
+        assert!(validate_file_path("a/../../etc/passwd").is_err());
+        assert!(validate_file_path("..").is_err());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_shell_metacharacters() {
+        assert!(validate_file_path("file'; rm -rf /; echo '").is_err());
+        assert!(validate_file_path("file$(whoami)").is_err());
+        assert!(validate_file_path("file`id`").is_err());
+        assert!(validate_file_path("file|cat").is_err());
+        assert!(validate_file_path("file;ls").is_err());
+        assert!(validate_file_path("file&echo").is_err());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_empty_and_absolute() {
+        assert!(validate_file_path("").is_err());
+        assert!(validate_file_path("/etc/passwd").is_err());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_null_byte() {
+        assert!(validate_file_path("file\0.txt").is_err());
+    }
+}
diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs
index 61a5dd8..653be13 100644
--- a/src/swe/test_generator.rs
+++ b/src/swe/test_generator.rs
@@ -701,7 +701,10 @@ impl TestGenerator {
         // Reset repo to clean state before applying patch (agent may have modified files)
         sandbox
-            .exec("cd /repo && git checkout -- . 2>/dev/null && git clean -fd 2>/dev/null", 30_000)
+            .exec(
+                "cd /repo && git checkout -- . 2>/dev/null && git clean -fd 2>/dev/null",
+                30_000,
+            )
             .await;

         // Re-write test files (they were cleaned by git checkout)
@@ -718,12 +721,18 @@ impl TestGenerator {
         }

         let apply_result = sandbox
-            .exec("cd /repo && git apply --allow-empty .swe_forge_pr.patch 2>&1", 30_000)
+            .exec(
+                "cd /repo && git apply --allow-empty .swe_forge_pr.patch 2>&1",
+                30_000,
+            )
             .await;
         if apply_result.exit_code != 0 {
             let apply_3way = sandbox
-                .exec("cd /repo && git apply --3way .swe_forge_pr.patch 2>&1", 30_000)
+                .exec(
+                    "cd /repo && git apply --3way .swe_forge_pr.patch 2>&1",
+                    30_000,
+                )
                 .await;
             if apply_3way.exit_code != 0 {
                 tracing::warn!(
@@ -730,7 +739,9 @@ impl TestGenerator {
                     exit = apply_3way.exit_code,
                     "Patch apply failed, rejecting task"
                 );
-                sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
+                sandbox
+                    .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+                    .await;
                 return ValidationResult::Rejected(
                     "PR patch could not be applied to the base commit. The test cannot be validated.".to_string()
                 );
@@ -743,10 +754,16 @@ impl TestGenerator {
         for cmd in &submit.fail_to_pass {
             let result = sandbox.exec(cmd, 60_000).await;
             if result.exit_code != 0 {
-                sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
-                sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await;
+                sandbox
+                    .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+                    .await;
+                sandbox
+                    .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
+                    .await;
                 for tf in test_files {
-                    let _ = sandbox.write_file(&tf.path, &tf.content).await;
+                    if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                        tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after f2p rejection");
+                    }
                 }
                 return ValidationResult::Rejected(format!(
                     "fail_to_pass test '{}' still FAILS after the PR patch is applied (exit={}, stderr={}). \
@@ -762,10 +779,16 @@ impl TestGenerator {
         for cmd in &submit.pass_to_pass {
             let result = sandbox.exec(cmd, 60_000).await;
             if result.exit_code != 0 {
-                sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
-                sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await;
+                sandbox
+                    .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+                    .await;
+                sandbox
+                    .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
+                    .await;
                 for tf in test_files {
-                    let _ = sandbox.write_file(&tf.path, &tf.content).await;
+                    if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                        tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after p2p rejection");
+                    }
                 }
                 return ValidationResult::Rejected(format!(
                     "pass_to_pass test '{}' FAILS after the PR patch (exit={}, stderr={}). \
@@ -778,10 +801,16 @@ impl TestGenerator {
         }

         // Revert to base commit for cleanliness
-        sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
-        sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await;
+        sandbox
+            .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+            .await;
+        sandbox
+            .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
+            .await;
         for tf in test_files {
-            let _ = sandbox.write_file(&tf.path, &tf.content).await;
+            if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after validation");
+            }
         }

         ValidationResult::Accepted
@@ -847,14 +876,17 @@ impl TestGenerator {
             .await;
         let output = if !result.stdout.is_empty() {
             // Parse JSON response from tool server
-            match serde_json::from_str::<serde_json::Value>(&result.stdout.trim()) {
+            match serde_json::from_str::<serde_json::Value>(result.stdout.trim()) {
                 Ok(v) => {
                     if let Some(err) = v.get("error").and_then(|e| e.as_str()) {
                         err.to_string()
                     } else if let Some(out) = v.get("output").and_then(|o| o.as_str()) {
                         let mut s = out.to_string();
                         if let Some(total) = v.get("total_lines").and_then(|t| t.as_u64()) {
-                            if v.get("truncated").and_then(|t| t.as_bool()).unwrap_or(false) {
+                            if v.get("truncated")
+                                .and_then(|t| t.as_bool())
+                                .unwrap_or(false)
+                            {
                                 s.push_str(&format!("\n\n[Showing {}/{} lines. Use offset/limit to see more.]", v.get("shown_lines").and_then(|s| s.as_u64()).unwrap_or(0), total));
                             } else {
diff --git a/src/swe/workspace_validator.rs b/src/swe/workspace_validator.rs
index b4619c8..190f280 100644
--- a/src/swe/workspace_validator.rs
+++ b/src/swe/workspace_validator.rs
@@ -84,6 +84,26 @@ impl WorkspaceValidator {
         sandbox: &DockerSandbox,
         task: &SweTask,
     ) -> Result<ValidationOutcome> {
+        // --- Install language runtime if needed ---
+        let runtime_install = match task.language.to_lowercase().as_str() {
+            "go" | "golang" => Some("apt-get update -qq && apt-get install -y -qq golang > /dev/null 2>&1"),
+            "javascript" | "typescript" | "js" | "ts" => Some("apt-get update -qq && apt-get install -y -qq nodejs npm > /dev/null 2>&1"),
+            "rust" => Some("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y > /dev/null 2>&1 && . $HOME/.cargo/env"),
+            "java" => Some("apt-get update -qq && apt-get install -y -qq default-jdk maven > /dev/null 2>&1"),
+            _ => None,
+        };
+        if let Some(cmd) = runtime_install {
+            let rt_result = sandbox.exec(&format!("{} 2>&1", cmd), 300_000).await;
+            if rt_result.exit_code != 0 {
+                tracing::warn!(
+                    task_id = %task.id,
+                    language = %task.language,
+                    exit = rt_result.exit_code,
+                    "Runtime install failed during validation (continuing)"
+                );
+            }
+        }
+
         // --- Install ---
         if let Some(install_cmd) = task.install_config.get("install") {
             if !install_cmd.is_empty() && !install_cmd.starts_with('#') {
@@ -191,7 +211,9 @@ impl WorkspaceValidator {
         if let Some(test_files_json) = task.meta.get("test_files") {
             if let Ok(files) = serde_json::from_str::<Vec<TestFile>>(test_files_json) {
                 for tf in &files {
-                    let _ = sandbox.write_file(&tf.path, &tf.content).await;
+                    if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                        tracing::warn!(path = %tf.path, error = %e, "Failed to re-write test file after patch");
+                    }
                 }
             }
         }
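
Taken together, the `validate_git_ref`, `validate_repo_name`, and `validate_file_path` functions added in `src/swe/mod.rs` give every call site the same guard: validate untrusted PR metadata first, interpolate it into a shell string second. A minimal sketch of that call pattern (the `validate_*` functions are from this diff; the crate path `swe_forge` and the surrounding function are assumptions for illustration):

```rust
// Hypothetical call site; the crate name `swe_forge` is an assumption.
use swe_forge::swe::{validate_git_ref, validate_repo_name};

/// Build the clone-and-checkout command only after both inputs pass validation.
fn build_clone_cmd(repo: &str, base_commit: &str) -> anyhow::Result<String> {
    validate_repo_name(repo)?; // rejects e.g. "owner/repo; rm -rf /"
    if !base_commit.is_empty() {
        validate_git_ref(base_commit)?; // rejects "$(whoami)", "--exec=...", "a..b"
    }
    Ok(format!(
        "git clone https://github.com/{}.git /repo 2>&1 && git -C /repo checkout {}",
        repo, base_commit
    ))
}
```

This validate-before-format ordering mirrors what `DockerSandbox::start` and the harness's step 0 now do, so new call sites can follow the same shape rather than re-deriving their own escaping rules.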