diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs index 2adf200..93bfb87 100644 --- a/src/swe/docker_sandbox.rs +++ b/src/swe/docker_sandbox.rs @@ -5,11 +5,36 @@ use anyhow::Result; use std::process::Stdio; +use std::sync::atomic::{AtomicU16, Ordering}; use tokio::process::Command; use crate::swe::tool_server::TOOL_SERVER_PY; use crate::swe::{validate_file_path, validate_git_ref, validate_repo_name}; +/// Global atomic port counter to guarantee unique ports across all concurrent containers. +static NEXT_PORT: AtomicU16 = AtomicU16::new(10_000); + +/// Allocate a unique port for a tool server. +/// +/// Uses an atomic counter to guarantee no two concurrent sandboxes get the same port. +/// Wraps around from 60_000 back to 10_000. +fn allocate_port() -> u16 { + loop { + let current = NEXT_PORT.load(Ordering::Relaxed); + let next = if current >= 60_000 { + 10_000 + } else { + current + 1 + }; + if NEXT_PORT + .compare_exchange(current, next, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + return current; + } + } +} + /// Shell command output from inside the container. pub struct SandboxOutput { pub stdout: String, @@ -22,6 +47,8 @@ pub struct DockerSandbox { container_name: String, /// Unique port for the tool server (needed because --network=host shares port space). tool_port: u16, + /// Whether the tool server started successfully. + tool_server_ok: bool, } /// Pick a Docker image appropriate for the given language. @@ -51,8 +78,7 @@ impl DockerSandbox { .as_millis() % 1_000_000; let container_name = format!("swe-mine-{}-{}", safe_name, ts_suffix); - // Derive unique tool server port from timestamp (range 10000-59999) - let tool_port = 10_000 + (ts_suffix as u16 % 50_000); + let tool_port = allocate_port(); // Remove stale container if it exists if let Err(e) = Command::new("docker") @@ -90,9 +116,10 @@ impl DockerSandbox { ); } - let sandbox = Self { + let mut sandbox = Self { container_name, tool_port, + tool_server_ok: false, }; // Install git (only hard dependency; agent installs everything else) @@ -103,10 +130,11 @@ impl DockerSandbox { ) .await; if install.exit_code != 0 { - tracing::warn!( - container = %sandbox.container_name, - stderr = %install.stderr, - "git install failed (continuing)" + sandbox.destroy().await; + anyhow::bail!( + "git install failed in container '{}': {}", + sandbox.container_name, + install.stderr ); } @@ -131,17 +159,23 @@ impl DockerSandbox { ) .await; if checkout.exit_code != 0 { - tracing::warn!( - container = %sandbox.container_name, - commit = base_commit, - stderr = %checkout.stderr, - "Checkout failed (continuing on HEAD)" + sandbox.destroy().await; + anyhow::bail!( + "Checkout of commit {} failed in container '{}': {}", + base_commit, + sandbox.container_name, + truncate(&checkout.stderr, 500) ); } } // Inject and start the tool server - sandbox.start_tool_server().await; + sandbox.tool_server_ok = sandbox.start_tool_server().await; + if sandbox.tool_server_ok { + tracing::debug!(container = %sandbox.container_name, port = sandbox.tool_port, "Tool server started"); + } else { + tracing::debug!(container = %sandbox.container_name, "Tool server unavailable, shell fallback will be used"); + } tracing::info!( container = %sandbox.container_name, @@ -154,53 +188,103 @@ impl DockerSandbox { } /// Write and start the Python tool server inside the container. - async fn start_tool_server(&self) { - // Write the server script - let mkdir = self.exec("mkdir -p /tools", 10_000).await; - if mkdir.exit_code != 0 { - tracing::warn!(container = %self.container_name, "Failed to create /tools dir"); - return; - } + /// + /// Returns `true` if the tool server started successfully, `false` otherwise. + /// On failure, the caller should fall back to shell-based tool execution. + async fn start_tool_server(&self) -> bool { + for retry in 0..2 { + if retry > 0 { + tracing::debug!( + container = %self.container_name, + retry = retry, + "Retrying tool server startup" + ); + // Kill any leftover process from previous attempt + self.exec( + "pkill -f 'python3.*server.py' 2>/dev/null; rm -f /tools/server.log", + 5_000, + ) + .await; + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + } - // Use write_file to inject the server script - if let Err(e) = self - .write_file_abs("/tools/server.py", TOOL_SERVER_PY) - .await - { - tracing::warn!(container = %self.container_name, error = %e, "Failed to write tool server"); - return; - } + // Write the server script + let mkdir = self.exec("mkdir -p /tools", 10_000).await; + if mkdir.exit_code != 0 { + tracing::debug!(container = %self.container_name, "Failed to create /tools dir"); + continue; + } - // Start server in background with unique port (--network=host shares port space) - let start_cmd = format!( - "nohup python3 -u /tools/server.py --port {} --cwd /repo > /tools/server.log 2>&1 &", - self.tool_port - ); - let start = self.exec(&start_cmd, 5_000).await; - if start.exit_code != 0 { - tracing::warn!( - container = %self.container_name, - stderr = %start.stderr, - "Tool server start may have failed" + if let Err(e) = self + .write_file_abs("/tools/server.py", TOOL_SERVER_PY) + .await + { + tracing::debug!(container = %self.container_name, error = %e, "Failed to write tool server"); + continue; + } + + // Verify the script was written correctly + let verify = self + .exec("wc -c < /tools/server.py 2>/dev/null", 5_000) + .await; + let written_bytes: usize = verify.stdout.trim().parse().unwrap_or(0); + if written_bytes < TOOL_SERVER_PY.len() / 2 { + tracing::debug!( + container = %self.container_name, + expected = TOOL_SERVER_PY.len(), + actual = written_bytes, + "Tool server script truncated, retrying" + ); + continue; + } + + // Start server in background with unique port (--network=host shares port space) + let start_cmd = format!( + "nohup python3 -u /tools/server.py --port {} --cwd /repo > /tools/server.log 2>&1 &", + self.tool_port ); - } + let start = self.exec(&start_cmd, 5_000).await; + if start.exit_code != 0 { + tracing::debug!( + container = %self.container_name, + stderr = %start.stderr, + "Tool server start command failed" + ); + continue; + } - // Health check: use python3 urllib (curl is not installed in slim images) - for attempt in 0..6 { - tokio::time::sleep(std::time::Duration::from_millis(500)).await; - let health = self.tool_server_health().await; - if health { - tracing::debug!(container = %self.container_name, attempt = attempt, "Tool server healthy"); - return; + // Health check: 12 attempts × 500ms = 6s total + for attempt in 0..12 { + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + if self.tool_server_health().await { + tracing::debug!( + container = %self.container_name, + attempt = attempt, + retry = retry, + "Tool server healthy" + ); + return true; + } } + + // Log server output for debugging on this retry + let log = self.exec("cat /tools/server.log 2>/dev/null", 5_000).await; + tracing::debug!( + container = %self.container_name, + retry = retry, + server_log = %log.stdout, + "Tool server health check failed after 6s" + ); } - // Log server output for debugging + + // All retries exhausted let log = self.exec("cat /tools/server.log 2>/dev/null", 5_000).await; tracing::warn!( container = %self.container_name, server_log = %log.stdout, - "Tool server health check failed after 3s, tools may not work" + "Tool server failed to start after retries, falling back to shell tools" ); + false } /// Check if the tool server is healthy via python3 urllib inside the container. @@ -229,6 +313,11 @@ impl DockerSandbox { matches!(result, Ok(Ok(status)) if status.success()) } + /// Whether the tool server is available for HTTP-based tool requests. + pub fn has_tool_server(&self) -> bool { + self.tool_server_ok + } + /// Call a tool on the HTTP tool server running inside the container. /// Pipes the JSON args via stdin to avoid shell escaping issues. pub async fn tool_request(&self, tool_name: &str, args_json: &str) -> SandboxOutput { @@ -590,4 +679,21 @@ mod tests { assert!(output.stdout.is_empty()); assert_eq!(output.exit_code, 0); } + + #[test] + fn test_allocate_port_returns_valid_range() { + let port = allocate_port(); + assert!(port >= 10_000); + assert!(port <= 60_000); + } + + #[test] + fn test_allocate_port_sequential_unique() { + let p1 = allocate_port(); + let p2 = allocate_port(); + let p3 = allocate_port(); + assert_ne!(p1, p2); + assert_ne!(p2, p3); + assert_ne!(p1, p3); + } } diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs index 653be13..b22c5ce 100644 --- a/src/swe/test_generator.rs +++ b/src/swe/test_generator.rs @@ -12,7 +12,7 @@ use crate::llm::{ GenerationRequest, LlmProvider, Message, ToolCallInfo, ToolChoice, ToolDefinition, }; use crate::swe::docker_sandbox::DockerSandbox; -use crate::swe::SweTask; +use crate::swe::{validate_file_path, SweTask}; const MAX_AGENT_TURNS: usize = 200; const MAX_VALIDATION_RETRIES: usize = 3; @@ -871,44 +871,36 @@ impl TestGenerator { } } "read_file" | "list_dir" | "grep_files" | "search_files" | "apply_patch" => { - let result = sandbox - .tool_request(tc.function.name.as_str(), &tc.function.arguments) - .await; - let output = if !result.stdout.is_empty() { - // Parse JSON response from tool server - match serde_json::from_str::(result.stdout.trim()) { - Ok(v) => { - if let Some(err) = v.get("error").and_then(|e| e.as_str()) { - err.to_string() - } else if let Some(out) = v.get("output").and_then(|o| o.as_str()) { - let mut s = out.to_string(); - if let Some(total) = v.get("total_lines").and_then(|t| t.as_u64()) { - if v.get("truncated") - .and_then(|t| t.as_bool()) - .unwrap_or(false) - { - s.push_str(&format!("\n\n[Showing {}/{} lines. Use offset/limit to see more.]", - v.get("shown_lines").and_then(|s| s.as_u64()).unwrap_or(0), total)); - } else { - s.push_str(&format!("\n\n[{} total lines]", total)); - } - } - s - } else { - result.stdout.clone() - } - } - Err(_) => result.stdout.clone(), + let tool_name = tc.function.name.as_str(); + let args_json = &tc.function.arguments; + + let output = if sandbox.has_tool_server() { + let result = sandbox.tool_request(tool_name, args_json).await; + let server_down = result.exit_code != 0 + && result.stdout.is_empty() + && (result.stderr.contains("Connection refused") + || result.stderr.contains("URLError") + || result.stderr.contains("Tool request error")); + if server_down { + tracing::debug!( + task_id = task_id, + turn = turn, + tool = tool_name, + "Tool server unavailable, falling back to shell" + ); + shell_fallback(sandbox, tool_name, args_json).await + } else { + parse_tool_response(&result) } - } else if !result.stderr.is_empty() { - format!("Error: {}", truncate_utf8(&result.stderr, 1500)) } else { - "No output".to_string() + shell_fallback(sandbox, tool_name, args_json).await }; + tracing::debug!( - task_id = task_id, turn = turn, - tool = %tc.function.name, - "Agent tool call via HTTP server" + task_id = task_id, + turn = turn, + tool = tool_name, + "Agent tool call" ); ToolResult::ShellOutput(truncate_utf8(&output, 4000).to_string()) } @@ -927,6 +919,190 @@ enum ToolResult { Error(String), } +/// Parse a JSON response from the tool server into a user-friendly string. +fn parse_tool_response(result: &crate::swe::docker_sandbox::SandboxOutput) -> String { + if !result.stdout.is_empty() { + match serde_json::from_str::(result.stdout.trim()) { + Ok(v) => { + if let Some(err) = v.get("error").and_then(|e| e.as_str()) { + err.to_string() + } else if let Some(out) = v.get("output").and_then(|o| o.as_str()) { + let mut s = out.to_string(); + if let Some(total) = v.get("total_lines").and_then(|t| t.as_u64()) { + if v.get("truncated") + .and_then(|t| t.as_bool()) + .unwrap_or(false) + { + s.push_str(&format!( + "\n\n[Showing {}/{} lines. Use offset/limit to see more.]", + v.get("shown_lines").and_then(|sl| sl.as_u64()).unwrap_or(0), + total + )); + } else { + s.push_str(&format!("\n\n[{} total lines]", total)); + } + } + s + } else { + result.stdout.clone() + } + } + Err(_) => result.stdout.clone(), + } + } else if !result.stderr.is_empty() { + format!("Error: {}", truncate_utf8(&result.stderr, 1500)) + } else { + "No output".to_string() + } +} + +/// Escape a string for safe inclusion inside a POSIX single-quoted shell argument. +/// +/// Replaces each `'` with `'\''` (end quote, backslash-escaped literal quote, +/// re-open quote). This is the standard POSIX technique for embedding single +/// quotes inside single-quoted strings and prevents shell breakout. +fn sanitize_shell_arg(s: &str) -> String { + s.replace('\'', "'\\''") +} + +/// Execute a tool via direct shell commands when the HTTP tool server is unavailable. +/// +/// All values interpolated into shell commands are either validated via +/// [`validate_file_path`] (for filesystem paths) or escaped via +/// [`sanitize_shell_arg`] (for patterns/globs that may contain special chars). +async fn shell_fallback(sandbox: &DockerSandbox, tool_name: &str, args_json: &str) -> String { + let args: serde_json::Value = match serde_json::from_str(args_json) { + Ok(v) => v, + Err(e) => return format!("Invalid tool args: {}", e), + }; + + match tool_name { + "read_file" => { + let file_path = args.get("file_path").and_then(|v| v.as_str()).unwrap_or(""); + if file_path.is_empty() { + return "Error: missing file_path".to_string(); + } + if let Err(e) = validate_file_path(file_path) { + return format!("Error: invalid file_path: {}", e); + } + let offset = args.get("offset").and_then(|v| v.as_u64()).unwrap_or(0); + let limit = args.get("limit").and_then(|v| v.as_u64()); + let cmd = match limit { + Some(lim) => format!( + "awk 'NR>{} && NR<={}{{print NR\": \"$0}}' '{}'", + offset, + offset + lim, + file_path + ), + None => format!("awk '{{print NR\": \"$0}}' '{}'", file_path), + }; + let result = sandbox.exec(&cmd, 10_000).await; + if result.exit_code != 0 { + format!("Error reading file: {}", result.stderr) + } else if result.stdout.is_empty() { + "(empty file)".to_string() + } else { + result.stdout + } + } + "list_dir" => { + let dir = args + .get("directory_path") + .and_then(|v| v.as_str()) + .unwrap_or("."); + if dir != "." { + if let Err(e) = validate_file_path(dir) { + return format!("Error: invalid directory_path: {}", e); + } + } + let cmd = format!("ls -la '{}'", sanitize_shell_arg(dir)); + let result = sandbox.exec(&cmd, 10_000).await; + if result.exit_code != 0 { + format!("Error listing directory: {}", result.stderr) + } else { + result.stdout + } + } + "grep_files" => { + let pattern = args.get("pattern").and_then(|v| v.as_str()).unwrap_or(""); + if pattern.is_empty() { + return "Error: missing pattern".to_string(); + } + let path = args.get("path").and_then(|v| v.as_str()).unwrap_or("."); + if path != "." { + if let Err(e) = validate_file_path(path) { + return format!("Error: invalid path: {}", e); + } + } + let include = args.get("include").and_then(|v| v.as_str()); + let limit = args.get("limit").and_then(|v| v.as_u64()).unwrap_or(100); + let include_arg = match include { + Some(glob) => format!(" --include='{}'", sanitize_shell_arg(glob)), + None => String::new(), + }; + let cmd = format!( + "grep -rn --color=never{} '{}' '{}' | head -n {}", + include_arg, + sanitize_shell_arg(pattern), + sanitize_shell_arg(path), + limit + ); + let result = sandbox.exec(&cmd, 30_000).await; + if result.stdout.is_empty() { + "No matches found.".to_string() + } else { + result.stdout + } + } + "search_files" => { + let pattern = args.get("pattern").and_then(|v| v.as_str()).unwrap_or(""); + if pattern.is_empty() { + return "Error: missing pattern".to_string(); + } + let path = args.get("path").and_then(|v| v.as_str()).unwrap_or("."); + if path != "." { + if let Err(e) = validate_file_path(path) { + return format!("Error: invalid path: {}", e); + } + } + let cmd = format!( + "find '{}' -name '{}' -not -path '*/.git/*' -not -path '*/node_modules/*' | sort", + sanitize_shell_arg(path), + sanitize_shell_arg(pattern) + ); + let result = sandbox.exec(&cmd, 30_000).await; + if result.stdout.is_empty() { + format!("No files matching '{}'", pattern) + } else { + result.stdout + } + } + "apply_patch" => { + let patch = args.get("patch").and_then(|v| v.as_str()).unwrap_or(""); + if patch.is_empty() { + return "Error: missing patch".to_string(); + } + match sandbox.write_file(".swe_forge_tool_patch.tmp", patch).await { + Ok(_) => { + let result = sandbox + .exec( + "git apply --allow-empty .swe_forge_tool_patch.tmp 2>&1 && rm -f .swe_forge_tool_patch.tmp", + 30_000, + ) + .await; + if result.exit_code == 0 { + "Patch applied successfully.".to_string() + } else { + format!("git apply failed: {}", result.stdout) + } + } + Err(e) => format!("Failed to write patch file: {}", e), + } + } + _ => format!("Unknown tool: {}", tool_name), + } +} + /// Scan test files for string-matching anti-patterns and return a rejection reason if found. fn reject_string_matching_tests(files: &[TestFile]) -> Option { let patterns: &[(&str, &str)] = &[ diff --git a/src/swe/tool_server.rs b/src/swe/tool_server.rs index 8683377..b0f6c0e 100644 --- a/src/swe/tool_server.rs +++ b/src/swe/tool_server.rs @@ -2,7 +2,7 @@ /// Adapted from baseagent (https://github.com/PlatformNetwork/baseagent). /// /// Provides structured, token-efficient tools: read_file, list_dir, -/// grep_files, search_files, apply_patch. Runs on port 8080. +/// grep_files, search_files, apply_patch. Default port 8080, overridden via `--port` flag. pub const TOOL_SERVER_PY: &str = r#####"#!/usr/bin/env python3 """HTTP tool server for swe-forge Docker containers. Adapted from baseagent tools. Provides structured file exploration tools @@ -305,7 +305,12 @@ def main(): elif arg == "--cwd" and i < len(sys.argv) - 1: CWD = Path(sys.argv[i + 1]) - server = HTTPServer(("0.0.0.0", port), ToolHandler) + HTTPServer.allow_reuse_address = True + try: + server = HTTPServer(("0.0.0.0", port), ToolHandler) + except OSError as e: + print(f"FATAL: Cannot bind to port {port}: {e}", flush=True) + sys.exit(1) print(f"Tool server listening on port {port}, cwd={CWD}", flush=True) server.serve_forever()