diff --git a/.gitignore b/.gitignore
index 3d29c38..5f04bf9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,3 +42,5 @@ test-verify/
 *.db
 *.db-shm
 *.db-wal
+mine_test.log
+test-easy-output/
diff --git a/AGENTS.md b/AGENTS.md
index 2cf3ea6..66b8063 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -28,6 +28,7 @@ src/
 │   ├── pr_cache.rs          # SQLite-backed PR deduplication cache
 │   ├── progress.rs          # Background progress monitor for long-running pipeline runs
 │   ├── github_search.rs     # GitHub Search API client (alternative PR source)
+│   ├── tool_server.rs       # Embedded Python HTTP tool server injected into Docker containers
 │   └── workspace_validator.rs # Pre-export workspace validation (Docker-based)
 ├── llm/                     # LLM integration layer
 │   ├── litellm.rs           # OpenAI-compatible API client (function calling, tools)
@@ -41,7 +42,7 @@ src/
 ├── export/                  # Parquet dataset export + HuggingFace Hub upload
 ├── difficulty/              # Difficulty levels, resource limits, scoring
 ├── anti_hardcoding/         # Canary strings, sealed parameters, contamination detection
-├── runner/                  # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs]
+├── runner/                  # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs; exists as scaffolding]
 ├── utils/                   # JSON extraction from LLM responses
 └── error.rs                 # Typed error hierarchy (thiserror)
 ```
@@ -135,13 +136,13 @@ Git hooks are in `.githooks/` and activated via `git config core.hooksPath .gith

 3. **Never leak test plans into agent prompts** — The `prompt_rewriter.rs` module strips test-specific information from PR bodies before generating `prompt.md`. Any new prompt generation code must ensure `fail_to_pass` and `pass_to_pass` test commands are never visible to the agent being evaluated.

-4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: memory (512MB–2GB), CPU (1–4 cores), PIDs (100–500). Never create containers without limits.
+4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: PIDs (100–500), storage (1–5 GB), network mode (none/internal). Never create containers without limits.

 5. **Respect GitHub API rate limits (5000 req/h)** — The pipeline uses semaphore-based concurrency (no chunk barriers). Each candidate needs ~2 API calls for enrichment. Never add unbounded concurrent GitHub API calls. Use the existing concurrency limits (enrichment: 10x, pre-classification: 25x, deep processing: 8x).

 6. **All async code must be `Send + Sync` compatible** — The codebase uses `Arc` extensively. Trait objects must be `Send + Sync`. Never introduce `Rc`, `RefCell`, or non-Send types in async contexts.

-7. **Serde rename conventions must be `snake_case`** — All serializable enums use `#[serde(rename_all = "snake_case")]`. Task status, difficulty levels, and all API-facing types must follow this convention for YAML/JSON compatibility.
+7. **Serde rename conventions** — Most serializable enums use `#[serde(rename_all = "snake_case")]` (e.g., `SweTaskStatus`, `HarnessStatus`, `CheckType`). `DifficultyLevel` and `NetworkMode` in `src/difficulty/` use `#[serde(rename_all = "lowercase")]`. Follow the existing convention for each type for YAML/JSON compatibility.

 8. **Anti-hardcoding mechanisms must be preserved** — The `anti_hardcoding/` module provides canary strings, sealed parameters, and process validation. Never bypass contamination detection. Any new task generation must embed canary strings via `CanaryConfig::generate()`.
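For quick reference, this is how the two rename conventions described in rule 7 behave. The enum names come from the rule itself; the variants and the standalone setup are illustrative, not the crate's actual definitions:

```rust
use serde::Serialize;

// Hedged sketch of AGENTS.md rule 7 — variant names are hypothetical.
#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
enum SweTaskStatus {
    InProgress, // serializes as "in_progress"
    Completed,  // serializes as "completed"
}

#[derive(Serialize)]
#[serde(rename_all = "lowercase")]
enum DifficultyLevel {
    Easy,   // serializes as "easy"
    Medium, // serializes as "medium"
}

fn main() {
    // Prints "in_progress" then "medium" (as quoted JSON strings).
    println!("{}", serde_json::to_string(&SweTaskStatus::InProgress).unwrap());
    println!("{}", serde_json::to_string(&DifficultyLevel::Medium).unwrap());
}
```
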
diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs
index 4e82dc6..820cda7 100644
--- a/src/llm/litellm.rs
+++ b/src/llm/litellm.rs
@@ -324,16 +324,26 @@ impl LiteLlmClient {
     /// * `api_base` - Base URL for the LiteLLM API (e.g., "http://localhost:4000")
     /// * `api_key` - Optional API key for authentication
     /// * `default_model` - Default model to use when none is specified
-    pub fn new(api_base: String, api_key: Option<String>, default_model: String) -> Self {
-        Self {
+    ///
+    /// # Errors
+    ///
+    /// Returns `LlmError::RequestFailed` if the HTTP client cannot be built.
+    pub fn new(
+        api_base: String,
+        api_key: Option<String>,
+        default_model: String,
+    ) -> Result<Self, LlmError> {
+        Ok(Self {
             api_base,
             api_key,
             default_model,
             http_client: Client::builder()
                 .timeout(Duration::from_secs(300))
                 .build()
-                .expect("Failed to build HTTP client"),
-        }
+                .map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
+                })?,
+        })
     }

     /// Create a new LiteLLM client pre-configured for OpenRouter.
@@ -347,16 +357,22 @@ impl LiteLlmClient {
     /// A client configured with:
     /// - api_base: "https://openrouter.ai/api/v1"
     /// - default_model: "anthropic/claude-opus-4.5"
-    pub fn new_with_defaults(api_key: String) -> Self {
-        Self {
+    ///
+    /// # Errors
+    ///
+    /// Returns `LlmError::RequestFailed` if the HTTP client cannot be built.
+    pub fn new_with_defaults(api_key: String) -> Result<Self, LlmError> {
+        Ok(Self {
             api_base: "https://openrouter.ai/api/v1".to_string(),
             api_key: Some(api_key),
             default_model: "anthropic/claude-opus-4.5".to_string(),
             http_client: Client::builder()
                 .timeout(Duration::from_secs(300))
                 .build()
-                .expect("Failed to build HTTP client"),
-        }
+                .map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
+                })?,
+        })
     }

     /// Create a new LiteLLM client from environment variables.
@@ -382,7 +398,9 @@ impl LiteLlmClient {
             http_client: Client::builder()
                 .timeout(Duration::from_secs(300))
                 .build()
-                .expect("Failed to build HTTP client"),
+                .map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
+                })?,
         })
     }

@@ -490,6 +508,10 @@ struct ApiRequest {
     top_p: Option<f32>,
     #[serde(skip_serializing_if = "Option::is_none")]
     response_format: Option<ResponseFormat>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    tools: Option<serde_json::Value>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    tool_choice: Option<serde_json::Value>,
 }

 /// Internal response structure from the OpenAI-compatible API.
@@ -506,14 +528,42 @@ struct ApiResponse {
 struct ApiChoice {
     index: u32,
     message: ApiMessage,
-    finish_reason: String,
+    #[serde(default)]
+    finish_reason: Option<String>,
 }

 /// Internal message structure from the API response.
+/// Supports reasoning models and tool calls.
 #[derive(Debug, Deserialize)]
 struct ApiMessage {
     role: String,
+    #[serde(default)]
     content: String,
+    #[serde(default)]
+    reasoning: Option<String>,
+    #[serde(default)]
+    reasoning_content: Option<String>,
+    #[serde(default)]
+    tool_calls: Option<Vec<ApiToolCall>>,
+}
+
+/// A tool call returned by the model.
+#[derive(Debug, Deserialize)]
+struct ApiToolCall {
+    #[serde(default)]
+    id: String,
+    #[serde(rename = "type", default)]
+    _tool_type: String,
+    function: ApiToolCallFunction,
+}
+
+/// Function details within a tool call response.
+#[derive(Debug, Deserialize)]
+struct ApiToolCallFunction {
+    #[serde(default)]
+    name: String,
+    #[serde(default)]
+    arguments: String,
 }

 /// Internal usage structure from the API response.
@@ -549,6 +599,20 @@ impl LlmProvider for LiteLlmClient {
             request.model.clone()
         };

+        let tools =
+            match request.tools {
+                Some(t) => Some(serde_json::to_value(&t).map_err(|e| {
+                    LlmError::RequestFailed(format!("Failed to serialize tools: {e}"))
+                })?),
+                None => None,
+            };
+        let tool_choice = match request.tool_choice {
+            Some(tc) => Some(serde_json::to_value(&tc).map_err(|e| {
+                LlmError::RequestFailed(format!("Failed to serialize tool_choice: {e}"))
+            })?),
+            None => None,
+        };
+
         let api_request = ApiRequest {
             model: model.clone(),
             messages: request.messages,
@@ -556,6 +620,8 @@ impl LlmProvider for LiteLlmClient {
             max_tokens: request.max_tokens,
             top_p: request.top_p,
             response_format: request.response_format,
+            tools,
+            tool_choice,
         };

         let url = format!("{}/chat/completions", self.api_base);
@@ -617,15 +683,52 @@ impl LlmProvider for LiteLlmClient {
         let choices = api_response
             .choices
             .into_iter()
-            .map(|choice| Choice {
-                index: choice.index,
-                message: Message {
-                    role: choice.message.role,
-                    content: choice.message.content,
-                    tool_calls: None,
-                    tool_call_id: None,
-                },
-                finish_reason: choice.finish_reason,
+            .map(|choice| {
+                let tool_calls_info = choice.message.tool_calls.as_ref().map(|tcs| {
+                    tcs.iter()
+                        .map(|tc| ToolCallInfo {
+                            id: tc.id.clone(),
+                            call_type: "function".to_string(),
+                            function: ToolCallFunction {
+                                name: tc.function.name.clone(),
+                                arguments: tc.function.arguments.clone(),
+                            },
+                        })
+                        .collect::<Vec<_>>()
+                });
+
+                let content = if let Some(ref tool_calls) = choice.message.tool_calls {
+                    if let Some(first_call) = tool_calls.first() {
+                        if !first_call.function.arguments.is_empty() {
+                            first_call.function.arguments.clone()
+                        } else {
+                            choice.message.content.clone()
+                        }
+                    } else {
+                        choice.message.content.clone()
+                    }
+                } else if !choice.message.content.trim().is_empty() {
+                    choice.message.content
+                } else if let Some(rc) = choice.message.reasoning_content {
+                    if !rc.trim().is_empty() {
+                        rc
+                    } else {
+                        choice.message.reasoning.unwrap_or_default()
+                    }
+                } else {
+                    choice.message.reasoning.unwrap_or_default()
+                };
+
+                Choice {
+                    index: choice.index,
+                    message: Message {
+                        role: choice.message.role,
+                        content,
+                        tool_calls: tool_calls_info,
+                        tool_call_id: None,
+                    },
+                    finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".to_string()),
+                }
             })
             .collect();

@@ -849,7 +952,8 @@ mod tests {
             "http://localhost:4000".to_string(),
             Some("test-key".to_string()),
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();

         assert_eq!(client.api_base(), "http://localhost:4000");
         assert_eq!(client.default_model(), "gpt-4");
@@ -862,14 +966,15 @@ mod tests {
             "http://localhost:4000".to_string(),
             None,
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();

         assert!(!client.has_api_key());
     }

     #[test]
     fn test_litellm_client_new_with_defaults() {
-        let client = LiteLlmClient::new_with_defaults("test-api-key".to_string());
+        let client = LiteLlmClient::new_with_defaults("test-api-key".to_string()).unwrap();

         assert_eq!(client.api_base(), "https://openrouter.ai/api/v1");
         assert_eq!(client.default_model(), "anthropic/claude-opus-4.5");
@@ -883,7 +988,8 @@ mod tests {
             "http://localhost:65535".to_string(), // Use a port that's unlikely to have a server
             None,
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();

         let request = GenerationRequest::new("gpt-4", vec![Message::user("test")]);
         let result = client.generate(request).await;
@@ -903,6 +1009,8 @@ mod tests {
             max_tokens: Some(1000),
             top_p: None,
             response_format: None,
+            tools: None,
+            tool_choice: None,
         };

         let json = serde_json::to_string(&request).expect("serialization should succeed");
@@ -910,6 +1018,8 @@ mod tests {
         assert!(json.contains("\"temperature\":0.7"));
         assert!(json.contains("\"max_tokens\":1000"));
         assert!(!json.contains("top_p")); // Should be skipped because None
+        assert!(!json.contains("tools")); // Should be skipped because None
+        assert!(!json.contains("tool_choice")); // Should be skipped because None
     }

     #[tokio::test]
@@ -918,7 +1028,8 @@ mod tests {
             "http://localhost:65535".to_string(),
             None,
             "gpt-4".to_string(),
-        );
+        )
+        .unwrap();

         let cache = PromptCache::new(100);

         // Create a request with a system prompt
diff --git a/src/swe/AGENTS.md b/src/swe/AGENTS.md
index 3ee4bc2..ac52fef 100644
--- a/src/swe/AGENTS.md
+++ b/src/swe/AGENTS.md
@@ -22,6 +22,7 @@ Core SWE mining pipeline. Fetches merged pull requests from GH Archive, enriches
 | `pipeline.rs` | Streaming pipeline with semaphore-based concurrency |
 | `github_search.rs` | `GitHubSearchClient` — GitHub Search API as alternative PR source (30 req/min) |
 | `workspace_validator.rs` | `WorkspaceValidator` — pre-export Docker-based validation (install, tests, patch application) |
+| `tool_server.rs` | Embedded Python HTTP tool server injected into Docker containers (read_file, list_dir, grep_files, search_files, apply_patch) |
 | `pr_cache.rs` | SQLite-backed PR deduplication cache |
 | `progress.rs` | `ProgressMonitor` — background progress logging for long-running pipeline runs |
diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs
index 6e9899a..2adf200 100644
--- a/src/swe/docker_sandbox.rs
+++ b/src/swe/docker_sandbox.rs
@@ -8,6 +8,7 @@ use std::process::Stdio;
 use tokio::process::Command;

 use crate::swe::tool_server::TOOL_SERVER_PY;
+use crate::swe::{validate_file_path, validate_git_ref, validate_repo_name};

 /// Shell command output from inside the container.
 pub struct SandboxOutput {
@@ -37,6 +38,11 @@ impl DockerSandbox {
         language: &str,
         image_override: Option<&str>,
     ) -> Result<Self> {
+        validate_repo_name(repo)?;
+        if !base_commit.is_empty() {
+            validate_git_ref(base_commit)?;
+        }
+
         let image = image_override.unwrap_or_else(|| image_for_language(language));
         let safe_name = repo.replace('/', "-").replace(' ', "_");
         let ts_suffix = std::time::SystemTime::now()
@@ -49,12 +55,15 @@ impl DockerSandbox {
         let tool_port = 10_000 + (ts_suffix as u16 % 50_000);

         // Remove stale container if it exists
-        let _ = Command::new("docker")
+        if let Err(e) = Command::new("docker")
             .args(["rm", "-f", &container_name])
             .stdout(Stdio::null())
             .stderr(Stdio::null())
             .status()
-            .await;
+            .await
+        {
+            tracing::debug!(container = %container_name, error = %e, "Failed to remove stale container (may not exist)");
+        }

         let run_output = Command::new("docker")
             .args([
@@ -81,7 +90,10 @@ impl DockerSandbox {
             );
         }

-        let sandbox = Self { container_name, tool_port };
+        let sandbox = Self {
+            container_name,
+            tool_port,
+        };

         // Install git (only hard dependency; agent installs everything else)
         let install = sandbox
@@ -99,10 +111,7 @@ impl DockerSandbox {
         }

         // Clone the repository (full clone for reliable checkout)
-        let clone_cmd = format!(
-            "git clone https://github.com/{}.git /repo 2>&1",
-            repo
-        );
+        let clone_cmd = format!("git clone https://github.com/{}.git /repo 2>&1", repo);
         let clone = sandbox.exec(&clone_cmd, 600_000).await;
         if clone.exit_code != 0 {
             sandbox.destroy().await;
@@ -154,7 +163,10 @@ impl DockerSandbox {
         }

         // Use write_file to inject the server script
-        if let Err(e) = self.write_file_abs("/tools/server.py", TOOL_SERVER_PY).await {
+        if let Err(e) = self
+            .write_file_abs("/tools/server.py", TOOL_SERVER_PY)
+            .await
+        {
             tracing::warn!(container = %self.container_name, error = %e, "Failed to write tool server");
             return;
         }
@@ -201,8 +213,13 @@ impl DockerSandbox {
             std::time::Duration::from_millis(2_000),
             Command::new("docker")
                 .args([
-                    "exec", "-w", "/repo", &self.container_name,
-                    "python3", "-c", &check_cmd,
+                    "exec",
+                    "-w",
+                    "/repo",
+                    &self.container_name,
+                    "python3",
+                    "-c",
+                    &check_cmd,
                 ])
                 .stdout(Stdio::null())
                 .stderr(Stdio::null())
@@ -215,6 +232,16 @@ impl DockerSandbox {

     /// Call a tool on the HTTP tool server running inside the container.
     /// Pipes the JSON args via stdin to avoid shell escaping issues.
     pub async fn tool_request(&self, tool_name: &str, args_json: &str) -> SandboxOutput {
+        for ch in tool_name.chars() {
+            if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
+                return SandboxOutput {
+                    stdout: String::new(),
+                    stderr: format!("Invalid tool name: {}", tool_name),
+                    exit_code: -1,
+                };
+            }
+        }
+
         let script = format!(
             "import sys, urllib.request, json\n\
              data = sys.stdin.read()\n\
@@ -230,29 +257,31 @@ impl DockerSandbox {
             self.tool_port, tool_name
         );

-        let result = tokio::time::timeout(
-            std::time::Duration::from_millis(65_000),
-            async {
-                let mut child = Command::new("docker")
-                    .args([
-                        "exec", "-i", "-w", "/repo",
-                        &self.container_name,
-                        "python3", "-c", &script,
-                    ])
-                    .stdin(Stdio::piped())
-                    .stdout(Stdio::piped())
-                    .stderr(Stdio::piped())
-                    .spawn()?;
-
-                if let Some(ref mut stdin) = child.stdin {
-                    use tokio::io::AsyncWriteExt;
-                    stdin.write_all(args_json.as_bytes()).await?;
-                    stdin.shutdown().await?;
-                }
-
-                child.wait_with_output().await
+        let result = tokio::time::timeout(std::time::Duration::from_millis(65_000), async {
+            let mut child = Command::new("docker")
+                .args([
+                    "exec",
+                    "-i",
+                    "-w",
+                    "/repo",
+                    &self.container_name,
+                    "python3",
+                    "-c",
+                    &script,
+                ])
+                .stdin(Stdio::piped())
+                .stdout(Stdio::piped())
+                .stderr(Stdio::piped())
+                .spawn()?;
+
+            if let Some(ref mut stdin) = child.stdin {
+                use tokio::io::AsyncWriteExt;
+                stdin.write_all(args_json.as_bytes()).await?;
+                stdin.shutdown().await?;
             }
-        )
+
+            child.wait_with_output().await
+        })
         .await;

         match result {
@@ -315,14 +344,63 @@ impl DockerSandbox {
     }

     /// Write a file to an absolute path inside the container.
+    ///
+    /// Only allows paths under known safe prefixes (`/tools/`).
     async fn write_file_abs(&self, abs_path: &str, content: &str) -> Result<()> {
+        if !abs_path.starts_with("/tools/") {
+            anyhow::bail!(
+                "write_file_abs only allows paths under /tools/, got '{}'",
+                abs_path
+            );
+        }
+        for ch in abs_path.chars() {
+            if matches!(
+                ch,
+                '\'' | '"'
+                    | '`'
+                    | '$'
+                    | '!'
+                    | '&'
+                    | '|'
+                    | ';'
+                    | '('
+                    | ')'
+                    | '{'
+                    | '}'
+                    | '<'
+                    | '>'
+                    | '\\'
+                    | '\0'
+                    | '\n'
+                    | '\r'
+            ) {
+                anyhow::bail!(
+                    "invalid character in absolute path '{}': shell metacharacters not allowed",
+                    abs_path
+                );
+            }
+        }
+        if abs_path.contains("..") {
+            anyhow::bail!(
+                "absolute path '{}' contains '..' (path traversal not allowed)",
+                abs_path
+            );
+        }
+
         let mkdir_cmd = format!("mkdir -p \"$(dirname '{}')\"", abs_path);
         self.exec(&mkdir_cmd, 10_000).await;

         let tee_cmd = format!("cat > '{}'", abs_path);
         let mut child = Command::new("docker")
             .args([
-                "exec", "-i", "-w", "/repo", &self.container_name, "bash", "-c", &tee_cmd,
+                "exec",
+                "-i",
+                "-w",
+                "/repo",
+                &self.container_name,
+                "bash",
+                "-c",
+                &tee_cmd,
             ])
             .stdin(Stdio::piped())
             .stdout(Stdio::null())
@@ -348,6 +426,8 @@ impl DockerSandbox {

     /// Write a file inside the container by piping content via stdin.
     pub async fn write_file(&self, path: &str, content: &str) -> Result<()> {
+        validate_file_path(path)?;
+
         // First ensure the parent directory exists
         let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", path);
         self.exec(&mkdir_cmd, 10_000).await;
@@ -389,6 +469,8 @@ impl DockerSandbox {

     /// Read a file from inside the container.
     pub async fn read_file(&self, path: &str) -> Result<String> {
+        validate_file_path(path)?;
+
         let cmd = format!("cat '/repo/{}'", path);
         let result = self.exec(&cmd, 10_000).await;
         if result.exit_code != 0 {
@@ -403,12 +485,15 @@ impl DockerSandbox {

     /// Destroy the container.
     pub async fn destroy(&self) {
-        let _ = Command::new("docker")
+        if let Err(e) = Command::new("docker")
             .args(["rm", "-f", &self.container_name])
             .stdout(Stdio::null())
             .stderr(Stdio::null())
             .status()
-            .await;
+            .await
+        {
+            tracing::debug!(container = %self.container_name, error = %e, "Failed to destroy container");
+        }

         tracing::debug!(container = %self.container_name, "Docker sandbox destroyed");
     }
@@ -416,7 +501,6 @@ impl DockerSandbox {
     pub fn name(&self) -> &str {
         &self.container_name
     }
-
 }

 /// Ensure the sandbox is destroyed when dropped (best-effort sync cleanup).
diff --git a/src/swe/extractor.rs b/src/swe/extractor.rs
index ab84631..871e77d 100644
--- a/src/swe/extractor.rs
+++ b/src/swe/extractor.rs
@@ -4,7 +4,7 @@ use anyhow::Result;

 use crate::swe::docker_sandbox::DockerSandbox;
-use crate::swe::SweTask;
+use crate::swe::{validate_git_ref, SweTask};

 fn github_token() -> Option<String> {
     std::env::var("GITHUB_TOKEN")
@@ -188,11 +188,22 @@ impl PatchExtractor {

     /// Clone the repo inside a Docker container and extract the diff.
     async fn fetch_diff_from_docker(&self, input: &PatchExtractionInput<'_>) -> Result<String> {
+        if let Some(base) = input.base_commit {
+            if !base.is_empty() {
+                validate_git_ref(base)?;
+            }
+        }
+        if let Some(merge) = input.merge_commit {
+            if !merge.is_empty() {
+                validate_git_ref(merge)?;
+            }
+        }
+
         let base_commit = input.base_commit.unwrap_or("");
         let sandbox =
             DockerSandbox::start(input.repository, base_commit, input.language, None).await?;

-        let diff_ref = match (input.base_commit, input.merge_commit) {
+        let diff_cmd = match (input.base_commit, input.merge_commit) {
             (Some(base), Some(merge)) if !base.is_empty() && !merge.is_empty() => {
                 // Fetch the merge commit (shallow clone may not have it)
                 sandbox
@@ -201,7 +212,7 @@ impl PatchExtractor {
                         60_000,
                     )
                     .await;
-                format!("{base}..{merge}")
+                format!("git diff --no-color --unified=3 {} {} 2>&1", base, merge)
             }
             (_, Some(merge)) if !merge.is_empty() => {
                 sandbox
@@ -210,17 +221,12 @@ impl PatchExtractor {
                         60_000,
                     )
                     .await;
-                merge.to_string()
+                format!("git diff --no-color --unified=3 HEAD {} 2>&1", merge)
             }
-            _ => "HEAD".to_string(),
+            _ => "git diff --no-color --unified=3 HEAD~1 HEAD 2>&1".to_string(),
         };

-        let result = sandbox
-            .exec(
-                &format!("git show --no-color --unified=3 {} 2>&1", diff_ref),
-                60_000,
-            )
-            .await;
+        let result = sandbox.exec(&diff_cmd, 60_000).await;

         sandbox.destroy().await;

@@ -263,7 +269,6 @@ impl PatchExtractor {
 #[derive(Default)]
 struct PatchBlock {
     patch: String,
-    lines: usize,
 }

 fn count_line_delta(raw: &str) -> (usize, usize) {
@@ -367,10 +372,8 @@ fn append_to_partition(
 ) {
     if is_test_file(file_path) {
         tests.patch.push_str(block);
-        tests.lines = tests.lines.saturating_add(block.lines().count());
     } else {
         solution.patch.push_str(block);
-        solution.lines = solution.lines.saturating_add(block.lines().count());
     }
 }
diff --git a/src/swe/harness.rs b/src/swe/harness.rs
index 900f751..90146ed 100644
--- a/src/swe/harness.rs
+++ b/src/swe/harness.rs
@@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize};
 use tokio::process::Command;
 use tracing::{info, warn};

-use super::SweTask;
+use super::{validate_file_path, validate_git_ref, validate_repo_name, SweTask};

 // ---------------------------------------------------------------------------
 // Config
@@ -137,13 +137,18 @@ async fn docker_exec(container: &str, cmd: &str, timeout_secs: u64) -> (i32, Str
 }

 async fn docker_rm(container: &str) {
-    let _ = Command::new("docker")
+    if let Err(e) = Command::new("docker")
         .args(["rm", "-f", container])
         .output()
-        .await;
+        .await
+    {
+        tracing::debug!(container = container, error = %e, "Failed to remove container (may not exist)");
+    }
 }

 async fn docker_write_file(container: &str, path: &str, content: &str) -> Result<()> {
+    validate_file_path(path)?;
+
     use tokio::io::AsyncWriteExt;
     let tee_cmd = format!("cat > '/repo/{}'", path);
     let mut child = Command::new("docker")
@@ -203,6 +208,18 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
         container_id: Some(cname.clone()),
     };

+    // 0. INPUT VALIDATION: reject shell-unsafe repo names and commit refs
+    if let Err(e) = validate_repo_name(&task.repo) {
+        result.error = Some(format!("Invalid repo name: {e}"));
+        return result;
+    }
+    if !task.base_commit.is_empty() {
+        if let Err(e) = validate_git_ref(&task.base_commit) {
+            result.error = Some(format!("Invalid base commit: {e}"));
+            return result;
+        }
+    }
+
     // 1. SETUP: start container
     info!(task_id = %task.id, "Starting container {}", cname);
     let agent_dir_abs = if let Ok(docker_dir) = std::env::var("DOCKER_AGENT_DIR") {
@@ -273,10 +290,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
     }

     // Clone repo (full clone for reliable checkout)
-    let clone_cmd = format!(
-        "git clone https://github.com/{}.git /repo 2>&1",
-        task.repo
-    );
+    let clone_cmd = format!("git clone https://github.com/{}.git /repo 2>&1", task.repo);
     let (code, _, err) = docker_exec(&cname, &clone_cmd, 600).await;
     if code != 0 {
         result.error = Some(format!("Clone failed: {}", truncate(&err, 500)));
@@ -326,6 +340,10 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
         serde_json::from_str::<Vec<TestFile>>(test_files_json)
     {
         for tf in &files {
+            if let Err(e) = validate_file_path(&tf.path) {
+                warn!(task_id = %task.id, path = %tf.path, error = %e, "Skipping test file with invalid path");
+                continue;
+            }
             let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", tf.path);
             docker_exec(&cname, &mkdir_cmd, 10).await;
             let write_result = docker_write_file(&cname, &tf.path, &tf.content).await;
diff --git a/src/swe/mod.rs b/src/swe/mod.rs
index 0e4f42c..4a25962 100644
--- a/src/swe/mod.rs
+++ b/src/swe/mod.rs
@@ -46,6 +46,138 @@ pub use workspace_validator::{ValidationOutcome, WorkspaceValidator};

 /// Default output directory for generated SWE workspaces.
 pub const DEFAULT_SWE_OUTPUT_DIR: &str = "./generated-swe";
+
+/// Validate a git ref (commit SHA, branch name) to prevent shell injection.
+///
+/// Accepts hex-only SHAs (short or full) and standard git ref names
+/// (alphanumeric, `/`, `.`, `-`, `_`). Rejects shell metacharacters,
+/// `..` sequences (path traversal), and refs starting with `-` (flag injection).
+pub fn validate_git_ref(s: &str) -> Result<(), anyhow::Error> {
+    if s.is_empty() {
+        anyhow::bail!("git ref is empty");
+    }
+    if s.len() > 256 {
+        anyhow::bail!("git ref too long ({} chars, max 256)", s.len());
+    }
+    if s.starts_with('-') {
+        anyhow::bail!(
+            "git ref '{}' must not start with '-' (could be interpreted as a flag)",
+            s
+        );
+    }
+    if s.contains("..") {
+        anyhow::bail!("git ref '{}' must not contain '..' (path traversal)", s);
+    }
+    for ch in s.chars() {
+        if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '.' | '-' | '_' | '~' | '^') {
+            anyhow::bail!(
+                "invalid character '{}' in git ref '{}': only alphanumeric, /, ., -, _, ~, ^ allowed",
+                ch,
+                s
+            );
+        }
+    }
+    Ok(())
+}
+
+/// Validate a GitHub repository name (`owner/repo`) to prevent shell injection.
+///
+/// Accepts the standard GitHub `owner/repo` format where both parts contain
+/// only alphanumeric characters, hyphens, underscores, and dots. Parts must
+/// not start with `.` or `-` to prevent path traversal and flag injection.
+pub fn validate_repo_name(s: &str) -> Result<(), anyhow::Error> {
+    if s.is_empty() {
+        anyhow::bail!("repository name is empty");
+    }
+    if s.len() > 256 {
+        anyhow::bail!("repository name too long ({} chars, max 256)", s.len());
+    }
+    let parts: Vec<&str> = s.split('/').collect();
+    if parts.len() != 2 {
+        anyhow::bail!(
+            "invalid repository name '{}': expected 'owner/repo' format",
+            s
+        );
+    }
+    for part in &parts {
+        if part.is_empty() {
+            anyhow::bail!(
+                "invalid repository name '{}': owner and repo must be non-empty",
+                s
+            );
+        }
+        if part.starts_with('.') || part.starts_with('-') {
+            anyhow::bail!(
+                "invalid repository name '{}': parts must not start with '.' or '-'",
+                s
+            );
+        }
+        for ch in part.chars() {
+            if !matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.') {
+                anyhow::bail!(
+                    "invalid character '{}' in repository name '{}': only alphanumeric, -, _, . allowed",
+                    ch,
+                    s
+                );
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Validate a file path used in shell commands inside Docker containers.
+///
+/// Rejects paths containing shell metacharacters, single/double quotes,
+/// null bytes, and `..` traversal sequences. This prevents shell injection
+/// when paths are interpolated into commands like `cat > '/repo/{path}'`.
+pub fn validate_file_path(path: &str) -> Result<(), anyhow::Error> {
+    if path.is_empty() {
+        anyhow::bail!("file path is empty");
+    }
+    if path.len() > 4096 {
+        anyhow::bail!("file path too long ({} chars, max 4096)", path.len());
+    }
+    if path.contains('\0') {
+        anyhow::bail!("file path contains null byte");
+    }
+    if path.contains("..") {
+        anyhow::bail!(
+            "file path '{}' contains '..' (path traversal not allowed)",
+            path
+        );
+    }
+    if path.starts_with('/') {
+        anyhow::bail!("file path '{}' must be relative (no leading '/')", path);
+    }
+    for ch in path.chars() {
+        if matches!(
+            ch,
+            '\'' | '"'
+                | '`'
+                | '$'
+                | '!'
+                | '&'
+                | '|'
+                | ';'
+                | '('
+                | ')'
+                | '{'
+                | '}'
+                | '<'
+                | '>'
+                | '\\'
+                | '\n'
+                | '\r'
+        ) {
+            anyhow::bail!(
+                "invalid character '{}' in file path '{}': shell metacharacters not allowed",
+                ch,
+                path
+            );
+        }
+    }
+    Ok(())
+}
+
 /// DataForge-compatible task format for SWE mined items.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SweTask {
@@ -213,3 +345,129 @@ impl SweTask {
         map
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn validate_git_ref_accepts_hex_sha() {
+        assert!(validate_git_ref("abc123def456").is_ok());
+        assert!(validate_git_ref("a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2").is_ok());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_empty() {
+        assert!(validate_git_ref("").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_accepts_branch_names() {
+        assert!(validate_git_ref("main").is_ok());
+        assert!(validate_git_ref("feature/my-branch").is_ok());
+        assert!(validate_git_ref("HEAD~1").is_ok());
+        assert!(validate_git_ref("v1.2.3").is_ok());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_shell_injection() {
+        assert!(validate_git_ref("abc123; rm -rf /").is_err());
+        assert!(validate_git_ref("$(whoami)").is_err());
+        assert!(validate_git_ref("`id`").is_err());
+        assert!(validate_git_ref("abc | cat /etc/passwd").is_err());
+        assert!(validate_git_ref("abc && echo pwned").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_double_dot() {
+        assert!(validate_git_ref("main..HEAD").is_err());
+        assert!(validate_git_ref("../../etc/passwd").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_leading_dash() {
+        assert!(validate_git_ref("--exec=whoami").is_err());
+        assert!(validate_git_ref("-n").is_err());
+    }
+
+    #[test]
+    fn validate_git_ref_rejects_too_long() {
+        let long_ref = "a".repeat(257);
+        assert!(validate_git_ref(&long_ref).is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_accepts_valid() {
+        assert!(validate_repo_name("owner/repo").is_ok());
+        assert!(validate_repo_name("my-org/my-repo").is_ok());
+        assert!(validate_repo_name("user123/project.js").is_ok());
+        assert!(validate_repo_name("Org_Name/Repo_Name").is_ok());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_shell_injection() {
+        assert!(validate_repo_name("owner/repo; rm -rf /").is_err());
+        assert!(validate_repo_name("$(whoami)/repo").is_err());
+        assert!(validate_repo_name("owner/repo && echo pwned").is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_invalid_format() {
+        assert!(validate_repo_name("").is_err());
+        assert!(validate_repo_name("noslash").is_err());
+        assert!(validate_repo_name("too/many/slashes").is_err());
+        assert!(validate_repo_name("/repo").is_err());
+        assert!(validate_repo_name("owner/").is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_leading_dot_or_dash() {
+        assert!(validate_repo_name(".hidden/repo").is_err());
+        assert!(validate_repo_name("owner/.repo").is_err());
+        assert!(validate_repo_name("-flag/repo").is_err());
+        assert!(validate_repo_name("owner/-repo").is_err());
+        assert!(validate_repo_name("..traversal/repo").is_err());
+    }
+
+    #[test]
+    fn validate_repo_name_rejects_too_long() {
+        let long_name = format!("{}/{}", "a".repeat(128), "b".repeat(128));
+        assert!(validate_repo_name(&long_name).is_err());
+    }
+
+    #[test]
+    fn validate_file_path_accepts_valid() {
+        assert!(validate_file_path("tests/test_foo.py").is_ok());
+        assert!(validate_file_path("src/main.rs").is_ok());
+        assert!(validate_file_path("a/b/c/d.txt").is_ok());
+        assert!(validate_file_path("file.txt").is_ok());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_traversal() {
+        assert!(validate_file_path("../etc/passwd").is_err());
+        assert!(validate_file_path("a/../../etc/passwd").is_err());
+        assert!(validate_file_path("..").is_err());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_shell_metacharacters() {
+        assert!(validate_file_path("file'; rm -rf /; echo '").is_err());
+        assert!(validate_file_path("file$(whoami)").is_err());
+        assert!(validate_file_path("file`id`").is_err());
+        assert!(validate_file_path("file|cat").is_err());
+        assert!(validate_file_path("file;ls").is_err());
+        assert!(validate_file_path("file&echo").is_err());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_empty_and_absolute() {
+        assert!(validate_file_path("").is_err());
+        assert!(validate_file_path("/etc/passwd").is_err());
+    }
+
+    #[test]
+    fn validate_file_path_rejects_null_byte() {
+        assert!(validate_file_path("file\0.txt").is_err());
+    }
+}
diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs
index 61a5dd8..653be13 100644
--- a/src/swe/test_generator.rs
+++ b/src/swe/test_generator.rs
@@ -701,7 +701,10 @@ impl TestGenerator {
         // Reset repo to clean state before applying patch (agent may have modified files)
         sandbox
-            .exec("cd /repo && git checkout -- . 2>/dev/null && git clean -fd 2>/dev/null", 30_000)
+            .exec(
+                "cd /repo && git checkout -- . 2>/dev/null && git clean -fd 2>/dev/null",
+                30_000,
+            )
             .await;

         // Re-write test files (they were cleaned by git checkout)
@@ -718,12 +721,18 @@ impl TestGenerator {
         }

         let apply_result = sandbox
-            .exec("cd /repo && git apply --allow-empty .swe_forge_pr.patch 2>&1", 30_000)
+            .exec(
+                "cd /repo && git apply --allow-empty .swe_forge_pr.patch 2>&1",
+                30_000,
+            )
             .await;
         if apply_result.exit_code != 0 {
             let apply_3way = sandbox
-                .exec("cd /repo && git apply --3way .swe_forge_pr.patch 2>&1", 30_000)
+                .exec(
+                    "cd /repo && git apply --3way .swe_forge_pr.patch 2>&1",
+                    30_000,
+                )
                 .await;
             if apply_3way.exit_code != 0 {
                 tracing::warn!(
@@ -730,7 +739,9 @@ impl TestGenerator {
                     exit = apply_3way.exit_code,
                     "Patch apply failed, rejecting task"
                 );
-                sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
+                sandbox
+                    .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+                    .await;
                 return ValidationResult::Rejected(
                     "PR patch could not be applied to the base commit. The test cannot be validated.".to_string()
                 );
@@ -743,10 +754,16 @@ impl TestGenerator {
         for cmd in &submit.fail_to_pass {
             let result = sandbox.exec(cmd, 60_000).await;
             if result.exit_code != 0 {
-                sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
-                sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await;
+                sandbox
+                    .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+                    .await;
+                sandbox
+                    .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
+                    .await;
                 for tf in test_files {
-                    let _ = sandbox.write_file(&tf.path, &tf.content).await;
+                    if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                        tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after f2p rejection");
+                    }
                 }
                 return ValidationResult::Rejected(format!(
                     "fail_to_pass test '{}' still FAILS after the PR patch is applied (exit={}, stderr={}). \
@@ -762,10 +779,16 @@ impl TestGenerator {
         for cmd in &submit.pass_to_pass {
             let result = sandbox.exec(cmd, 60_000).await;
             if result.exit_code != 0 {
-                sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
-                sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await;
+                sandbox
+                    .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+                    .await;
+                sandbox
+                    .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
+                    .await;
                 for tf in test_files {
-                    let _ = sandbox.write_file(&tf.path, &tf.content).await;
+                    if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                        tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after p2p rejection");
+                    }
                 }
                 return ValidationResult::Rejected(format!(
                     "pass_to_pass test '{}' FAILS after the PR patch (exit={}, stderr={}). \
@@ -778,10 +801,16 @@ impl TestGenerator {
         }

         // Revert to base commit for cleanliness
-        sandbox.exec("cd /repo && git checkout -- . 2>/dev/null", 10_000).await;
-        sandbox.exec("cd /repo && git clean -fd 2>/dev/null", 10_000).await;
+        sandbox
+            .exec("cd /repo && git checkout -- . 2>/dev/null", 10_000)
+            .await;
+        sandbox
+            .exec("cd /repo && git clean -fd 2>/dev/null", 10_000)
+            .await;
         for tf in test_files {
-            let _ = sandbox.write_file(&tf.path, &tf.content).await;
+            if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                tracing::warn!(path = %tf.path, error = %e, "Failed to restore test file after validation");
+            }
         }

         ValidationResult::Accepted
@@ -847,14 +876,17 @@ impl TestGenerator {
             .await;
         let output = if !result.stdout.is_empty() {
             // Parse JSON response from tool server
-            match serde_json::from_str::<serde_json::Value>(&result.stdout.trim()) {
+            match serde_json::from_str::<serde_json::Value>(result.stdout.trim()) {
                 Ok(v) => {
                     if let Some(err) = v.get("error").and_then(|e| e.as_str()) {
                         err.to_string()
                     } else if let Some(out) = v.get("output").and_then(|o| o.as_str()) {
                         let mut s = out.to_string();
                         if let Some(total) = v.get("total_lines").and_then(|t| t.as_u64()) {
-                            if v.get("truncated").and_then(|t| t.as_bool()).unwrap_or(false) {
+                            if v.get("truncated")
+                                .and_then(|t| t.as_bool())
+                                .unwrap_or(false)
+                            {
                                 s.push_str(&format!("\n\n[Showing {}/{} lines. Use offset/limit to see more.]", v.get("shown_lines").and_then(|s| s.as_u64()).unwrap_or(0), total));
                             } else {
diff --git a/src/swe/workspace_validator.rs b/src/swe/workspace_validator.rs
index b4619c8..190f280 100644
--- a/src/swe/workspace_validator.rs
+++ b/src/swe/workspace_validator.rs
@@ -84,6 +84,26 @@ impl WorkspaceValidator {
         sandbox: &DockerSandbox,
         task: &SweTask,
     ) -> Result<ValidationOutcome> {
+        // --- Install language runtime if needed ---
+        let runtime_install = match task.language.to_lowercase().as_str() {
+            "go" | "golang" => Some("apt-get update -qq && apt-get install -y -qq golang > /dev/null 2>&1"),
+            "javascript" | "typescript" | "js" | "ts" => Some("apt-get update -qq && apt-get install -y -qq nodejs npm > /dev/null 2>&1"),
+            "rust" => Some("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y > /dev/null 2>&1 && . $HOME/.cargo/env"),
+            "java" => Some("apt-get update -qq && apt-get install -y -qq default-jdk maven > /dev/null 2>&1"),
+            _ => None,
+        };
+        if let Some(cmd) = runtime_install {
+            let rt_result = sandbox.exec(&format!("{} 2>&1", cmd), 300_000).await;
+            if rt_result.exit_code != 0 {
+                tracing::warn!(
+                    task_id = %task.id,
+                    language = %task.language,
+                    exit = rt_result.exit_code,
+                    "Runtime install failed during validation (continuing)"
+                );
+            }
+        }
+
         // --- Install ---
         if let Some(install_cmd) = task.install_config.get("install") {
             if !install_cmd.is_empty() && !install_cmd.starts_with('#') {
@@ -191,7 +211,9 @@ impl WorkspaceValidator {
         if let Some(test_files_json) = task.meta.get("test_files") {
             if let Ok(files) = serde_json::from_str::<Vec<TestFile>>(test_files_json) {
                 for tf in &files {
-                    let _ = sandbox.write_file(&tf.path, &tf.content).await;
+                    if let Err(e) = sandbox.write_file(&tf.path, &tf.content).await {
+                        tracing::warn!(path = %tf.path, error = %e, "Failed to re-write test file after patch");
+                    }
                 }
             }
         }
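
Taken together, the `validate_git_ref`, `validate_repo_name`, and `validate_file_path` functions added in `src/swe/mod.rs` give every call site the same guard: validate untrusted PR metadata first, interpolate it into a shell string second. A minimal sketch of that call pattern (the `validate_*` functions are from this diff; the crate path `swe_forge` and the surrounding function are assumptions for illustration):

```rust
// Hypothetical call site; the crate name `swe_forge` is an assumption.
use swe_forge::swe::{validate_git_ref, validate_repo_name};

/// Build the clone-and-checkout command only after both inputs pass validation.
fn build_clone_cmd(repo: &str, base_commit: &str) -> anyhow::Result<String> {
    validate_repo_name(repo)?; // rejects e.g. "owner/repo; rm -rf /"
    if !base_commit.is_empty() {
        validate_git_ref(base_commit)?; // rejects "$(whoami)", "--exec=...", "a..b"
    }
    Ok(format!(
        "git clone https://github.com/{}.git /repo 2>&1 && git -C /repo checkout {}",
        repo, base_commit
    ))
}
```

This validate-before-format ordering mirrors what `DockerSandbox::start` and the harness's step 0 now do, so new call sites can follow the same shape rather than re-deriving their own escaping rules.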