sourcegraph
diff --git a/docs/official_results/README.md b/docs/official_results/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/official_results/data/official_results.json b/docs/official_results/data/official_results.json
Lines changed: 3 additions & 3 deletions
diff --git a/docs/official_results/tasks/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001.html b/docs/official_results/tasks/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001.html
Lines changed: 1 addition & 1 deletion
diff --git a/docs/ops/SCRIPT_INDEX.md b/docs/ops/SCRIPT_INDEX.md
Lines changed: 1 addition & 0 deletions
diff --git a/runs/official/MANIFEST.json b/runs/official/MANIFEST.json
Lines changed: 1 addition & 1 deletion
diff --git a/runs/official/curl_fix_validate/curl-fix-validate-mcp/config.json b/runs/official/curl_fix_validate/curl-fix-validate-mcp/config.json
Lines changed: 73 additions & 0 deletions
diff --git a/runs/official/curl_fix_validate/curl-fix-validate-mcp/result.json b/runs/official/curl_fix_validate/curl-fix-validate-mcp/result.json
Lines changed: 29 additions & 0 deletions
diff --git a/runs/official/curl_fix_validate/curl-fix-validate-mcp/sgonly_curl-security-review-001__pyVKGJB/agent/claude-code.txt b/runs/official/curl_fix_validate/curl-fix-validate-mcp/sgonly_curl-security-review-001__pyVKGJB/agent/claude-code.txt
Lines changed: 157 additions & 0 deletions
diff --git a/runs/official/curl_fix_validate/curl-fix-validate-mcp/sgonly_curl-security-review-001__pyVKGJB/agent/command-0/command.txt b/runs/official/curl_fix_validate/curl-fix-validate-mcp/sgonly_curl-security-review-001__pyVKGJB/agent/command-0/command.txt
Lines changed: 1 addition & 0 deletions
diff --git a/runs/official/curl_fix_validate/curl-fix-validate-mcp/sgonly_curl-security-review-001__pyVKGJB/agent/command-0/return-code.txt b/runs/official/curl_fix_validate/curl-fix-validate-mcp/sgonly_curl-security-review-001__pyVKGJB/agent/command-0/return-code.txt
Lines changed: 1 addition & 0 deletions
@@ -2,7 +2,7 @@
 
 This bundle is generated from `runs/official/` and includes only valid scored tasks (`passed`/`failed` with numeric reward).
 
-Generated: `2026-03-01T19:45:06.820720+00:00`
+Generated: `2026-03-01T20:06:11.026433+00:00`
 
 ## Local Browse
 
 
@@ -201500,7 +201500,7 @@
       "audit_page": "audits/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001.json",
       "benchmark_github": "https://github.com/sourcegraph/CodeContextBench/blob/main/benchmarks/ccb_test/curl-security-review-001",
       "benchmark_path": "benchmarks/ccb_test/curl-security-review-001",
-      "benchmark_task_path": null,
+      "benchmark_task_path": "/tmp/sgonly_curl-security-review-001",
       "bundled_trace_paths": {
         "trajectory": "traces/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001/trajectory.json",
         "transcript": "traces/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001/claude-code.txt"
@@ -220249,7 +220249,7 @@
       "wall_clock_seconds": 105.327732
     }
   ],
-  "generated_at": "2026-03-01T19:45:06.475567+00:00",
+  "generated_at": "2026-03-01T20:06:10.542100+00:00",
   "repo_blob_base": "https://github.com/sourcegraph/CodeContextBench/blob/main",
   "run_count": 224,
   "run_summaries": [
@@ -307138,7 +307138,7 @@
       "audit_page": "audits/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001.json",
       "benchmark_github": "https://github.com/sourcegraph/CodeContextBench/blob/main/benchmarks/ccb_test/curl-security-review-001",
       "benchmark_path": "benchmarks/ccb_test/curl-security-review-001",
-      "benchmark_task_path": null,
+      "benchmark_task_path": "/tmp/sgonly_curl-security-review-001",
       "bundled_trace_paths": {
         "trajectory": "traces/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001/trajectory.json",
         "transcript": "traces/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001/claude-code.txt"
 
@@ -40,7 +40,7 @@ <h2>Task Information</h2>
         <div class="metric"><div class="k">Agent</div><div class="v" style="font-size:16px">claude-code</div></div>
         <div class="metric"><div class="k">Model</div><div class="v" style="font-size:16px">claude-haiku-4-5-20251001</div></div>
       </div>
-      <p class="meta" style="margin-top:10px">Benchmark path: <code>-</code></p>
+      <p class="meta" style="margin-top:10px">Benchmark path: <code>/tmp/sgonly_curl-security-review-001</code></p>
       <details>
         <summary>Task instruction sent to agent</summary>
         <pre># IMPORTANT: Source Code Access
 
@@ -40,6 +40,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
 - `scripts/compute_retrieval_metrics.py` - Analysis/comparison script for compute retrieval metrics.
 - `scripts/cost_breakdown_analysis.py` - Analysis/comparison script for cost breakdown analysis.
 - `scripts/cost_report.py` - Aggregates token and cost metrics per run, suite, and config.
+- `scripts/doe_variance_analysis.py` - Analysis/comparison script for doe variance analysis.
 - `scripts/ds_audit.py` - Analysis/comparison script for ds audit.
 - `scripts/economic_analysis.py` - Analysis/comparison script for economic analysis.
 - `scripts/failure_analysis.py` - Analysis/comparison script for failure analysis.
 
@@ -1,6 +1,6 @@
 {
   "description": "Canonical run manifest for CodeContextBench evaluation",
-  "generated": "2026-03-01T19:44:25.410250+00:00",
+  "generated": "2026-03-01T20:05:16.926750+00:00",
   "total_tasks": 758,
   "total_runs": 76,
   "runs": {
 
@@ -0,0 +1,73 @@
+{
+    "job_name": "curl-fix-validate-mcp",
+    "jobs_dir": "runs/staging/curl_fix_validate",
+    "n_attempts": 1,
+    "timeout_multiplier": 10.0,
+    "agent_timeout_multiplier": null,
+    "verifier_timeout_multiplier": null,
+    "agent_setup_timeout_multiplier": null,
+    "environment_build_timeout_multiplier": null,
+    "debug": false,
+    "orchestrator": {
+        "type": "local",
+        "n_concurrent_trials": 1,
+        "quiet": false,
+        "retry": {
+            "max_retries": 0,
+            "include_exceptions": null,
+            "exclude_exceptions": [
+                "RewardFileNotFoundError",
+                "VerifierTimeoutError",
+                "VerifierOutputParseError",
+                "RewardFileEmptyError",
+                "AgentTimeoutError"
+            ],
+            "wait_multiplier": 1.0,
+            "min_wait_sec": 1.0,
+            "max_wait_sec": 60.0
+        },
+        "kwargs": {}
+    },
+    "environment": {
+        "type": "docker",
+        "import_path": null,
+        "force_build": false,
+        "delete": true,
+        "override_cpus": null,
+        "override_memory_mb": null,
+        "override_storage_mb": null,
+        "override_gpus": null,
+        "suppress_override_warnings": false,
+        "kwargs": {}
+    },
+    "verifier": {
+        "override_timeout_sec": null,
+        "max_timeout_sec": null,
+        "disable": false
+    },
+    "metrics": [],
+    "agents": [
+        {
+            "name": null,
+            "import_path": "agents.claude_baseline_agent:BaselineClaudeCodeAgent",
+            "model_name": "anthropic/claude-haiku-4-5-20251001",
+            "override_timeout_sec": null,
+            "override_setup_timeout_sec": null,
+            "max_timeout_sec": null,
+            "kwargs": {},
+            "env": {}
+        }
+    ],
+    "datasets": [],
+    "tasks": [
+        {
+            "path": "/tmp/sgonly_curl-security-review-001",
+            "git_url": null,
+            "git_commit_id": null,
+            "overwrite": false,
+            "download_dir": null,
+            "source": null
+        }
+    ],
+    "artifacts": []
+}
@@ -0,0 +1,29 @@
+{
+    "id": "d1b85392-0643-4100-8b41-17aaa9844d21",
+    "started_at": "2026-03-01T19:59:13.597193",
+    "finished_at": "2026-03-01T20:03:16.821642",
+    "n_total_trials": 1,
+    "stats": {
+        "n_trials": 1,
+        "n_errors": 0,
+        "evals": {
+            "claude-code__claude-haiku-4-5-20251001__adhoc": {
+                "n_trials": 1,
+                "n_errors": 0,
+                "metrics": [
+                    {
+                        "mean": 0.51
+                    }
+                ],
+                "reward_stats": {
+                    "reward": {
+                        "0.51": [
+                            "sgonly_curl-security-review-001__pyVKGJB"
+                        ]
+                    }
+                },
+                "exception_stats": {}
+            }
+        }
+    }
+}
@@ -0,0 +1 @@
+mkdir -p $CLAUDE_CONFIG_DIR/debug $CLAUDE_CONFIG_DIR/projects/-app $CLAUDE_CONFIG_DIR/shell-snapshots $CLAUDE_CONFIG_DIR/statsig $CLAUDE_CONFIG_DIR/todos && if [ -d ~/.claude/skills ]; then cp -r ~/.claude/skills $CLAUDE_CONFIG_DIR/skills 2>/dev/null || true; fi
@@ -0,0 +1 @@
+0
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"description": "Canonical run manifest for CodeContextBench evaluation",`
`3`		`- "generated": "2026-03-01T19:44:25.410250+00:00",`
	`3`	`+ "generated": "2026-03-01T20:05:16.926750+00:00",`
`4`	`4`	`"total_tasks": 758,`
`5`	`5`	`"total_runs": 76,`
`6`	`6`	`"runs": {`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+mkdir -p $CLAUDE_CONFIG_DIR/debug $CLAUDE_CONFIG_DIR/projects/-app $CLAUDE_CONFIG_DIR/shell-snapshots $CLAUDE_CONFIG_DIR/statsig $CLAUDE_CONFIG_DIR/todos && if [ -d ~/.claude/skills ]; then cp -r ~/.claude/skills $CLAUDE_CONFIG_DIR/skills 2>/dev/null || true; fi`