Skip to content

Commit 7fbc48c

Browse files
sjarmakclaude
andcommitted
chore: promote curl_fix_validate MCP run, regenerate MANIFEST + docs
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f72717a commit 7fbc48c

File tree

23 files changed

+5261
-7
lines changed

23 files changed

+5261
-7
lines changed

docs/official_results/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
This bundle is generated from `runs/official/` and includes only valid scored tasks (`passed`/`failed` with numeric reward).
44

5-
Generated: `2026-03-01T19:45:06.820720+00:00`
5+
Generated: `2026-03-01T20:06:11.026433+00:00`
66

77
## Local Browse
88

docs/official_results/data/official_results.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201500,7 +201500,7 @@
201500201500
"audit_page": "audits/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001.json",
201501201501
"benchmark_github": "https://github.com/sourcegraph/CodeContextBench/blob/main/benchmarks/ccb_test/curl-security-review-001",
201502201502
"benchmark_path": "benchmarks/ccb_test/curl-security-review-001",
201503-
"benchmark_task_path": null,
201503+
"benchmark_task_path": "/tmp/sgonly_curl-security-review-001",
201504201504
"bundled_trace_paths": {
201505201505
"trajectory": "traces/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001/trajectory.json",
201506201506
"transcript": "traces/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001/claude-code.txt"
@@ -220249,7 +220249,7 @@
220249220249
"wall_clock_seconds": 105.327732
220250220250
}
220251220251
],
220252-
"generated_at": "2026-03-01T19:45:06.475567+00:00",
220252+
"generated_at": "2026-03-01T20:06:10.542100+00:00",
220253220253
"repo_blob_base": "https://github.com/sourcegraph/CodeContextBench/blob/main",
220254220254
"run_count": 224,
220255220255
"run_summaries": [
@@ -307138,7 +307138,7 @@
307138307138
"audit_page": "audits/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001.json",
307139307139
"benchmark_github": "https://github.com/sourcegraph/CodeContextBench/blob/main/benchmarks/ccb_test/curl-security-review-001",
307140307140
"benchmark_path": "benchmarks/ccb_test/curl-security-review-001",
307141-
"benchmark_task_path": null,
307141+
"benchmark_task_path": "/tmp/sgonly_curl-security-review-001",
307142307142
"bundled_trace_paths": {
307143307143
"trajectory": "traces/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001/trajectory.json",
307144307144
"transcript": "traces/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001/claude-code.txt"

docs/official_results/tasks/test_haiku_20260224_011816--mcp-remote-direct--sgonly_curl-security-review-001.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ <h2>Task Information</h2>
4040
<div class="metric"><div class="k">Agent</div><div class="v" style="font-size:16px">claude-code</div></div>
4141
<div class="metric"><div class="k">Model</div><div class="v" style="font-size:16px">claude-haiku-4-5-20251001</div></div>
4242
</div>
43-
<p class="meta" style="margin-top:10px">Benchmark path: <code>-</code></p>
43+
<p class="meta" style="margin-top:10px">Benchmark path: <code>/tmp/sgonly_curl-security-review-001</code></p>
4444
<details>
4545
<summary>Task instruction sent to agent</summary>
4646
<pre># IMPORTANT: Source Code Access

docs/ops/SCRIPT_INDEX.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
4040
- `scripts/compute_retrieval_metrics.py` - Analysis/comparison script for compute retrieval metrics.
4141
- `scripts/cost_breakdown_analysis.py` - Analysis/comparison script for cost breakdown analysis.
4242
- `scripts/cost_report.py` - Aggregates token and cost metrics per run, suite, and config.
43+
- `scripts/doe_variance_analysis.py` - Analysis/comparison script for doe variance analysis.
4344
- `scripts/ds_audit.py` - Analysis/comparison script for ds audit.
4445
- `scripts/economic_analysis.py` - Analysis/comparison script for economic analysis.
4546
- `scripts/failure_analysis.py` - Analysis/comparison script for failure analysis.

runs/official/MANIFEST.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"description": "Canonical run manifest for CodeContextBench evaluation",
3-
"generated": "2026-03-01T19:44:25.410250+00:00",
3+
"generated": "2026-03-01T20:05:16.926750+00:00",
44
"total_tasks": 758,
55
"total_runs": 76,
66
"runs": {
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
{
2+
"job_name": "curl-fix-validate-mcp",
3+
"jobs_dir": "runs/staging/curl_fix_validate",
4+
"n_attempts": 1,
5+
"timeout_multiplier": 10.0,
6+
"agent_timeout_multiplier": null,
7+
"verifier_timeout_multiplier": null,
8+
"agent_setup_timeout_multiplier": null,
9+
"environment_build_timeout_multiplier": null,
10+
"debug": false,
11+
"orchestrator": {
12+
"type": "local",
13+
"n_concurrent_trials": 1,
14+
"quiet": false,
15+
"retry": {
16+
"max_retries": 0,
17+
"include_exceptions": null,
18+
"exclude_exceptions": [
19+
"RewardFileNotFoundError",
20+
"VerifierTimeoutError",
21+
"VerifierOutputParseError",
22+
"RewardFileEmptyError",
23+
"AgentTimeoutError"
24+
],
25+
"wait_multiplier": 1.0,
26+
"min_wait_sec": 1.0,
27+
"max_wait_sec": 60.0
28+
},
29+
"kwargs": {}
30+
},
31+
"environment": {
32+
"type": "docker",
33+
"import_path": null,
34+
"force_build": false,
35+
"delete": true,
36+
"override_cpus": null,
37+
"override_memory_mb": null,
38+
"override_storage_mb": null,
39+
"override_gpus": null,
40+
"suppress_override_warnings": false,
41+
"kwargs": {}
42+
},
43+
"verifier": {
44+
"override_timeout_sec": null,
45+
"max_timeout_sec": null,
46+
"disable": false
47+
},
48+
"metrics": [],
49+
"agents": [
50+
{
51+
"name": null,
52+
"import_path": "agents.claude_baseline_agent:BaselineClaudeCodeAgent",
53+
"model_name": "anthropic/claude-haiku-4-5-20251001",
54+
"override_timeout_sec": null,
55+
"override_setup_timeout_sec": null,
56+
"max_timeout_sec": null,
57+
"kwargs": {},
58+
"env": {}
59+
}
60+
],
61+
"datasets": [],
62+
"tasks": [
63+
{
64+
"path": "/tmp/sgonly_curl-security-review-001",
65+
"git_url": null,
66+
"git_commit_id": null,
67+
"overwrite": false,
68+
"download_dir": null,
69+
"source": null
70+
}
71+
],
72+
"artifacts": []
73+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"id": "d1b85392-0643-4100-8b41-17aaa9844d21",
3+
"started_at": "2026-03-01T19:59:13.597193",
4+
"finished_at": "2026-03-01T20:03:16.821642",
5+
"n_total_trials": 1,
6+
"stats": {
7+
"n_trials": 1,
8+
"n_errors": 0,
9+
"evals": {
10+
"claude-code__claude-haiku-4-5-20251001__adhoc": {
11+
"n_trials": 1,
12+
"n_errors": 0,
13+
"metrics": [
14+
{
15+
"mean": 0.51
16+
}
17+
],
18+
"reward_stats": {
19+
"reward": {
20+
"0.51": [
21+
"sgonly_curl-security-review-001__pyVKGJB"
22+
]
23+
}
24+
},
25+
"exception_stats": {}
26+
}
27+
}
28+
}
29+
}

runs/official/curl_fix_validate/curl-fix-validate-mcp/sgonly_curl-security-review-001__pyVKGJB/agent/claude-code.txt

Lines changed: 157 additions & 0 deletions
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
mkdir -p $CLAUDE_CONFIG_DIR/debug $CLAUDE_CONFIG_DIR/projects/-app $CLAUDE_CONFIG_DIR/shell-snapshots $CLAUDE_CONFIG_DIR/statsig $CLAUDE_CONFIG_DIR/todos && if [ -d ~/.claude/skills ]; then cp -r ~/.claude/skills $CLAUDE_CONFIG_DIR/skills 2>/dev/null || true; fi
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0

0 commit comments

Comments
 (0)