Skip to content

Commit 2ff565d

Browse files
sjarmak and claude committed
feat: variance gap 147/150 — promote 73 batches, 3 code reviews remain
Root cause: promote_run.py DIR_PREFIX_TO_SUITE was missing SDLC suite prefixes, blocking promotion of valid runs. Fixed, promoted all READY staging batches, and ran 3 passes of targeted gap reruns.

Results:
- 8/9 SDLC suites DONE (debug, design, document, feature, fix, refactor, secure, understand — all at 3+ paired runs per task)
- 3 code review tasks remain at 2 paired: calcom, curl-security, terraform (these are MCP-heavy tasks that need one more Sourcegraph-based run each)
- Coverage: 147/150 SDLC tasks = 98%

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent af29675 commit 2ff565d

File tree

2 files changed

+20
-172
lines changed

2 files changed

+20
-172
lines changed

configs/variance_reruns/variance_gap_all_sdlc.json

Lines changed: 9 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,32 @@
11
{
22
"metadata": {
3-
"title": "Variance rerun: ALL SDLC gap tasks (9 tasks)",
3+
"title": "Variance rerun: ALL SDLC gap tasks (3 tasks)",
44
"description": "Combined rerun config for all SDLC tasks with < 3 paired passes.",
55
"generated_date": "2026-03-01",
6-
"total_tasks": 9
6+
"total_tasks": 3
77
},
88
"methodology": {
99
"sdlc_suites": [
10-
"ccb_fix",
1110
"ccb_test"
1211
]
1312
},
1413
"statistics": {
15-
"total_tasks": 9,
14+
"total_tasks": 3,
1615
"per_suite": {
17-
"ccb_fix": 2,
18-
"ccb_test": 7
16+
"ccb_test": 3
1917
}
2018
},
2119
"tasks": [
22-
{
23-
"task_id": "pytorch-release-210-fix-001",
24-
"benchmark": "ccb_fix",
25-
"task_dir": "ccb_fix/pytorch-release-210-fix-001",
26-
"language": "cpp",
27-
"difficulty": "hard",
28-
"current_bl_runs": 4,
29-
"current_mcp_runs": 2,
30-
"current_paired": 2,
31-
"runs_needed": 1,
32-
"sdlc_phase": "Implementation (bug fix)",
33-
"category": "fix",
34-
"repo": "pytorch",
35-
"mcp_benefit_score": 0.85
36-
},
37-
{
38-
"task_id": "pytorch-relu-gelu-fusion-fix-001",
39-
"benchmark": "ccb_fix",
40-
"task_dir": "ccb_fix/pytorch-relu-gelu-fusion-fix-001",
41-
"language": "cpp",
42-
"difficulty": "hard",
43-
"current_bl_runs": 4,
44-
"current_mcp_runs": 2,
45-
"current_paired": 2,
46-
"runs_needed": 1,
47-
"sdlc_phase": "Implementation (bug fix)",
48-
"category": "fix",
49-
"repo": "pytorch/pytorch",
50-
"mcp_benefit_score": 0.85
51-
},
52-
{
53-
"task_id": "aspnetcore-code-review-001",
54-
"benchmark": "ccb_test",
55-
"task_dir": "ccb_test/aspnetcore-code-review-001",
56-
"language": "csharp",
57-
"difficulty": "hard",
58-
"current_bl_runs": 4,
59-
"current_mcp_runs": 2,
60-
"current_paired": 2,
61-
"runs_needed": 1,
62-
"sdlc_phase": "Testing & QA",
63-
"category": "code-review",
64-
"repo": "dotnet/aspnetcore",
65-
"mcp_benefit_score": 0.84
66-
},
6720
{
6821
"task_id": "calcom-code-review-001",
6922
"benchmark": "ccb_test",
7023
"task_dir": "ccb_test/calcom-code-review-001",
7124
"language": "typescript",
7225
"difficulty": "hard",
7326
"current_bl_runs": 6,
74-
"current_mcp_runs": 1,
75-
"current_paired": 1,
76-
"runs_needed": 2,
27+
"current_mcp_runs": 2,
28+
"current_paired": 2,
29+
"runs_needed": 1,
7730
"sdlc_phase": "Testing & QA",
7831
"category": "code-review",
7932
"repo": "calcom/cal.com",
@@ -86,72 +39,27 @@
8639
"language": "c",
8740
"difficulty": "hard",
8841
"current_bl_runs": 7,
89-
"current_mcp_runs": 1,
90-
"current_paired": 1,
91-
"runs_needed": 2,
92-
"sdlc_phase": "Testing & QA",
93-
"category": "code_review",
94-
"repo": "curl/curl",
95-
"mcp_benefit_score": 0.72
96-
},
97-
{
98-
"task_id": "envoy-code-review-001",
99-
"benchmark": "ccb_test",
100-
"task_dir": "ccb_test/envoy-code-review-001",
101-
"language": "cpp",
102-
"difficulty": "hard",
103-
"current_bl_runs": 5,
10442
"current_mcp_runs": 2,
10543
"current_paired": 2,
10644
"runs_needed": 1,
10745
"sdlc_phase": "Testing & QA",
10846
"category": "code_review",
109-
"repo": "envoyproxy/envoy",
47+
"repo": "curl/curl",
11048
"mcp_benefit_score": 0.72
11149
},
112-
{
113-
"task_id": "ghost-code-review-001",
114-
"benchmark": "ccb_test",
115-
"task_dir": "ccb_test/ghost-code-review-001",
116-
"language": "javascript",
117-
"difficulty": "hard",
118-
"current_bl_runs": 4,
119-
"current_mcp_runs": 2,
120-
"current_paired": 2,
121-
"runs_needed": 1,
122-
"sdlc_phase": "Testing & QA",
123-
"category": "code-review",
124-
"repo": "agentic-review-benchmarks/benchmark-pr-mapping",
125-
"mcp_benefit_score": 0.82
126-
},
12750
{
12851
"task_id": "terraform-code-review-001",
12952
"benchmark": "ccb_test",
13053
"task_dir": "ccb_test/terraform-code-review-001",
13154
"language": "go",
13255
"difficulty": "hard",
13356
"current_bl_runs": 6,
134-
"current_mcp_runs": 1,
135-
"current_paired": 1,
136-
"runs_needed": 2,
137-
"sdlc_phase": "Testing & QA",
138-
"category": "code_review",
139-
"repo": "hashicorp/terraform",
140-
"mcp_benefit_score": 0.72
141-
},
142-
{
143-
"task_id": "vscode-code-review-001",
144-
"benchmark": "ccb_test",
145-
"task_dir": "ccb_test/vscode-code-review-001",
146-
"language": "typescript",
147-
"difficulty": "hard",
148-
"current_bl_runs": 5,
14957
"current_mcp_runs": 2,
15058
"current_paired": 2,
15159
"runs_needed": 1,
15260
"sdlc_phase": "Testing & QA",
15361
"category": "code_review",
154-
"repo": "microsoft/vscode",
62+
"repo": "hashicorp/terraform",
15563
"mcp_benefit_score": 0.72
15664
}
15765
]

configs/variance_reruns/variance_gap_ccb_test.json

Lines changed: 11 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,34 @@
11
{
22
"metadata": {
3-
"title": "Variance rerun: ccb_test gap tasks (7 tasks, target 3 pairs)",
3+
"title": "Variance rerun: ccb_test gap tasks (3 tasks, target 3 pairs)",
44
"description": "Targeted rerun for ccb_test tasks with < 3 paired passes. Generated by variance_gap_analysis.py.",
55
"generated_date": "2026-03-01",
6-
"total_tasks": 7,
7-
"max_concurrency_needed": 2,
8-
"note": "Run with --concurrency 2 to fill all gaps in one batch. Or run with --concurrency 1 multiple times."
6+
"total_tasks": 3,
7+
"max_concurrency_needed": 1,
8+
"note": "Run with --concurrency 1 to fill all gaps in one batch. Or run with --concurrency 1 multiple times."
99
},
1010
"methodology": {
1111
"sdlc_suites": [
1212
"ccb_test"
1313
]
1414
},
1515
"statistics": {
16-
"total_tasks": 7,
16+
"total_tasks": 3,
1717
"per_suite": {
18-
"ccb_test": 7
18+
"ccb_test": 3
1919
}
2020
},
2121
"tasks": [
22-
{
23-
"task_id": "aspnetcore-code-review-001",
24-
"benchmark": "ccb_test",
25-
"task_dir": "ccb_test/aspnetcore-code-review-001",
26-
"language": "csharp",
27-
"difficulty": "hard",
28-
"current_bl_runs": 4,
29-
"current_mcp_runs": 2,
30-
"current_paired": 2,
31-
"runs_needed": 1,
32-
"sdlc_phase": "Testing & QA",
33-
"category": "code-review",
34-
"repo": "dotnet/aspnetcore",
35-
"mcp_benefit_score": 0.84
36-
},
3722
{
3823
"task_id": "calcom-code-review-001",
3924
"benchmark": "ccb_test",
4025
"task_dir": "ccb_test/calcom-code-review-001",
4126
"language": "typescript",
4227
"difficulty": "hard",
4328
"current_bl_runs": 6,
44-
"current_mcp_runs": 1,
45-
"current_paired": 1,
46-
"runs_needed": 2,
29+
"current_mcp_runs": 2,
30+
"current_paired": 2,
31+
"runs_needed": 1,
4732
"sdlc_phase": "Testing & QA",
4833
"category": "code-review",
4934
"repo": "calcom/cal.com",
@@ -56,72 +41,27 @@
5641
"language": "c",
5742
"difficulty": "hard",
5843
"current_bl_runs": 7,
59-
"current_mcp_runs": 1,
60-
"current_paired": 1,
61-
"runs_needed": 2,
62-
"sdlc_phase": "Testing & QA",
63-
"category": "code_review",
64-
"repo": "curl/curl",
65-
"mcp_benefit_score": 0.72
66-
},
67-
{
68-
"task_id": "envoy-code-review-001",
69-
"benchmark": "ccb_test",
70-
"task_dir": "ccb_test/envoy-code-review-001",
71-
"language": "cpp",
72-
"difficulty": "hard",
73-
"current_bl_runs": 5,
7444
"current_mcp_runs": 2,
7545
"current_paired": 2,
7646
"runs_needed": 1,
7747
"sdlc_phase": "Testing & QA",
7848
"category": "code_review",
79-
"repo": "envoyproxy/envoy",
49+
"repo": "curl/curl",
8050
"mcp_benefit_score": 0.72
8151
},
82-
{
83-
"task_id": "ghost-code-review-001",
84-
"benchmark": "ccb_test",
85-
"task_dir": "ccb_test/ghost-code-review-001",
86-
"language": "javascript",
87-
"difficulty": "hard",
88-
"current_bl_runs": 4,
89-
"current_mcp_runs": 2,
90-
"current_paired": 2,
91-
"runs_needed": 1,
92-
"sdlc_phase": "Testing & QA",
93-
"category": "code-review",
94-
"repo": "agentic-review-benchmarks/benchmark-pr-mapping",
95-
"mcp_benefit_score": 0.82
96-
},
9752
{
9853
"task_id": "terraform-code-review-001",
9954
"benchmark": "ccb_test",
10055
"task_dir": "ccb_test/terraform-code-review-001",
10156
"language": "go",
10257
"difficulty": "hard",
10358
"current_bl_runs": 6,
104-
"current_mcp_runs": 1,
105-
"current_paired": 1,
106-
"runs_needed": 2,
107-
"sdlc_phase": "Testing & QA",
108-
"category": "code_review",
109-
"repo": "hashicorp/terraform",
110-
"mcp_benefit_score": 0.72
111-
},
112-
{
113-
"task_id": "vscode-code-review-001",
114-
"benchmark": "ccb_test",
115-
"task_dir": "ccb_test/vscode-code-review-001",
116-
"language": "typescript",
117-
"difficulty": "hard",
118-
"current_bl_runs": 5,
11959
"current_mcp_runs": 2,
12060
"current_paired": 2,
12161
"runs_needed": 1,
12262
"sdlc_phase": "Testing & QA",
12363
"category": "code_review",
124-
"repo": "microsoft/vscode",
64+
"repo": "hashicorp/terraform",
12565
"mcp_benefit_score": 0.72
12666
}
12767
]

0 commit comments

Comments
 (0)