Skip to content

Commit 6dd3f87

Browse files
sjarmak and claude committed
docs: add V2 technical report with 370-task multi-run results
- Fix stale statistics in selected_benchmark_tasks.json (total_tasks 390→370, tasks_per_benchmark corrected to match actual per_suite counts)
- Rerun comprehensive analysis, eval report, and IR retrieval pipeline (normalize + compute metrics + impact analysis) on all 429 official runs
- Create TECHNICAL_REPORT_V2.md with updated results:
  - 370 tasks (150 SDLC + 220 Org), 4132 individual results, 3+ runs/task
  - Overall MCP delta: +0.025 (CI: [+0.008, +0.042])
  - SDLC delta: +0.014 (CI spans zero, not significant)
  - Org delta: +0.032 (CI: [+0.013, +0.053])
  - IR metrics: 1745 computable tasks, file recall 0.390
- V1→V2 comparison table documenting how multi-run averaging reduces noise and narrows CIs vs single-trial V1 data
- Architecture diagram updated for 9 SDLC + 11 Org suites
- Execution infrastructure updated for Daytona cloud, 3 accounts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8d0b543 commit 6dd3f87

File tree

5 files changed

+98430
-965
lines changed

5 files changed

+98430
-965
lines changed

configs/selected_benchmark_tasks.json

Lines changed: 67 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
{
22
"metadata": {
3-
"title": "CodeContextBench Selected Benchmark Tasks (SDLC Suite Reorganization)",
4-
"version": "2.0",
3+
"title": "CodeScaleBench Selected Benchmark Tasks",
4+
"version": "3.0",
55
"generated_by": "SDLC suite migration from migration_map.json",
66
"generated_date": "2026-02-18",
77
"total_available": 835,
88
"total_selected": 370,
99
"migration_source": "migration_map.json (157 mapped tasks across 8 SDLC suites)",
1010
"target_total": 370,
1111
"target_note": "150 SDLC tasks (9 suites, Neyman-optimal) + 220 MCP-unique tasks (11 suites, Neyman-optimal) = 370 active.",
12-
"last_updated": "2026-03-01",
12+
"last_updated": "2026-03-03",
1313
"note": "DOE-driven rebalance: 150 SDLC (Neyman-optimal) + 220 MCP-unique (Neyman-optimal). MCP-unique rebalance: 13 scaffolded (IDs 272-284), 7 promoted from onboarding_extra, 20 low-IV tasks moved to benchmarks/backups/*_doe_trim/.",
1414
"per_suite": {
1515
"csb_sdlc_debug": 18,
@@ -103,73 +103,84 @@
103103
},
104104
"statistics": {
105105
"tasks_per_sdlc_phase": {
106-
"Analysis": 8,
107-
"Architecture & Design": 1,
108-
"Compliance audit": 2,
109-
"Cross-org discovery": 2,
110-
"Cross-repo tracing": 3,
111-
"Debugging": 5,
112-
"Documentation": 16,
113-
"Implementation (bug fix)": 1,
114-
"Implementation (feature)": 26,
106+
"unknown": 61,
107+
"Requirements & Discovery": 28,
108+
"cross-repo-dep-trace": 24,
109+
"Implementation (feature)": 21,
110+
"Testing & QA": 18,
111+
"fix": 15,
112+
"Refactoring": 13,
113+
"debug": 13,
114+
"migration-inventory": 13,
115+
"platform-knowledge": 13,
116+
"compliance-audit": 12,
117+
"cross-org-discovery": 12,
118+
"Documentation": 11,
119+
"domain-lineage": 10,
120+
"vuln-remediation": 10,
121+
"agentic-correctness": 10,
122+
"secure": 9,
123+
"Onboarding & comprehension": 9,
124+
"incident-debug": 9,
125+
"design": 8,
126+
"Migration analysis": 8,
127+
"Implementation (bug fix)": 7,
128+
"Analysis": 6,
129+
"Security remediation": 6,
130+
"Cross-repo tracing": 5,
131+
"Debugging": 4,
132+
"Incident investigation": 3,
115133
"Implementation (refactor)": 2,
134+
"Platform engineering": 2,
135+
"Onboarding comprehension": 2,
136+
"Bug Repair": 2,
116137
"Implementation (refactoring)": 1,
117-
"Incident investigation": 3,
118-
"Migration analysis": 2,
119-
"Onboarding & comprehension": 3,
120-
"Platform engineering": 3,
121-
"Refactoring": 17,
122-
"Requirements & Discovery": 34,
123-
"Security remediation": 2,
124-
"Security review": 3,
125-
"Testing & QA": 18,
126-
"debug": 14,
127-
"design": 11,
128-
"fix": 19,
129-
"secure": 14
138+
"Security review": 1,
139+
"Compliance audit": 1,
140+
"Platform knowledge": 1
130141
},
131142
"tasks_per_benchmark": {
132-
"csb_sdlc_debug": 20,
133-
"csb_sdlc_design": 20,
134-
"csb_sdlc_document": 20,
135-
"csb_sdlc_fix": 25,
136-
"csb_org_compliance": 7,
137-
"csb_org_crossorg": 5,
138-
"csb_org_crossrepo": 1,
139-
"csb_org_crossrepo_tracing": 9,
140-
"csb_org_domain": 10,
141-
"csb_org_incident": 11,
142-
"csb_org_migration": 7,
143-
"csb_org_onboarding": 11,
144-
"csb_org_org": 5,
145-
"csb_org_platform": 5,
146-
"csb_org_security": 10,
147-
"csb_sdlc_secure": 20,
148-
"csb_sdlc_test": 20,
149-
"csb_sdlc_understand": 20,
150-
"csb_sdlc_feature": 20,
151-
"csb_sdlc_refactor": 20
143+
"csb_org_compliance": 18,
144+
"csb_org_crossorg": 15,
145+
"csb_org_crossrepo": 14,
146+
"csb_org_crossrepo_tracing": 22,
147+
"csb_org_domain": 20,
148+
"csb_org_incident": 20,
149+
"csb_org_migration": 26,
150+
"csb_org_onboarding": 28,
151+
"csb_org_org": 15,
152+
"csb_org_platform": 18,
153+
"csb_org_security": 24,
154+
"csb_sdlc_debug": 18,
155+
"csb_sdlc_design": 14,
156+
"csb_sdlc_document": 13,
157+
"csb_sdlc_feature": 23,
158+
"csb_sdlc_fix": 26,
159+
"csb_sdlc_refactor": 16,
160+
"csb_sdlc_secure": 12,
161+
"csb_sdlc_test": 18,
162+
"csb_sdlc_understand": 10
152163
},
153164
"tasks_per_language": {
154-
"java": 56,
155-
"go": 138,
165+
"go": 134,
166+
"cpp": 73,
167+
"java": 57,
168+
"python": 55,
156169
"rust": 12,
170+
"c": 10,
171+
"javascript": 8,
172+
"typescript": 7,
173+
"java,cpp": 5,
174+
"cpp,c,javascript": 3,
157175
"python,cpp": 1,
158-
"typescript": 4,
159-
"python": 52,
160176
"csharp": 1,
161-
"cpp": 75,
162-
"javascript": 6,
163-
"c": 9,
164177
"go,protobuf": 1,
165178
"go,cpp": 1,
166179
"mixed": 1,
167-
"java,cpp": 5,
168-
"cpp,c,javascript": 3,
169180
"unknown": 1
170181
},
171-
"avg_mcp_benefit_score": 0.868,
172-
"total_tasks": 390,
182+
"avg_mcp_benefit_score": 0.891,
183+
"total_tasks": 370,
173184
"per_suite": {
174185
"csb_sdlc_feature": 23,
175186
"csb_sdlc_refactor": 16,

0 commit comments

Comments
 (0)