Skip to content

Commit 9cdbae5

Browse files
LoCoBench Bot and claude committed
feat: US-008 - Register 9 navprove tasks in selected_benchmark_tasks.json
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c28f6d3 commit 9cdbae5

File tree

1 file changed

+170
-6
lines changed

1 file changed

+170
-6
lines changed

configs/selected_benchmark_tasks.json

Lines changed: 170 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"generated_by": "scripts/select_benchmark_tasks.py",
66
"random_seed": 42,
77
"total_available": 835,
8-
"total_selected": 166
8+
"total_selected": 175
99
},
1010
"methodology": {
1111
"description": "Tasks selected via stratified sampling across 15 benchmarks, covering all SDLC phases. Each task scored for MCP benefit using weighted combination of context complexity, cross-file dependencies, semantic search potential, and task category affinity.",
@@ -38,14 +38,15 @@
3838
"ccb_linuxflbench": "5 (representative tasks across kernel subsystems)",
3939
"ccb_investigation": "all (4, investigation-only: debug, impact, migration, regression)",
4040
"ccb_governance": "all (6, governance compliance: repo scope, sensitive exclusion, cross-team, audit trail, degraded context)",
41-
"ccb_enterprise": "all (5, enterprise complexity: multi-team ownership, conflicting docs, legacy deps, polyglot)"
41+
"ccb_enterprise": "all (5, enterprise complexity: multi-team ownership, conflicting docs, legacy deps, polyglot)",
42+
"ccb_navprove": "all (9, navigation-verified: find bug + write regression test)"
4243
}
4344
},
4445
"statistics": {
4546
"tasks_per_sdlc_phase": {
4647
"Analysis": 8,
4748
"Architecture & Design": 10,
48-
"Debugging": 3,
49+
"Debugging": 12,
4950
"Documentation": 5,
5051
"Implementation (bug fix)": 62,
5152
"Implementation (feature)": 28,
@@ -69,6 +70,7 @@
6970
"ccb_largerepo": 25,
7071
"ccb_linuxflbench": 5,
7172
"ccb_locobench": 25,
73+
"ccb_navprove": 9,
7274
"ccb_pytorch": 11,
7375
"ccb_security": 8,
7476
"ccb_swebenchpro": 36,
@@ -79,13 +81,13 @@
7981
"c": 16,
8082
"cpp": 19,
8183
"csharp": 6,
82-
"go": 41,
84+
"go": 44,
8385
"java": 11,
8486
"javascript": 6,
85-
"python": 45,
87+
"python": 50,
8688
"python,cpp": 1,
8789
"rust": 11,
88-
"typescript": 10
90+
"typescript": 11
8991
},
9092
"avg_mcp_benefit_score": 0.7648
9193
},
@@ -3077,6 +3079,168 @@
30773079
},
30783080
"selection_rationale": "Transitive dependency vulnerability analysis (NOT AFFECTED case) - analyze CVE-2023-39325 in grpcurl -> grpc-go -> golang.org/x/net chain. Agent must prove grpcurl is NOT affected despite having vulnerable x/net version because it uses HTTP/2 client (Transport) not server (Server.ServeConn). Harder than sec-transitive-001 - requires proving a negative by distinguishing client vs server code paths.",
30793081
"task_dir": "ccb_security/sec-transitive-002"
3082+
},
3083+
{
3084+
"task_id": "navprove-qb-url-001",
3085+
"benchmark": "ccb_navprove",
3086+
"sdlc_phase": "Debugging",
3087+
"language": "python",
3088+
"difficulty": "hard",
3089+
"category": "navigation_verified",
3090+
"repo": "qutebrowser/qutebrowser",
3091+
"mcp_benefit_score": 0.83,
3092+
"mcp_breakdown": {
3093+
"context_complexity": 0.85,
3094+
"cross_file_deps": 0.80,
3095+
"semantic_search_potential": 0.90,
3096+
"task_category_weight": 0.80
3097+
},
3098+
"selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)",
3099+
"task_dir": "ccb_navprove/navprove-qb-url-001"
3100+
},
3101+
{
3102+
"task_id": "navprove-qb-tab-001",
3103+
"benchmark": "ccb_navprove",
3104+
"sdlc_phase": "Debugging",
3105+
"language": "python",
3106+
"difficulty": "hard",
3107+
"category": "navigation_verified",
3108+
"repo": "qutebrowser/qutebrowser",
3109+
"mcp_benefit_score": 0.83,
3110+
"mcp_breakdown": {
3111+
"context_complexity": 0.85,
3112+
"cross_file_deps": 0.80,
3113+
"semantic_search_potential": 0.90,
3114+
"task_category_weight": 0.80
3115+
},
3116+
"selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)",
3117+
"task_dir": "ccb_navprove/navprove-qb-tab-001"
3118+
},
3119+
{
3120+
"task_id": "navprove-qb-bookmark-001",
3121+
"benchmark": "ccb_navprove",
3122+
"sdlc_phase": "Debugging",
3123+
"language": "python",
3124+
"difficulty": "hard",
3125+
"category": "navigation_verified",
3126+
"repo": "qutebrowser/qutebrowser",
3127+
"mcp_benefit_score": 0.83,
3128+
"mcp_breakdown": {
3129+
"context_complexity": 0.85,
3130+
"cross_file_deps": 0.80,
3131+
"semantic_search_potential": 0.90,
3132+
"task_category_weight": 0.80
3133+
},
3134+
"selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)",
3135+
"task_dir": "ccb_navprove/navprove-qb-bookmark-001"
3136+
},
3137+
{
3138+
"task_id": "navprove-qb-download-001",
3139+
"benchmark": "ccb_navprove",
3140+
"sdlc_phase": "Debugging",
3141+
"language": "python",
3142+
"difficulty": "hard",
3143+
"category": "navigation_verified",
3144+
"repo": "qutebrowser/qutebrowser",
3145+
"mcp_benefit_score": 0.83,
3146+
"mcp_breakdown": {
3147+
"context_complexity": 0.85,
3148+
"cross_file_deps": 0.80,
3149+
"semantic_search_potential": 0.90,
3150+
"task_category_weight": 0.80
3151+
},
3152+
"selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)",
3153+
"task_dir": "ccb_navprove/navprove-qb-download-001"
3154+
},
3155+
{
3156+
"task_id": "navprove-ansible-vault-001",
3157+
"benchmark": "ccb_navprove",
3158+
"sdlc_phase": "Debugging",
3159+
"language": "python",
3160+
"difficulty": "hard",
3161+
"category": "navigation_verified",
3162+
"repo": "ansible/ansible",
3163+
"mcp_benefit_score": 0.86,
3164+
"mcp_breakdown": {
3165+
"context_complexity": 0.90,
3166+
"cross_file_deps": 0.85,
3167+
"semantic_search_potential": 0.90,
3168+
"task_category_weight": 0.80
3169+
},
3170+
"selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)",
3171+
"task_dir": "ccb_navprove/navprove-ansible-vault-001"
3172+
},
3173+
{
3174+
"task_id": "navprove-teleport-ssh-001",
3175+
"benchmark": "ccb_navprove",
3176+
"sdlc_phase": "Debugging",
3177+
"language": "go",
3178+
"difficulty": "hard",
3179+
"category": "navigation_verified",
3180+
"repo": "gravitational/teleport",
3181+
"mcp_benefit_score": 0.85,
3182+
"mcp_breakdown": {
3183+
"context_complexity": 0.90,
3184+
"cross_file_deps": 0.85,
3185+
"semantic_search_potential": 0.85,
3186+
"task_category_weight": 0.80
3187+
},
3188+
"selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)",
3189+
"task_dir": "ccb_navprove/navprove-teleport-ssh-001"
3190+
},
3191+
{
3192+
"task_id": "navprove-vuls-oval-001",
3193+
"benchmark": "ccb_navprove",
3194+
"sdlc_phase": "Debugging",
3195+
"language": "go",
3196+
"difficulty": "hard",
3197+
"category": "navigation_verified",
3198+
"repo": "future-architect/vuls",
3199+
"mcp_benefit_score": 0.77,
3200+
"mcp_breakdown": {
3201+
"context_complexity": 0.75,
3202+
"cross_file_deps": 0.70,
3203+
"semantic_search_potential": 0.85,
3204+
"task_category_weight": 0.80
3205+
},
3206+
"selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)",
3207+
"task_dir": "ccb_navprove/navprove-vuls-oval-001"
3208+
},
3209+
{
3210+
"task_id": "navprove-flipt-cache-001",
3211+
"benchmark": "ccb_navprove",
3212+
"sdlc_phase": "Debugging",
3213+
"language": "go",
3214+
"difficulty": "hard",
3215+
"category": "navigation_verified",
3216+
"repo": "flipt-io/flipt",
3217+
"mcp_benefit_score": 0.80,
3218+
"mcp_breakdown": {
3219+
"context_complexity": 0.80,
3220+
"cross_file_deps": 0.75,
3221+
"semantic_search_potential": 0.85,
3222+
"task_category_weight": 0.80
3223+
},
3224+
"selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)",
3225+
"task_dir": "ccb_navprove/navprove-flipt-cache-001"
3226+
},
3227+
{
3228+
"task_id": "navprove-tutanota-search-001",
3229+
"benchmark": "ccb_navprove",
3230+
"sdlc_phase": "Debugging",
3231+
"language": "typescript",
3232+
"difficulty": "hard",
3233+
"category": "navigation_verified",
3234+
"repo": "tutao/tutanota",
3235+
"mcp_benefit_score": 0.82,
3236+
"mcp_breakdown": {
3237+
"context_complexity": 0.85,
3238+
"cross_file_deps": 0.80,
3239+
"semantic_search_potential": 0.85,
3240+
"task_category_weight": 0.80
3241+
},
3242+
"selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)",
3243+
"task_dir": "ccb_navprove/navprove-tutanota-search-001"
30803244
}
30813245
]
30823246
}

0 commit comments

Comments
 (0)