|
5 | 5 | "generated_by": "scripts/select_benchmark_tasks.py", |
6 | 6 | "random_seed": 42, |
7 | 7 | "total_available": 835, |
8 | | - "total_selected": 166 |
| 8 | + "total_selected": 175 |
9 | 9 | }, |
10 | 10 | "methodology": { |
11 | 11 | "description": "Tasks selected via stratified sampling across 15 benchmarks, covering all SDLC phases. Each task scored for MCP benefit using weighted combination of context complexity, cross-file dependencies, semantic search potential, and task category affinity.", |
|
38 | 38 | "ccb_linuxflbench": "5 (representative tasks across kernel subsystems)", |
39 | 39 | "ccb_investigation": "all (4, investigation-only: debug, impact, migration, regression)", |
40 | 40 | "ccb_governance": "all (6, governance compliance: repo scope, sensitive exclusion, cross-team, audit trail, degraded context)", |
41 | | - "ccb_enterprise": "all (5, enterprise complexity: multi-team ownership, conflicting docs, legacy deps, polyglot)" |
| 41 | + "ccb_enterprise": "all (5, enterprise complexity: multi-team ownership, conflicting docs, legacy deps, polyglot)", |
| 42 | + "ccb_navprove": "all (9, navigation-verified: find bug + write regression test)" |
42 | 43 | } |
43 | 44 | }, |
44 | 45 | "statistics": { |
45 | 46 | "tasks_per_sdlc_phase": { |
46 | 47 | "Analysis": 8, |
47 | 48 | "Architecture & Design": 10, |
48 | | - "Debugging": 3, |
| 49 | + "Debugging": 12, |
49 | 50 | "Documentation": 5, |
50 | 51 | "Implementation (bug fix)": 62, |
51 | 52 | "Implementation (feature)": 28, |
|
69 | 70 | "ccb_largerepo": 25, |
70 | 71 | "ccb_linuxflbench": 5, |
71 | 72 | "ccb_locobench": 25, |
| 73 | + "ccb_navprove": 9, |
72 | 74 | "ccb_pytorch": 11, |
73 | 75 | "ccb_security": 8, |
74 | 76 | "ccb_swebenchpro": 36, |
|
79 | 81 | "c": 16, |
80 | 82 | "cpp": 19, |
81 | 83 | "csharp": 6, |
82 | | - "go": 41, |
| 84 | + "go": 44, |
83 | 85 | "java": 11, |
84 | 86 | "javascript": 6, |
85 | | - "python": 45, |
| 87 | + "python": 50, |
86 | 88 | "python,cpp": 1, |
87 | 89 | "rust": 11, |
88 | | - "typescript": 10 |
| 90 | + "typescript": 11 |
89 | 91 | }, |
90 | 92 | "avg_mcp_benefit_score": 0.7648 |
91 | 93 | }, |
|
3077 | 3079 | }, |
3078 | 3080 | "selection_rationale": "Transitive dependency vulnerability analysis (NOT AFFECTED case) - analyze CVE-2023-39325 in grpcurl -> grpc-go -> golang.org/x/net chain. Agent must prove grpcurl is NOT affected despite having vulnerable x/net version because it uses HTTP/2 client (Transport) not server (Server.ServeConn). Harder than sec-transitive-001 - requires proving a negative by distinguishing client vs server code paths.", |
3079 | 3081 | "task_dir": "ccb_security/sec-transitive-002" |
| 3082 | + }, |
| 3083 | + { |
| 3084 | + "task_id": "navprove-qb-url-001", |
| 3085 | + "benchmark": "ccb_navprove", |
| 3086 | + "sdlc_phase": "Debugging", |
| 3087 | + "language": "python", |
| 3088 | + "difficulty": "hard", |
| 3089 | + "category": "navigation_verified", |
| 3090 | + "repo": "qutebrowser/qutebrowser", |
| 3091 | + "mcp_benefit_score": 0.83, |
| 3092 | + "mcp_breakdown": { |
| 3093 | + "context_complexity": 0.85, |
| 3094 | + "cross_file_deps": 0.80, |
| 3095 | + "semantic_search_potential": 0.90, |
| 3096 | + "task_category_weight": 0.80 |
| 3097 | + }, |
| 3098 | + "selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)", |
| 3099 | + "task_dir": "ccb_navprove/navprove-qb-url-001" |
| 3100 | + }, |
| 3101 | + { |
| 3102 | + "task_id": "navprove-qb-tab-001", |
| 3103 | + "benchmark": "ccb_navprove", |
| 3104 | + "sdlc_phase": "Debugging", |
| 3105 | + "language": "python", |
| 3106 | + "difficulty": "hard", |
| 3107 | + "category": "navigation_verified", |
| 3108 | + "repo": "qutebrowser/qutebrowser", |
| 3109 | + "mcp_benefit_score": 0.83, |
| 3110 | + "mcp_breakdown": { |
| 3111 | + "context_complexity": 0.85, |
| 3112 | + "cross_file_deps": 0.80, |
| 3113 | + "semantic_search_potential": 0.90, |
| 3114 | + "task_category_weight": 0.80 |
| 3115 | + }, |
| 3116 | + "selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)", |
| 3117 | + "task_dir": "ccb_navprove/navprove-qb-tab-001" |
| 3118 | + }, |
| 3119 | + { |
| 3120 | + "task_id": "navprove-qb-bookmark-001", |
| 3121 | + "benchmark": "ccb_navprove", |
| 3122 | + "sdlc_phase": "Debugging", |
| 3123 | + "language": "python", |
| 3124 | + "difficulty": "hard", |
| 3125 | + "category": "navigation_verified", |
| 3126 | + "repo": "qutebrowser/qutebrowser", |
| 3127 | + "mcp_benefit_score": 0.83, |
| 3128 | + "mcp_breakdown": { |
| 3129 | + "context_complexity": 0.85, |
| 3130 | + "cross_file_deps": 0.80, |
| 3131 | + "semantic_search_potential": 0.90, |
| 3132 | + "task_category_weight": 0.80 |
| 3133 | + }, |
| 3134 | + "selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)", |
| 3135 | + "task_dir": "ccb_navprove/navprove-qb-bookmark-001" |
| 3136 | + }, |
| 3137 | + { |
| 3138 | + "task_id": "navprove-qb-download-001", |
| 3139 | + "benchmark": "ccb_navprove", |
| 3140 | + "sdlc_phase": "Debugging", |
| 3141 | + "language": "python", |
| 3142 | + "difficulty": "hard", |
| 3143 | + "category": "navigation_verified", |
| 3144 | + "repo": "qutebrowser/qutebrowser", |
| 3145 | + "mcp_benefit_score": 0.83, |
| 3146 | + "mcp_breakdown": { |
| 3147 | + "context_complexity": 0.85, |
| 3148 | + "cross_file_deps": 0.80, |
| 3149 | + "semantic_search_potential": 0.90, |
| 3150 | + "task_category_weight": 0.80 |
| 3151 | + }, |
| 3152 | + "selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)", |
| 3153 | + "task_dir": "ccb_navprove/navprove-qb-download-001" |
| 3154 | + }, |
| 3155 | + { |
| 3156 | + "task_id": "navprove-ansible-vault-001", |
| 3157 | + "benchmark": "ccb_navprove", |
| 3158 | + "sdlc_phase": "Debugging", |
| 3159 | + "language": "python", |
| 3160 | + "difficulty": "hard", |
| 3161 | + "category": "navigation_verified", |
| 3162 | + "repo": "ansible/ansible", |
| 3163 | + "mcp_benefit_score": 0.86, |
| 3164 | + "mcp_breakdown": { |
| 3165 | + "context_complexity": 0.90, |
| 3166 | + "cross_file_deps": 0.85, |
| 3167 | + "semantic_search_potential": 0.90, |
| 3168 | + "task_category_weight": 0.80 |
| 3169 | + }, |
| 3170 | + "selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)", |
| 3171 | + "task_dir": "ccb_navprove/navprove-ansible-vault-001" |
| 3172 | + }, |
| 3173 | + { |
| 3174 | + "task_id": "navprove-teleport-ssh-001", |
| 3175 | + "benchmark": "ccb_navprove", |
| 3176 | + "sdlc_phase": "Debugging", |
| 3177 | + "language": "go", |
| 3178 | + "difficulty": "hard", |
| 3179 | + "category": "navigation_verified", |
| 3180 | + "repo": "gravitational/teleport", |
| 3181 | + "mcp_benefit_score": 0.85, |
| 3182 | + "mcp_breakdown": { |
| 3183 | + "context_complexity": 0.90, |
| 3184 | + "cross_file_deps": 0.85, |
| 3185 | + "semantic_search_potential": 0.85, |
| 3186 | + "task_category_weight": 0.80 |
| 3187 | + }, |
| 3188 | + "selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)", |
| 3189 | + "task_dir": "ccb_navprove/navprove-teleport-ssh-001" |
| 3190 | + }, |
| 3191 | + { |
| 3192 | + "task_id": "navprove-vuls-oval-001", |
| 3193 | + "benchmark": "ccb_navprove", |
| 3194 | + "sdlc_phase": "Debugging", |
| 3195 | + "language": "go", |
| 3196 | + "difficulty": "hard", |
| 3197 | + "category": "navigation_verified", |
| 3198 | + "repo": "future-architect/vuls", |
| 3199 | + "mcp_benefit_score": 0.77, |
| 3200 | + "mcp_breakdown": { |
| 3201 | + "context_complexity": 0.75, |
| 3202 | + "cross_file_deps": 0.70, |
| 3203 | + "semantic_search_potential": 0.85, |
| 3204 | + "task_category_weight": 0.80 |
| 3205 | + }, |
| 3206 | + "selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)", |
| 3207 | + "task_dir": "ccb_navprove/navprove-vuls-oval-001" |
| 3208 | + }, |
| 3209 | + { |
| 3210 | + "task_id": "navprove-flipt-cache-001", |
| 3211 | + "benchmark": "ccb_navprove", |
| 3212 | + "sdlc_phase": "Debugging", |
| 3213 | + "language": "go", |
| 3214 | + "difficulty": "hard", |
| 3215 | + "category": "navigation_verified", |
| 3216 | + "repo": "flipt-io/flipt", |
| 3217 | + "mcp_benefit_score": 0.80, |
| 3218 | + "mcp_breakdown": { |
| 3219 | + "context_complexity": 0.80, |
| 3220 | + "cross_file_deps": 0.75, |
| 3221 | + "semantic_search_potential": 0.85, |
| 3222 | + "task_category_weight": 0.80 |
| 3223 | + }, |
| 3224 | + "selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)", |
| 3225 | + "task_dir": "ccb_navprove/navprove-flipt-cache-001" |
| 3226 | + }, |
| 3227 | + { |
| 3228 | + "task_id": "navprove-tutanota-search-001", |
| 3229 | + "benchmark": "ccb_navprove", |
| 3230 | + "sdlc_phase": "Debugging", |
| 3231 | + "language": "typescript", |
| 3232 | + "difficulty": "hard", |
| 3233 | + "category": "navigation_verified", |
| 3234 | + "repo": "tutao/tutanota", |
| 3235 | + "mcp_benefit_score": 0.82, |
| 3236 | + "mcp_breakdown": { |
| 3237 | + "context_complexity": 0.85, |
| 3238 | + "cross_file_deps": 0.80, |
| 3239 | + "semantic_search_potential": 0.85, |
| 3240 | + "task_category_weight": 0.80 |
| 3241 | + }, |
| 3242 | + "selection_rationale": "All ccb_navprove tasks selected (navigation-verified benchmark, 9 tasks)", |
| 3243 | + "task_dir": "ccb_navprove/navprove-tutanota-search-001" |
3080 | 3244 | } |
3081 | 3245 | ] |
3082 | 3246 | } |
0 commit comments