Skip to content

Commit 25cc997

Browse files
sjarmak and claude committed
feat: replace heuristic LOC estimates with precise cloc counts for all benchmark repos
The old repo_approx_loc was computed as repo_size_bytes/30, which inflated values by 5-50x (e.g. linux: 208M heuristic vs 29.8M actual). Now uses cloc --json on shallow clones of all 71 repos.

- Add scripts/collect_repo_cloc.py: shallow-clone + cloc pipeline (resumable)
- Add scripts/update_loc_from_cloc.py: propagate cloc values to all configs
- Update backfill_size_metadata.py to prefer cloc data over bytes/30 heuristic
- Update 470/477 tasks across 9 config files + official_results.json
- 71 repos counted: 9.9K (grpcurl) to 43.4M (chromium) code lines

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 695c0d5 commit 25cc997

17 files changed

+40717
-2940
lines changed

configs/coverage_gap_tasks.json

Lines changed: 3477 additions & 0 deletions
Large diffs are not rendered by default.

configs/rerun_2_failed_org.json

Lines changed: 53 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"repo_size_mb": 2439.537,
2424
"repo_file_count": 20583,
2525
"repo_directory_count": 3599,
26-
"repo_approx_loc": 85268002,
26+
"repo_approx_loc": 7111004,
2727
"repo_languages": [
2828
{
2929
"language": "csharp",
@@ -52,7 +52,31 @@
5252
"repo_complexity_source": "cached_registry",
5353
"task_complexity": 0.15,
5454
"task_complexity_label": "medium",
55-
"task_complexity_source": "ground_truth_meta_plus_registry"
55+
"task_complexity_source": "ground_truth_meta_plus_registry",
56+
"repo_approx_loc_source": "cloc",
57+
"repo_cloc_total_files": 19850,
58+
"repo_cloc_top_languages": [
59+
{
60+
"language": "C#",
61+
"code_lines": 4582333
62+
},
63+
{
64+
"language": "Visual Basic .NET",
65+
"code_lines": 1709980
66+
},
67+
{
68+
"language": "XML",
69+
"code_lines": 668001
70+
},
71+
{
72+
"language": "JSON",
73+
"code_lines": 77418
74+
},
75+
{
76+
"language": "Markdown",
77+
"code_lines": 28473
78+
}
79+
]
5680
},
5781
{
5882
"task_id": "ccx-migration-294",
@@ -73,7 +97,7 @@
7397
"repo_size_mb": 2881.666,
7498
"repo_file_count": 81361,
7599
"repo_directory_count": 2195,
76-
"repo_approx_loc": 100721527,
100+
"repo_approx_loc": 3030308,
77101
"repo_languages": [
78102
{
79103
"language": "typescript",
@@ -92,7 +116,31 @@
92116
"repo_complexity_source": "cached_registry",
93117
"task_complexity": 0.15,
94118
"task_complexity_label": "medium",
95-
"task_complexity_source": "ground_truth_meta_plus_registry"
119+
"task_complexity_source": "ground_truth_meta_plus_registry",
120+
"repo_approx_loc_source": "cloc",
121+
"repo_cloc_total_files": 37623,
122+
"repo_cloc_top_languages": [
123+
{
124+
"language": "JavaScript",
125+
"code_lines": 1870486
126+
},
127+
{
128+
"language": "TypeScript",
129+
"code_lines": 676704
130+
},
131+
{
132+
"language": "XML",
133+
"code_lines": 254275
134+
},
135+
{
136+
"language": "Markdown",
137+
"code_lines": 179456
138+
},
139+
{
140+
"language": "JSON",
141+
"code_lines": 46297
142+
}
143+
]
96144
}
97145
]
98-
}
146+
}

configs/rerun_migration294.json

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"repo_size_mb": 2881.666,
2424
"repo_file_count": 81361,
2525
"repo_directory_count": 2195,
26-
"repo_approx_loc": 100721527,
26+
"repo_approx_loc": 3030308,
2727
"repo_languages": [
2828
{
2929
"language": "typescript",
@@ -42,7 +42,31 @@
4242
"repo_complexity_source": "cached_registry",
4343
"task_complexity": 0.15,
4444
"task_complexity_label": "medium",
45-
"task_complexity_source": "ground_truth_meta_plus_registry"
45+
"task_complexity_source": "ground_truth_meta_plus_registry",
46+
"repo_approx_loc_source": "cloc",
47+
"repo_cloc_total_files": 37623,
48+
"repo_cloc_top_languages": [
49+
{
50+
"language": "JavaScript",
51+
"code_lines": 1870486
52+
},
53+
{
54+
"language": "TypeScript",
55+
"code_lines": 676704
56+
},
57+
{
58+
"language": "XML",
59+
"code_lines": 254275
60+
},
61+
{
62+
"language": "Markdown",
63+
"code_lines": 179456
64+
},
65+
{
66+
"language": "JSON",
67+
"code_lines": 46297
68+
}
69+
]
4670
}
4771
]
48-
}
72+
}

0 commit comments

Comments
 (0)