Skip to content

Commit 63b8613

Browse files
sjarmak and claude committed
fix: normalize repo names in oracle_checks.py to handle github.com/ prefix
MCP agents receive repo names with github.com/ prefix from Sourcegraph tools. Haiku failed to strip this prefix in config-trace-010, causing a false 0.0 score despite correct answer content. Added _normalize_repo() that strips github.com/, gitlab.com/, bitbucket.org/ prefixes before comparison in check_file_set_match, check_symbol_resolution, and check_dependency_chain. Also adds /run-benchmark slash command. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2a2a372 commit 63b8613

File tree

13 files changed

+268
-36
lines changed

13 files changed

+268
-36
lines changed

.claude/commands/run-benchmark.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
Configure and launch CodeContextBench benchmark runs.
2+
3+
## Approval Gate (Required)
4+
5+
Before executing any run, confirm with the user:
6+
7+
1. **Model** — e.g. `anthropic/claude-haiku-4-5-20251001` (test) or `anthropic/claude-sonnet-4-6-20250514`
8+
2. **Suite / selection file** — which benchmark suite or `--selection-file`?
9+
3. **Config** — paired (default), `--baseline-only`, or `--full-only`? Which `--full-config`?
10+
4. **Parallel slots** — how many? (default: 1; use 8 for multi-account)
11+
5. **Category** — `staging` (default) or `official`?
12+
13+
Do NOT launch until all five are confirmed.
14+
15+
## Steps
16+
17+
1. Run preflight checks:
18+
```bash
19+
python3 scripts/check_infra.py
20+
```
21+
22+
2. Launch the run with confirmed parameters:
23+
```bash
24+
# MCP-unique tasks (artifact config)
25+
FULL_CONFIG=mcp-remote-artifact bash configs/run_selected_tasks.sh \
26+
--selection-file configs/selected_mcp_unique_tasks.json \
27+
--model <MODEL> --parallel <N> --category <CATEGORY>
28+
29+
# SDLC suite
30+
./configs/<suite>_2config.sh --parallel <N>
31+
```
32+
33+
3. Monitor progress:
34+
```bash
35+
python3 scripts/aggregate_status.py --staging
36+
```
37+
38+
## Arguments
39+
40+
$ARGUMENTS — optional: suite name, model, or selection file to pre-fill the approval gate

benchmarks/ccb_mcp_crossorg/ccx-crossorg-061/tests/oracle_checks.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,24 @@
2424
import subprocess
2525
import sys
2626
from pathlib import Path
27+
import re
2728
from typing import Any, Dict, List, Optional
2829

30+
# Hosting prefixes that Sourcegraph MCP tools prepend to repo names.
31+
# Agents sometimes forget to strip these; normalize before comparison.
32+
_HOSTING_PREFIX_RE = re.compile(r"^(?:github\.com|gitlab\.com|bitbucket\.org)/")
33+
34+
35+
def _normalize_repo(repo: str) -> str:
36+
"""Strip hosting-provider prefix from a repo name for fuzzy matching.
37+
38+
>>> _normalize_repo("github.com/sg-benchmarks/kubernetes-client-go")
39+
'sg-benchmarks/kubernetes-client-go'
40+
>>> _normalize_repo("sg-benchmarks/kubernetes-client-go")
41+
'sg-benchmarks/kubernetes-client-go'
42+
"""
43+
return _HOSTING_PREFIX_RE.sub("", repo)
44+
2945

3046
def check_file_set_match(
3147
answer_files: List[Dict[str, str]],
@@ -50,7 +66,7 @@ def check_file_set_match(
5066
[{'repo': 'a/b', 'path': 'x.go'}]
5167
"""
5268
def _key(item: Dict[str, str]) -> tuple:
53-
return (item.get("repo", ""), item.get("path", ""))
69+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""))
5470

5571
oracle_set = {_key(f) for f in oracle_files}
5672
answer_set = {_key(f) for f in answer_files}
@@ -90,7 +106,7 @@ def check_symbol_resolution(
90106
1.0
91107
"""
92108
def _key(item: Dict[str, str]) -> tuple:
93-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
109+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
94110

95111
oracle_set = {_key(s) for s in oracle_symbols}
96112
answer_set = {_key(s) for s in answer_symbols}
@@ -133,7 +149,7 @@ def check_dependency_chain(
133149
1.0
134150
"""
135151
def _key(item: Dict[str, str]) -> tuple:
136-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
152+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
137153

138154
oracle_keys = [_key(s) for s in oracle_chain]
139155
answer_keys = [_key(s) for s in answer_chain]

benchmarks/ccb_mcp_crossorg/ccx-crossorg-066/tests/oracle_checks.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,24 @@
2424
import subprocess
2525
import sys
2626
from pathlib import Path
27+
import re
2728
from typing import Any, Dict, List, Optional
2829

30+
# Hosting prefixes that Sourcegraph MCP tools prepend to repo names.
31+
# Agents sometimes forget to strip these; normalize before comparison.
32+
_HOSTING_PREFIX_RE = re.compile(r"^(?:github\.com|gitlab\.com|bitbucket\.org)/")
33+
34+
35+
def _normalize_repo(repo: str) -> str:
36+
"""Strip hosting-provider prefix from a repo name for fuzzy matching.
37+
38+
>>> _normalize_repo("github.com/sg-benchmarks/kubernetes-client-go")
39+
'sg-benchmarks/kubernetes-client-go'
40+
>>> _normalize_repo("sg-benchmarks/kubernetes-client-go")
41+
'sg-benchmarks/kubernetes-client-go'
42+
"""
43+
return _HOSTING_PREFIX_RE.sub("", repo)
44+
2945

3046
def check_file_set_match(
3147
answer_files: List[Dict[str, str]],
@@ -50,7 +66,7 @@ def check_file_set_match(
5066
[{'repo': 'a/b', 'path': 'x.go'}]
5167
"""
5268
def _key(item: Dict[str, str]) -> tuple:
53-
return (item.get("repo", ""), item.get("path", ""))
69+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""))
5470

5571
oracle_set = {_key(f) for f in oracle_files}
5672
answer_set = {_key(f) for f in answer_files}
@@ -90,7 +106,7 @@ def check_symbol_resolution(
90106
1.0
91107
"""
92108
def _key(item: Dict[str, str]) -> tuple:
93-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
109+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
94110

95111
oracle_set = {_key(s) for s in oracle_symbols}
96112
answer_set = {_key(s) for s in answer_symbols}
@@ -133,7 +149,7 @@ def check_dependency_chain(
133149
1.0
134150
"""
135151
def _key(item: Dict[str, str]) -> tuple:
136-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
152+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
137153

138154
oracle_keys = [_key(s) for s in oracle_chain]
139155
answer_keys = [_key(s) for s in answer_chain]

benchmarks/ccb_mcp_crossrepo_tracing/ccx-config-trace-010/tests/oracle_checks.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,24 @@
2424
import subprocess
2525
import sys
2626
from pathlib import Path
27+
import re
2728
from typing import Any, Dict, List, Optional
2829

30+
# Hosting prefixes that Sourcegraph MCP tools prepend to repo names.
31+
# Agents sometimes forget to strip these; normalize before comparison.
32+
_HOSTING_PREFIX_RE = re.compile(r"^(?:github\.com|gitlab\.com|bitbucket\.org)/")
33+
34+
35+
def _normalize_repo(repo: str) -> str:
36+
"""Strip hosting-provider prefix from a repo name for fuzzy matching.
37+
38+
>>> _normalize_repo("github.com/sg-benchmarks/kubernetes-client-go")
39+
'sg-benchmarks/kubernetes-client-go'
40+
>>> _normalize_repo("sg-benchmarks/kubernetes-client-go")
41+
'sg-benchmarks/kubernetes-client-go'
42+
"""
43+
return _HOSTING_PREFIX_RE.sub("", repo)
44+
2945

3046
def check_file_set_match(
3147
answer_files: List[Dict[str, str]],
@@ -50,7 +66,7 @@ def check_file_set_match(
5066
[{'repo': 'a/b', 'path': 'x.go'}]
5167
"""
5268
def _key(item: Dict[str, str]) -> tuple:
53-
return (item.get("repo", ""), item.get("path", ""))
69+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""))
5470

5571
oracle_set = {_key(f) for f in oracle_files}
5672
answer_set = {_key(f) for f in answer_files}
@@ -90,7 +106,7 @@ def check_symbol_resolution(
90106
1.0
91107
"""
92108
def _key(item: Dict[str, str]) -> tuple:
93-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
109+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
94110

95111
oracle_set = {_key(s) for s in oracle_symbols}
96112
answer_set = {_key(s) for s in answer_symbols}
@@ -133,7 +149,7 @@ def check_dependency_chain(
133149
1.0
134150
"""
135151
def _key(item: Dict[str, str]) -> tuple:
136-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
152+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
137153

138154
oracle_keys = [_key(s) for s in oracle_chain]
139155
answer_keys = [_key(s) for s in answer_chain]

benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-001/tests/oracle_checks.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,24 @@
2424
import subprocess
2525
import sys
2626
from pathlib import Path
27+
import re
2728
from typing import Any, Dict, List, Optional
2829

30+
# Hosting prefixes that Sourcegraph MCP tools prepend to repo names.
31+
# Agents sometimes forget to strip these; normalize before comparison.
32+
_HOSTING_PREFIX_RE = re.compile(r"^(?:github\.com|gitlab\.com|bitbucket\.org)/")
33+
34+
35+
def _normalize_repo(repo: str) -> str:
36+
"""Strip hosting-provider prefix from a repo name for fuzzy matching.
37+
38+
>>> _normalize_repo("github.com/sg-benchmarks/kubernetes-client-go")
39+
'sg-benchmarks/kubernetes-client-go'
40+
>>> _normalize_repo("sg-benchmarks/kubernetes-client-go")
41+
'sg-benchmarks/kubernetes-client-go'
42+
"""
43+
return _HOSTING_PREFIX_RE.sub("", repo)
44+
2945

3046
def check_file_set_match(
3147
answer_files: List[Dict[str, str]],
@@ -50,7 +66,7 @@ def check_file_set_match(
5066
[{'repo': 'a/b', 'path': 'x.go'}]
5167
"""
5268
def _key(item: Dict[str, str]) -> tuple:
53-
return (item.get("repo", ""), item.get("path", ""))
69+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""))
5470

5571
oracle_set = {_key(f) for f in oracle_files}
5672
answer_set = {_key(f) for f in answer_files}
@@ -90,7 +106,7 @@ def check_symbol_resolution(
90106
1.0
91107
"""
92108
def _key(item: Dict[str, str]) -> tuple:
93-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
109+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
94110

95111
oracle_set = {_key(s) for s in oracle_symbols}
96112
answer_set = {_key(s) for s in answer_symbols}
@@ -133,7 +149,7 @@ def check_dependency_chain(
133149
1.0
134150
"""
135151
def _key(item: Dict[str, str]) -> tuple:
136-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
152+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
137153

138154
oracle_keys = [_key(s) for s in oracle_chain]
139155
answer_keys = [_key(s) for s in answer_chain]

benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-004/tests/oracle_checks.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,24 @@
2424
import subprocess
2525
import sys
2626
from pathlib import Path
27+
import re
2728
from typing import Any, Dict, List, Optional
2829

30+
# Hosting prefixes that Sourcegraph MCP tools prepend to repo names.
31+
# Agents sometimes forget to strip these; normalize before comparison.
32+
_HOSTING_PREFIX_RE = re.compile(r"^(?:github\.com|gitlab\.com|bitbucket\.org)/")
33+
34+
35+
def _normalize_repo(repo: str) -> str:
36+
"""Strip hosting-provider prefix from a repo name for fuzzy matching.
37+
38+
>>> _normalize_repo("github.com/sg-benchmarks/kubernetes-client-go")
39+
'sg-benchmarks/kubernetes-client-go'
40+
>>> _normalize_repo("sg-benchmarks/kubernetes-client-go")
41+
'sg-benchmarks/kubernetes-client-go'
42+
"""
43+
return _HOSTING_PREFIX_RE.sub("", repo)
44+
2945

3046
def check_file_set_match(
3147
answer_files: List[Dict[str, str]],
@@ -50,7 +66,7 @@ def check_file_set_match(
5066
[{'repo': 'a/b', 'path': 'x.go'}]
5167
"""
5268
def _key(item: Dict[str, str]) -> tuple:
53-
return (item.get("repo", ""), item.get("path", ""))
69+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""))
5470

5571
oracle_set = {_key(f) for f in oracle_files}
5672
answer_set = {_key(f) for f in answer_files}
@@ -90,7 +106,7 @@ def check_symbol_resolution(
90106
1.0
91107
"""
92108
def _key(item: Dict[str, str]) -> tuple:
93-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
109+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
94110

95111
oracle_set = {_key(s) for s in oracle_symbols}
96112
answer_set = {_key(s) for s in answer_symbols}
@@ -133,7 +149,7 @@ def check_dependency_chain(
133149
1.0
134150
"""
135151
def _key(item: Dict[str, str]) -> tuple:
136-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
152+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
137153

138154
oracle_keys = [_key(s) for s in oracle_chain]
139155
answer_keys = [_key(s) for s in answer_chain]

benchmarks/ccb_mcp_incident/ccx-incident-031/tests/oracle_checks.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,24 @@
2424
import subprocess
2525
import sys
2626
from pathlib import Path
27+
import re
2728
from typing import Any, Dict, List, Optional
2829

30+
# Hosting prefixes that Sourcegraph MCP tools prepend to repo names.
31+
# Agents sometimes forget to strip these; normalize before comparison.
32+
_HOSTING_PREFIX_RE = re.compile(r"^(?:github\.com|gitlab\.com|bitbucket\.org)/")
33+
34+
35+
def _normalize_repo(repo: str) -> str:
36+
"""Strip hosting-provider prefix from a repo name for fuzzy matching.
37+
38+
>>> _normalize_repo("github.com/sg-benchmarks/kubernetes-client-go")
39+
'sg-benchmarks/kubernetes-client-go'
40+
>>> _normalize_repo("sg-benchmarks/kubernetes-client-go")
41+
'sg-benchmarks/kubernetes-client-go'
42+
"""
43+
return _HOSTING_PREFIX_RE.sub("", repo)
44+
2945

3046
def check_file_set_match(
3147
answer_files: List[Dict[str, str]],
@@ -50,7 +66,7 @@ def check_file_set_match(
5066
[{'repo': 'a/b', 'path': 'x.go'}]
5167
"""
5268
def _key(item: Dict[str, str]) -> tuple:
53-
return (item.get("repo", ""), item.get("path", ""))
69+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""))
5470

5571
oracle_set = {_key(f) for f in oracle_files}
5672
answer_set = {_key(f) for f in answer_files}
@@ -90,7 +106,7 @@ def check_symbol_resolution(
90106
1.0
91107
"""
92108
def _key(item: Dict[str, str]) -> tuple:
93-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
109+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
94110

95111
oracle_set = {_key(s) for s in oracle_symbols}
96112
answer_set = {_key(s) for s in answer_symbols}
@@ -133,7 +149,7 @@ def check_dependency_chain(
133149
1.0
134150
"""
135151
def _key(item: Dict[str, str]) -> tuple:
136-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
152+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
137153

138154
oracle_keys = [_key(s) for s in oracle_chain]
139155
answer_keys = [_key(s) for s in answer_chain]

0 commit comments

Comments
 (0)