Skip to content

Commit 87f1a5d

Browse files
sjarmak and claude committed
fix: oracle scoring — strip github.com/ prefix, match version-tag mirror suffixes
Two bugs caused 73/211 MCP-unique tasks to score 0.0 regardless of agent quality:

1. _normalize_file_entry (hydrate_task_specs.py) and _coerce_file_entry (oracle_checks.py): entries like "github.com/sg-evals/kubernetes--v1.32.0/pkg/file.go" were split as repo="github.com/sg-evals", path="kubernetes--v1.32.0/pkg/file.go".
   Fix: strip the "github.com/" prefix before splitting, yielding repo="sg-evals/kubernetes--v1.32.0", path="pkg/file.go".

2. _MIRROR_HASH_RE only matched hex hashes (--871325b8), not version tags (--v1.32.0).
   Fix: broaden the regex to --(v[\d.]+|[0-9a-f]{6,})$ so that kubernetes--v1.32.0 normalizes to "kubernetes" and matches the agent's "kubernetes/kubernetes".

Re-hydrated all 211 task_spec.json files and copied oracle_checks.py to all 211 task dirs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4ff6274 commit 87f1a5d

File tree

244 files changed

+4487
-1104
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

244 files changed

+4487
-1104
lines changed

benchmarks/ccb_mcp_compliance/ccx-compliance-051/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

benchmarks/ccb_mcp_compliance/ccx-compliance-052/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

benchmarks/ccb_mcp_compliance/ccx-compliance-053/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

benchmarks/ccb_mcp_compliance/ccx-compliance-057/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

benchmarks/ccb_mcp_compliance/ccx-compliance-115/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

benchmarks/ccb_mcp_compliance/ccx-compliance-118/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

benchmarks/ccb_mcp_compliance/ccx-compliance-124/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

benchmarks/ccb_mcp_compliance/ccx-compliance-182/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

benchmarks/ccb_mcp_compliance/ccx-compliance-183/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

benchmarks/ccb_mcp_compliance/ccx-compliance-184/tests/oracle_checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,30 @@
3535
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
3636
# both sides so that matching works regardless of which convention is used.
3737

38-
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
38+
_MIRROR_HASH_RE = re.compile(r"--(v[\d.]+|[0-9a-f]{6,})$")
3939

4040

4141
def _coerce_file_entry(entry) -> Dict[str, str]:
4242
"""Coerce a file entry to {"repo": ..., "path": ...} dict format.
4343
4444
Handles string entries like "sg-evals/kubernetes--v1.32.0/pkg/file.go"
4545
where the first two path components are the repo.
46+
47+
Also handles "github.com/sg-evals/repo--hash/path" by stripping the
48+
"github.com/" prefix first so the repo is "sg-evals/repo--hash".
4649
"""
4750
if isinstance(entry, dict):
4851
return entry
4952
if isinstance(entry, str):
50-
parts = entry.split("/", 2)
53+
s = entry
54+
if s.startswith("github.com/"):
55+
s = s[len("github.com/"):]
56+
parts = s.split("/", 2)
5157
if len(parts) >= 3:
5258
return {"repo": f"{parts[0]}/{parts[1]}", "path": parts[2]}
5359
elif len(parts) == 2:
5460
return {"repo": parts[0], "path": parts[1]}
55-
return {"repo": "", "path": entry}
61+
return {"repo": "", "path": s}
5662
return {"repo": "", "path": str(entry)}
5763

5864

0 commit comments

Comments
 (0)