Skip to content

Commit 076c9f3

Browse files
author
LoCoBench Bot
committed
Add TTFR metrics to task_metrics extraction and promotion flow
1 parent bab16b7 commit 076c9f3

File tree

4 files changed

+255
-0
lines changed

4 files changed

+255
-0
lines changed

.beads/issues.jsonl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
{"id":"CodeContextBench-33o","title":"US-011: Migrate results and remove score fallback","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T21:42:35.481643658Z","created_by":"LoCoBench Bot","updated_at":"2026-02-06T21:44:33.467729962Z","closed_at":"2026-02-06T21:44:33.467729962Z","close_reason":"Migrated 30 files, removed score fallback from generate_manifest.py"}
2020
{"id":"CodeContextBench-36d","title":"Fill LoCoBench baseline+SG_base gaps (5+7 tasks)","description":"LoCoBench has 20/25 baseline and 18/25 SG_base. Run locobench_3config.sh --baseline-only and --base-only to fill 5 baseline + 7 SG_base missing tasks.","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-08T02:54:22.875728306Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T00:50:42.365475558Z","closed_at":"2026-02-16T00:50:42.365475558Z","close_reason":"SG_base dropped from benchmark configs on 2026-02-15"}
2121
{"id":"CodeContextBench-3c9","title":"Archive 12 stale run batches","description":"QA audit M1: 12 stale batches (~325 results) sitting in runs/official/ that predate current verified results. Move to archive/ to reduce scan noise. Identify by checking timestamps against known good run dirs.","notes":"CORRECTED: 8 of 10 stale batches were restored from archive because they contained unique task results not present in newer batches. Only 3 batches correctly archived: pytorch_gapfill broken-verifier, linuxflbench incomplete run. The 8 restored: bigcode(2), k8s_docs(2), swebenchpro(2), sweperf(1), tac(1).","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:49:47.10922058Z","created_by":"LoCoBench Bot","updated_at":"2026-02-06T19:13:23.623047344Z","closed_at":"2026-02-06T18:00:02.35526543Z","close_reason":"Archived 10 stale batches (171 results) + 1 broken-verifier PyTorch gapfill batch. Total 11 batches moved to runs/official/archive/."}
22+
{"id":"CodeContextBench-3cj","title":"US-001 Add shared multi-harness contract document","status":"open","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:30:04.258764486Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:30:04.258764486Z"}
2223
{"id":"CodeContextBench-3e0","title":"Reclassify context window errors in TAC and CrossRepo","description":"QA audit C6: 5 context window errors misclassified as task failures (TAC find-in-codebase tasks, CrossRepo). The context_window_exceeded fingerprint exists in status_fingerprints.py but historical runs need reclassification in MANIFEST. May need to mark these tasks as infra-limited rather than agent-failed.","status":"closed","priority":3,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:20.850762526Z","created_by":"LoCoBench Bot","updated_at":"2026-02-06T18:12:29.617496653Z","closed_at":"2026-02-06T18:12:29.617496653Z","close_reason":"Investigation found NO context window exceeded errors in TAC or CrossRepo. Original C6 audit finding was misidentified. TAC find-in-codebase failures are RocketChat network unreachable (infra issue). CrossRepo failures are genuine task difficulty (0% solve rate). High token counts reflect heavy context usage, not window overflow."}
2324
{"id":"CodeContextBench-3j8","title":"Fix CrossRepo verifier: COPY expected_changes.json into Docker image","description":"CrossRepo test.sh references /tests/expected_changes.json but the Dockerfile never COPYs it into the image. All 4 tasks fail even when agent succeeds. Fix: add COPY tests/expected_changes.json /tests/ to each CrossRepo Dockerfile. Affects: api_upgrade_01, bug_localization_01, cross_file_reasoning_01, refactor_rename_01.","status":"closed","priority":0,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:49:33.859688777Z","created_by":"LoCoBench Bot","updated_at":"2026-02-06T15:24:45.679075805Z","closed_at":"2026-02-06T15:24:45.679075805Z","close_reason":"Already fixed: test.sh path corrected from /task/tests/ to /tests/ in commit 0483b714. Harbor uploads tests/ to /tests/ correctly. Old runs used wrong path. Need reruns, not code changes."}
2425
{"id":"CodeContextBench-3ls","title":"US-006b: Scaffold 3 arch understanding tasks (Tier B)","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T23:13:19.158531794Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T00:50:42.320127479Z","closed_at":"2026-02-16T00:50:42.320127479Z","close_reason":"US-006b complete: 3 Tier B arch tasks scaffolded (camel, flink, quantlib)"}

scripts/ccb_metrics/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,16 @@ class TaskMetrics:
9999
mcp_latency_p95_ms: Optional[float] = None
100100
context_window_peak_pct: Optional[float] = None
101101

102+
# Time-to-relevant/context metrics (requires ground truth files)
103+
ttfr: Optional[float] = None
104+
ttfr_step: Optional[int] = None
105+
tt_all_r: Optional[float] = None
106+
n_steps_to_first: Optional[int] = None
107+
tokens_before_first_relevant: Optional[int] = None
108+
cost_before_first_relevant: Optional[float] = None
109+
output_tokens_before_first_relevant: Optional[int] = None
110+
agent_time_to_first_relevant: Optional[float] = None
111+
102112
def to_dict(self) -> dict:
    """Serialize this metrics record to a plain dict via dataclasses.asdict."""
    payload = asdict(self)
    return payload
104114

scripts/extract_task_metrics.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@
2525
sys.path.insert(0, str(Path(__file__).resolve().parent))
2626

2727
from ccb_metrics.models import TaskMetrics
28+
from ccb_metrics.ground_truth import load_registry, TaskGroundTruth
29+
from ccb_metrics.ir_metrics import (
30+
extract_time_to_context,
31+
extract_cost_metrics_before_first_relevant,
32+
extract_agent_time_to_first_relevant,
33+
)
2834
from ccb_metrics.extractors import (
2935
extract_task_from_result_json,
3036
extract_task_tokens_from_transcript,
@@ -49,6 +55,9 @@
4955
)
5056
from ccb_metrics.task_selection import load_selected_tasks, build_task_index, enrich_task_metrics
5157

58+
GT_CACHE = Path(__file__).resolve().parent.parent / "configs" / "ground_truth_files.json"
59+
_GT_REGISTRY: dict[str, TaskGroundTruth] | None = None
60+
5261

5362
def _extract_task_id(dirname: str) -> str:
5463
"""Derive task_id from directory name (strip __hash suffix)."""
@@ -58,6 +67,39 @@ def _extract_task_id(dirname: str) -> str:
5867
return dirname
5968

6069

70+
def _load_ground_truth_registry() -> dict[str, TaskGroundTruth]:
    """Load and memoize the ground-truth registry from GT_CACHE.

    The registry is loaded at most once per process. When the cache file is
    absent or unreadable, an empty registry is cached instead, so extraction
    degrades gracefully rather than retrying (or failing) on every task.
    """
    global _GT_REGISTRY
    if _GT_REGISTRY is None:
        registry: dict[str, TaskGroundTruth] = {}
        if GT_CACHE.is_file():
            try:
                registry = load_registry(GT_CACHE)
            except (OSError, json.JSONDecodeError, TypeError, ValueError):
                # Corrupt/unreadable cache: treat it as missing rather than abort.
                pass
        _GT_REGISTRY = registry
    return _GT_REGISTRY
82+
83+
84+
def _lookup_ground_truth(task_id: str, registry: dict[str, TaskGroundTruth]) -> TaskGroundTruth | None:
85+
if not task_id:
86+
return None
87+
88+
# Candidate key variants seen across benchmark families.
89+
candidates = [task_id]
90+
if task_id.startswith("ccb_"):
91+
candidates.append(task_id[4:])
92+
for prefix in ("ccb_dibench-", "ccb_tac-", "ccb_largerepo-"):
93+
if task_id.startswith(prefix):
94+
candidates.append(task_id[len("ccb_"):])
95+
96+
for key in candidates:
97+
gt = registry.get(key)
98+
if gt is not None:
99+
return gt
100+
return None
101+
102+
61103
def process_task_dir(
62104
task_dir: Path,
63105
benchmark: str,
@@ -215,6 +257,35 @@ def process_task_dir(
215257
tm.mcp_latency_p50_ms = latency["mcp_latency_p50_ms"]
216258
tm.mcp_latency_p95_ms = latency["mcp_latency_p95_ms"]
217259

260+
# --- Time-to-relevant/context metrics (requires ground truth + transcript) ---
261+
gt_registry = _load_ground_truth_registry()
262+
gt = _lookup_ground_truth(tm.task_id, gt_registry)
263+
if gt is not None and getattr(gt, "files", None) and transcript_path.is_file():
264+
ttc = extract_time_to_context(
265+
trajectory_path=trajectory_path,
266+
transcript_path=transcript_path,
267+
ground_truth_files=gt.files,
268+
)
269+
if ttc:
270+
tm.ttfr = ttc.get("ttfr")
271+
tm.ttfr_step = ttc.get("ttfr_step")
272+
tm.tt_all_r = ttc.get("tt_all_r")
273+
tm.n_steps_to_first = ttc.get("n_steps_to_first")
274+
275+
cost_metrics = extract_cost_metrics_before_first_relevant(
276+
transcript_path=transcript_path,
277+
n_steps_to_first=tm.n_steps_to_first,
278+
)
279+
if cost_metrics:
280+
tm.tokens_before_first_relevant = cost_metrics.get("tokens_total")
281+
tm.output_tokens_before_first_relevant = cost_metrics.get("output_tokens")
282+
tm.cost_before_first_relevant = cost_metrics.get("cost_usd")
283+
284+
tm.agent_time_to_first_relevant = extract_agent_time_to_first_relevant(
285+
trajectory_path=trajectory_path,
286+
n_steps_to_first=tm.n_steps_to_first,
287+
)
288+
218289
return tm
219290

220291

scripts/promote_run.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,40 @@
3737
OFFICIAL_DIR = PROJECT_ROOT / "runs" / "official"
3838
VALIDATE_SCRIPT = PROJECT_ROOT / "scripts" / "validate_task_run.py"
3939
MANIFEST_SCRIPT = PROJECT_ROOT / "scripts" / "generate_manifest.py"
40+
EXTRACT_METRICS_SCRIPT = PROJECT_ROOT / "scripts" / "extract_task_metrics.py"
41+
SELECTED_TASKS_FILE = PROJECT_ROOT / "configs" / "selected_benchmark_tasks.json"
4042

4143
SKIP_PATTERNS = ["__broken_verifier", "validation_test", "archive", "__v1_hinted"]
4244
CONFIGS = ["baseline", "sourcegraph_base", "sourcegraph_full", "sourcegraph_isolated", "sourcegraph_only"]
4345

46+
DIR_PREFIX_TO_SUITE = {
47+
"bigcode_mcp_": "ccb_largerepo",
48+
"bigcode_sgcompare_": "ccb_largerepo",
49+
"codereview_": "ccb_codereview",
50+
"crossrepo_": "ccb_crossrepo",
51+
"dependeval_": "ccb_dependeval",
52+
"dibench_": "ccb_dibench",
53+
"docgen_": "ccb_docgen",
54+
"enterprise_": "ccb_enterprise",
55+
"governance_": "ccb_governance",
56+
"investigation_": "ccb_investigation",
57+
"k8s_docs_": "ccb_k8sdocs",
58+
"linuxflbench_": "ccb_linuxflbench",
59+
"locobench_": "ccb_locobench",
60+
"navprove_": "ccb_navprove",
61+
"nlqa_": "ccb_nlqa",
62+
"onboarding_": "ccb_onboarding",
63+
"paired_rerun_crossrepo_": "ccb_crossrepo",
64+
"paired_rerun_dibench_": "ccb_dibench",
65+
"paired_rerun_pytorch_": "ccb_pytorch",
66+
"pytorch_": "ccb_pytorch",
67+
"repoqa_": "ccb_repoqa",
68+
"security_": "ccb_security",
69+
"swebenchpro_": "ccb_swebenchpro",
70+
"sweperf_": "ccb_sweperf",
71+
"tac_": "ccb_tac",
72+
}
73+
4474
# ANSI colors
4575
GREEN = "\033[92m"
4676
YELLOW = "\033[93m"
@@ -94,6 +124,120 @@ def find_task_dirs(config_path: Path) -> list[Path]:
94124
return task_dirs
95125

96126

127+
def suite_from_task_id(task_id: str) -> str | None:
    """Infer suite from task_id when run name isn't enough.

    Applies an ordered rule table; the first matching rule wins, mirroring
    the priority of the original if/elif chain (order matters because some
    patterns can overlap). Returns None when no rule matches.
    """
    # Each rule: (match_kind, patterns, suite).
    rules = (
        ("prefix", ("instance_",), "ccb_swebenchpro"),
        ("prefix", ("sgt-",), "ccb_pytorch"),
        ("prefix", ("big-code-",), "ccb_largerepo"),
        ("prefix", ("dibench-",), "ccb_dibench"),
        ("prefix", ("cr-",), "ccb_codereview"),
        ("suffix", ("-doc-001",), "ccb_k8sdocs"),
        ("prefix", ("lfl-",), "ccb_linuxflbench"),
        ("prefix", ("bug_localization_", "refactor_rename_", "cross_file_reasoning_"), "ccb_crossrepo"),
        ("contains", ("_expert_",), "ccb_locobench"),
        ("prefix", ("multifile_editing-", "file_span_fix-", "dependency_recognition-"), "ccb_dependeval"),
        ("prefix", ("repoqa-",), "ccb_repoqa"),
        ("prefix", ("sweperf-", "sweperf_", "django_perf_"), "ccb_sweperf"),
        ("prefix", ("tac-", "simple_test_", "api_upgrade_", "hyperloglog", "write-unit-test"), "ccb_tac"),
        ("contains", ("answer_extraction", "function_recall", "question_answer"), "ccb_repoqa"),
        ("prefix", ("onboard-",), "ccb_onboarding"),
        ("prefix", ("gov-",), "ccb_governance"),
        ("prefix", ("sec-",), "ccb_security"),
        ("prefix", ("ent-", "dep-", "multi-team-", "polyglot-"), "ccb_enterprise"),
    )
    for kind, patterns, suite in rules:
        if kind == "prefix" and task_id.startswith(patterns):
            return suite
        if kind == "suffix" and task_id.endswith(patterns):
            return suite
        if kind == "contains" and any(pat in task_id for pat in patterns):
            return suite
    return None
166+
167+
168+
def suite_from_run_name(run_name: str) -> str | None:
    """Map a staging run-directory name to its suite via DIR_PREFIX_TO_SUITE.

    Returns the suite for the first prefix that matches, or None when the
    run name matches no known prefix.
    """
    matches = (
        suite
        for prefix, suite in DIR_PREFIX_TO_SUITE.items()
        if run_name.startswith(prefix)
    )
    return next(matches, None)
173+
174+
175+
def benchmark_for_task(run_name: str, task_dir: Path) -> str | None:
    """Resolve the benchmark suite for a task directory.

    Prefers the run-name prefix mapping; falls back to task-id heuristics on
    the directory name (with any trailing "__hash" suffix removed). Returns
    None when neither strategy yields a suite.
    """
    task_name = task_dir.name.rsplit("__", 1)[0]
    # `or` mirrors the original truthiness check on the run-name result.
    return suite_from_run_name(run_name) or suite_from_task_id(task_name)
181+
182+
183+
def extract_missing_task_metrics(run_dir: Path, *, execute: bool) -> tuple[int, int, list[str]]:
    """Generate missing task_metrics.json files for a run.

    Scans every known config directory under *run_dir* for task dirs that
    have a result.json but no task_metrics.json. In dry-run mode
    (execute=False) the missing files are only counted; otherwise the
    extractor script is invoked once per task, and per-task failures are
    collected rather than raised so one bad task cannot abort promotion.

    Args:
        run_dir: The staging run directory being promoted.
        execute: When True, actually run the extractor; otherwise count only.

    Returns:
        (missing_count, generated_count, errors)
    """
    missing_count = 0
    generated_count = 0
    errors: list[str] = []

    for config_name in CONFIGS:
        config_path = run_dir / config_name
        if not config_path.is_dir():
            continue

        for task_dir in find_task_dirs(config_path):
            result_json = task_dir / "result.json"
            metrics_json = task_dir / "task_metrics.json"
            # Only tasks with a result but no metrics need extraction.
            if not result_json.is_file() or metrics_json.is_file():
                continue

            missing_count += 1
            if not execute:
                continue

            benchmark = benchmark_for_task(run_dir.name, task_dir)
            if benchmark is None:
                errors.append(f"{config_name}/{task_dir.name}: could not infer benchmark")
                continue

            cmd = [
                sys.executable,
                str(EXTRACT_METRICS_SCRIPT),
                "--task-dir",
                str(task_dir),
                "--benchmark",
                benchmark,
                "--config",
                config_name,
            ]
            if SELECTED_TASKS_FILE.is_file():
                cmd.extend(["--selected-tasks", str(SELECTED_TASKS_FILE)])

            try:
                proc = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=120,
                )
            except subprocess.TimeoutExpired:
                # BUGFIX: a hung extractor previously raised TimeoutExpired out
                # of the whole promotion flow; record it per-task instead.
                errors.append(f"{config_name}/{task_dir.name}: extraction timed out after 120s")
                continue
            except OSError as exc:
                # e.g. missing interpreter/script; keep scanning other tasks.
                errors.append(f"{config_name}/{task_dir.name}: failed to launch extractor: {exc}")
                continue

            if proc.returncode == 0 and metrics_json.is_file():
                generated_count += 1
            else:
                detail = proc.stderr.strip() or proc.stdout.strip() or f"exit {proc.returncode}"
                errors.append(f"{config_name}/{task_dir.name}: {detail[:240]}")

    return missing_count, generated_count, errors
239+
240+
97241
def validate_run(run_dir: Path) -> ValidationResult:
98242
"""Validate a staging run and return results."""
99243
result = ValidationResult(run_name=run_dir.name)
@@ -336,6 +480,35 @@ def cmd_promote(
336480
skipped.append(run_dir.name)
337481
continue
338482

483+
missing_metrics, generated_metrics, metric_errors = extract_missing_task_metrics(
484+
run_dir, execute=execute
485+
)
486+
487+
if execute:
488+
if missing_metrics == 0:
489+
print(f"\n {GREEN}Metrics:{RESET} all task_metrics.json files already present.")
490+
else:
491+
print(
492+
f"\n Metrics extraction: {generated_metrics}/{missing_metrics} generated"
493+
)
494+
if metric_errors:
495+
print(f" {RED}Extraction errors:{RESET} {len(metric_errors)}")
496+
for err in metric_errors[:5]:
497+
print(f" - {err}")
498+
if len(metric_errors) > 5:
499+
print(f" ... {len(metric_errors) - 5} more")
500+
if not force:
501+
print(
502+
f"\n {RED}BLOCKED:{RESET} Failed to generate all task_metrics.json files."
503+
)
504+
print(" Use --force to bypass.")
505+
skipped.append(run_dir.name)
506+
continue
507+
elif missing_metrics > 0:
508+
print(
509+
f"\n {BOLD}DRY RUN:{RESET} would generate {missing_metrics} missing task_metrics.json file(s) before move."
510+
)
511+
339512
if execute:
340513
print(f"\n Promoting: {run_dir.name}")
341514
shutil.move(str(run_dir), str(official_dest))

0 commit comments

Comments
 (0)