Skip to content

Commit 076c9f3

Browse files
author
LoCoBench Bot
committed
Add TTFR metrics to task_metrics extraction and promotion flow
1 parent bab16b7 commit 076c9f3

File tree

4 files changed

+255
-0
lines changed

4 files changed

+255
-0
lines changed

.beads/issues.jsonl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
{"id":"CodeContextBench-33o","title":"US-011: Migrate results and remove score fallback","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T21:42:35.481643658Z","created_by":"LoCoBench Bot","updated_at":"2026-02-06T21:44:33.467729962Z","closed_at":"2026-02-06T21:44:33.467729962Z","close_reason":"Migrated 30 files, removed score fallback from generate_manifest.py"}
2020
{"id":"CodeContextBench-36d","title":"Fill LoCoBench baseline+SG_base gaps (5+7 tasks)","description":"LoCoBench has 20/25 baseline and 18/25 SG_base. Run locobench_3config.sh --baseline-only and --base-only to fill 5 baseline + 7 SG_base missing tasks.","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-08T02:54:22.875728306Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T00:50:42.365475558Z","closed_at":"2026-02-16T00:50:42.365475558Z","close_reason":"SG_base dropped from benchmark configs on 2026-02-15"}
2121
{"id":"CodeContextBench-3c9","title":"Archive 12 stale run batches","description":"QA audit M1: 12 stale batches (~325 results) sitting in runs/official/ that predate current verified results. Move to archive/ to reduce scan noise. Identify by checking timestamps against known good run dirs.","notes":"CORRECTED: 8 of 10 stale batches were restored from archive because they contained unique task results not present in newer batches. Only 3 batches correctly archived: pytorch_gapfill broken-verifier, linuxflbench incomplete run. The 8 restored: bigcode(2), k8s_docs(2), swebenchpro(2), sweperf(1), tac(1).","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:49:47.10922058Z","created_by":"LoCoBench Bot","updated_at":"2026-02-06T19:13:23.623047344Z","closed_at":"2026-02-06T18:00:02.35526543Z","close_reason":"Archived 10 stale batches (171 results) + 1 broken-verifier PyTorch gapfill batch. Total 11 batches moved to runs/official/archive/."}
22+
{"id":"CodeContextBench-3cj","title":"US-001 Add shared multi-harness contract document","status":"open","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:30:04.258764486Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:30:04.258764486Z"}
2223
{"id":"CodeContextBench-3e0","title":"Reclassify context window errors in TAC and CrossRepo","description":"QA audit C6: 5 context window errors misclassified as task failures (TAC find-in-codebase tasks, CrossRepo). The context_window_exceeded fingerprint exists in status_fingerprints.py but historical runs need reclassification in MANIFEST. May need to mark these tasks as infra-limited rather than agent-failed.","status":"closed","priority":3,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:20.850762526Z","created_by":"LoCoBench Bot","updated_at":"2026-02-06T18:12:29.617496653Z","closed_at":"2026-02-06T18:12:29.617496653Z","close_reason":"Investigation found NO context window exceeded errors in TAC or CrossRepo. Original C6 audit finding was misidentified. TAC find-in-codebase failures are RocketChat network unreachable (infra issue). CrossRepo failures are genuine task difficulty (0% solve rate). High token counts reflect heavy context usage, not window overflow."}
2324
{"id":"CodeContextBench-3j8","title":"Fix CrossRepo verifier: COPY expected_changes.json into Docker image","description":"CrossRepo test.sh references /tests/expected_changes.json but the Dockerfile never COPYs it into the image. All 4 tasks fail even when agent succeeds. Fix: add COPY tests/expected_changes.json /tests/ to each CrossRepo Dockerfile. Affects: api_upgrade_01, bug_localization_01, cross_file_reasoning_01, refactor_rename_01.","status":"closed","priority":0,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:49:33.859688777Z","created_by":"LoCoBench Bot","updated_at":"2026-02-06T15:24:45.679075805Z","closed_at":"2026-02-06T15:24:45.679075805Z","close_reason":"Already fixed: test.sh path corrected from /task/tests/ to /tests/ in commit 0483b714. Harbor uploads tests/ to /tests/ correctly. Old runs used wrong path. Need reruns, not code changes."}
2425
{"id":"CodeContextBench-3ls","title":"US-006b: Scaffold 3 arch understanding tasks (Tier B)","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T23:13:19.158531794Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T00:50:42.320127479Z","closed_at":"2026-02-16T00:50:42.320127479Z","close_reason":"US-006b complete: 3 Tier B arch tasks scaffolded (camel, flink, quantlib)"}

scripts/ccb_metrics/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,16 @@ class TaskMetrics:
9999
mcp_latency_p95_ms: Optional[float] = None
100100
context_window_peak_pct: Optional[float] = None
101101

102+
# Time-to-relevant/context metrics (requires ground truth files)
103+
ttfr: Optional[float] = None
104+
ttfr_step: Optional[int] = None
105+
tt_all_r: Optional[float] = None
106+
n_steps_to_first: Optional[int] = None
107+
tokens_before_first_relevant: Optional[int] = None
108+
cost_before_first_relevant: Optional[float] = None
109+
output_tokens_before_first_relevant: Optional[int] = None
110+
agent_time_to_first_relevant: Optional[float] = None
111+
102112
def to_dict(self) -> dict:
    """Serialize this metrics record to a plain dict via dataclasses.asdict."""
    payload = asdict(self)
    return payload
104114

scripts/extract_task_metrics.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@
2525
sys.path.insert(0, str(Path(__file__).resolve().parent))
2626

2727
from ccb_metrics.models import TaskMetrics
28+
from ccb_metrics.ground_truth import load_registry, TaskGroundTruth
29+
from ccb_metrics.ir_metrics import (
30+
extract_time_to_context,
31+
extract_cost_metrics_before_first_relevant,
32+
extract_agent_time_to_first_relevant,
33+
)
2834
from ccb_metrics.extractors import (
2935
extract_task_from_result_json,
3036
extract_task_tokens_from_transcript,
@@ -49,6 +55,9 @@
4955
)
5056
from ccb_metrics.task_selection import load_selected_tasks, build_task_index, enrich_task_metrics
5157

58+
GT_CACHE = Path(__file__).resolve().parent.parent / "configs" / "ground_truth_files.json"
59+
_GT_REGISTRY: dict[str, TaskGroundTruth] | None = None
60+
5261

5362
def _extract_task_id(dirname: str) -> str:
5463
"""Derive task_id from directory name (strip __hash suffix)."""
@@ -58,6 +67,39 @@ def _extract_task_id(dirname: str) -> str:
5867
return dirname
5968

6069

70+
def _load_ground_truth_registry() -> dict[str, TaskGroundTruth]:
    """Load and memoize the ground-truth registry from GT_CACHE.

    The registry is loaded at most once per process. When the cache file is
    absent or unreadable, an empty registry is cached instead, so extraction
    degrades gracefully rather than retrying (or failing) on every task.
    """
    global _GT_REGISTRY
    if _GT_REGISTRY is None:
        registry: dict[str, TaskGroundTruth] = {}
        if GT_CACHE.is_file():
            try:
                registry = load_registry(GT_CACHE)
            except (OSError, json.JSONDecodeError, TypeError, ValueError):
                # Corrupt/unreadable cache: treat it as missing rather than abort.
                pass
        _GT_REGISTRY = registry
    return _GT_REGISTRY
82+
83+
84+
def _lookup_ground_truth(task_id: str, registry: dict[str, TaskGroundTruth]) -> TaskGroundTruth | None:
85+
if not task_id:
86+
return None
87+
88+
# Candidate key variants seen across benchmark families.
89+
candidates = [task_id]
90+
if task_id.startswith("ccb_"):
91+
candidates.append(task_id[4:])
92+
for prefix in ("ccb_dibench-", "ccb_tac-", "ccb_largerepo-"):
93+
if task_id.startswith(prefix):
94+
candidates.append(task_id[len("ccb_"):])
95+
96+
for key in candidates:
97+
gt = registry.get(key)
98+
if gt is not None:
99+
return gt
100+
return None
101+
102+
61103
def process_task_dir(
62104
task_dir: Path,
63105
benchmark: str,
@@ -215,6 +257,35 @@ def process_task_dir(
215257
tm.mcp_latency_p50_ms = latency["mcp_latency_p50_ms"]
216258
tm.mcp_latency_p95_ms = latency["mcp_latency_p95_ms"]
217259

260+
# --- Time-to-relevant/context metrics (requires ground truth + transcript) ---
261+
gt_registry = _load_ground_truth_registry()
262+
gt = _lookup_ground_truth(tm.task_id, gt_registry)
263+
if gt is not None and getattr(gt, "files", None) and transcript_path.is_file():
264+
ttc = extract_time_to_context(
265+
trajectory_path=trajectory_path,
266+
transcript_path=transcript_path,
267+
ground_truth_files=gt.files,
268+
)
269+
if ttc:
270+
tm.ttfr = ttc.get("ttfr")
271+
tm.ttfr_step = ttc.get("ttfr_step")
272+
tm.tt_all_r = ttc.get("tt_all_r")
273+
tm.n_steps_to_first = ttc.get("n_steps_to_first")
274+
275+
cost_metrics = extract_cost_metrics_before_first_relevant(
276+
transcript_path=transcript_path,
277+
n_steps_to_first=tm.n_steps_to_first,
278+
)
279+
if cost_metrics:
280+
tm.tokens_before_first_relevant = cost_metrics.get("tokens_total")
281+
tm.output_tokens_before_first_relevant = cost_metrics.get("output_tokens")
282+
tm.cost_before_first_relevant = cost_metrics.get("cost_usd")
283+
284+
tm.agent_time_to_first_relevant = extract_agent_time_to_first_relevant(
285+
trajectory_path=trajectory_path,
286+
n_steps_to_first=tm.n_steps_to_first,
287+
)
288+
218289
return tm
219290

220291

scripts/promote_run.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,40 @@
3737
OFFICIAL_DIR = PROJECT_ROOT / "runs" / "official"
3838
VALIDATE_SCRIPT = PROJECT_ROOT / "scripts" / "validate_task_run.py"
3939
MANIFEST_SCRIPT = PROJECT_ROOT / "scripts" / "generate_manifest.py"
40+
EXTRACT_METRICS_SCRIPT = PROJECT_ROOT / "scripts" / "extract_task_metrics.py"
41+
SELECTED_TASKS_FILE = PROJECT_ROOT / "configs" / "selected_benchmark_tasks.json"
4042

4143
SKIP_PATTERNS = ["__broken_verifier", "validation_test", "archive", "__v1_hinted"]
4244
CONFIGS = ["baseline", "sourcegraph_base", "sourcegraph_full", "sourcegraph_isolated", "sourcegraph_only"]
4345

46+
DIR_PREFIX_TO_SUITE = {
47+
"bigcode_mcp_": "ccb_largerepo",
48+
"bigcode_sgcompare_": "ccb_largerepo",
49+
"codereview_": "ccb_codereview",
50+
"crossrepo_": "ccb_crossrepo",
51+
"dependeval_": "ccb_dependeval",
52+
"dibench_": "ccb_dibench",
53+
"docgen_": "ccb_docgen",
54+
"enterprise_": "ccb_enterprise",
55+
"governance_": "ccb_governance",
56+
"investigation_": "ccb_investigation",
57+
"k8s_docs_": "ccb_k8sdocs",
58+
"linuxflbench_": "ccb_linuxflbench",
59+
"locobench_": "ccb_locobench",
60+
"navprove_": "ccb_navprove",
61+
"nlqa_": "ccb_nlqa",
62+
"onboarding_": "ccb_onboarding",
63+
"paired_rerun_crossrepo_": "ccb_crossrepo",
64+
"paired_rerun_dibench_": "ccb_dibench",
65+
"paired_rerun_pytorch_": "ccb_pytorch",
66+
"pytorch_": "ccb_pytorch",
67+
"repoqa_": "ccb_repoqa",
68+
"security_": "ccb_security",
69+
"swebenchpro_": "ccb_swebenchpro",
70+
"sweperf_": "ccb_sweperf",
71+
"tac_": "ccb_tac",
72+
}
73+
4474
# ANSI colors
4575
GREEN = "\033[92m"
4676
YELLOW = "\033[93m"
@@ -94,6 +124,120 @@ def find_task_dirs(config_path: Path) -> list[Path]:
94124
return task_dirs
95125

96126

127+
def suite_from_task_id(task_id: str) -> str | None:
    """Infer suite from task_id when run name isn't enough.

    Applies an ordered rule table; the first matching rule wins, mirroring
    the priority of the original if/elif chain (order matters because some
    patterns can overlap). Returns None when no rule matches.
    """
    # Each rule: (match_kind, patterns, suite).
    rules = (
        ("prefix", ("instance_",), "ccb_swebenchpro"),
        ("prefix", ("sgt-",), "ccb_pytorch"),
        ("prefix", ("big-code-",), "ccb_largerepo"),
        ("prefix", ("dibench-",), "ccb_dibench"),
        ("prefix", ("cr-",), "ccb_codereview"),
        ("suffix", ("-doc-001",), "ccb_k8sdocs"),
        ("prefix", ("lfl-",), "ccb_linuxflbench"),
        ("prefix", ("bug_localization_", "refactor_rename_", "cross_file_reasoning_"), "ccb_crossrepo"),
        ("contains", ("_expert_",), "ccb_locobench"),
        ("prefix", ("multifile_editing-", "file_span_fix-", "dependency_recognition-"), "ccb_dependeval"),
        ("prefix", ("repoqa-",), "ccb_repoqa"),
        ("prefix", ("sweperf-", "sweperf_", "django_perf_"), "ccb_sweperf"),
        ("prefix", ("tac-", "simple_test_", "api_upgrade_", "hyperloglog", "write-unit-test"), "ccb_tac"),
        ("contains", ("answer_extraction", "function_recall", "question_answer"), "ccb_repoqa"),
        ("prefix", ("onboard-",), "ccb_onboarding"),
        ("prefix", ("gov-",), "ccb_governance"),
        ("prefix", ("sec-",), "ccb_security"),
        ("prefix", ("ent-", "dep-", "multi-team-", "polyglot-"), "ccb_enterprise"),
    )
    for kind, patterns, suite in rules:
        if kind == "prefix" and task_id.startswith(patterns):
            return suite
        if kind == "suffix" and task_id.endswith(patterns):
            return suite
        if kind == "contains" and any(pat in task_id for pat in patterns):
            return suite
    return None
166+
167+
168+
def suite_from_run_name(run_name: str) -> str | None:
    """Map a staging run-directory name to its suite via DIR_PREFIX_TO_SUITE.

    Returns the suite for the first prefix that matches, or None when the
    run name matches no known prefix.
    """
    matches = (
        suite
        for prefix, suite in DIR_PREFIX_TO_SUITE.items()
        if run_name.startswith(prefix)
    )
    return next(matches, None)
173+
174+
175+
def benchmark_for_task(run_name: str, task_dir: Path) -> str | None:
    """Resolve the benchmark suite for a task directory.

    Prefers the run-name prefix mapping; falls back to task-id heuristics on
    the directory name (with any trailing "__hash" suffix removed). Returns
    None when neither strategy yields a suite.
    """
    task_name = task_dir.name.rsplit("__", 1)[0]
    # `or` mirrors the original truthiness check on the run-name result.
    return suite_from_run_name(run_name) or suite_from_task_id(task_name)
181+
182+
183+
def extract_missing_task_metrics(run_dir: Path, *, execute: bool) -> tuple[int, int, list[str]]:
    """Generate missing task_metrics.json files for a run.

    Scans every known config directory under *run_dir* for task dirs that
    have a result.json but no task_metrics.json. In dry-run mode
    (execute=False) the missing files are only counted; otherwise the
    extractor script is invoked once per task, and per-task failures are
    collected rather than raised so one bad task cannot abort promotion.

    Args:
        run_dir: The staging run directory being promoted.
        execute: When True, actually run the extractor; otherwise count only.

    Returns:
        (missing_count, generated_count, errors)
    """
    missing_count = 0
    generated_count = 0
    errors: list[str] = []

    for config_name in CONFIGS:
        config_path = run_dir / config_name
        if not config_path.is_dir():
            continue

        for task_dir in find_task_dirs(config_path):
            result_json = task_dir / "result.json"
            metrics_json = task_dir / "task_metrics.json"
            # Only tasks with a result but no metrics need extraction.
            if not result_json.is_file() or metrics_json.is_file():
                continue

            missing_count += 1
            if not execute:
                continue

            benchmark = benchmark_for_task(run_dir.name, task_dir)
            if benchmark is None:
                errors.append(f"{config_name}/{task_dir.name}: could not infer benchmark")
                continue

            cmd = [
                sys.executable,
                str(EXTRACT_METRICS_SCRIPT),
                "--task-dir",
                str(task_dir),
                "--benchmark",
                benchmark,
                "--config",
                config_name,
            ]
            if SELECTED_TASKS_FILE.is_file():
                cmd.extend(["--selected-tasks", str(SELECTED_TASKS_FILE)])

            try:
                proc = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=120,
                )
            except subprocess.TimeoutExpired:
                # BUGFIX: a hung extractor previously raised TimeoutExpired out
                # of the whole promotion flow; record it per-task instead.
                errors.append(f"{config_name}/{task_dir.name}: extraction timed out after 120s")
                continue
            except OSError as exc:
                # e.g. missing interpreter/script; keep scanning other tasks.
                errors.append(f"{config_name}/{task_dir.name}: failed to launch extractor: {exc}")
                continue

            if proc.returncode == 0 and metrics_json.is_file():
                generated_count += 1
            else:
                detail = proc.stderr.strip() or proc.stdout.strip() or f"exit {proc.returncode}"
                errors.append(f"{config_name}/{task_dir.name}: {detail[:240]}")

    return missing_count, generated_count, errors
239+
240+
97241
def validate_run(run_dir: Path) -> ValidationResult:
98242
"""Validate a staging run and return results."""
99243
result = ValidationResult(run_name=run_dir.name)
@@ -336,6 +480,35 @@ def cmd_promote(
336480
skipped.append(run_dir.name)
337481
continue
338482

483+
missing_metrics, generated_metrics, metric_errors = extract_missing_task_metrics(
484+
run_dir, execute=execute
485+
)
486+
487+
if execute:
488+
if missing_metrics == 0:
489+
print(f"\n {GREEN}Metrics:{RESET} all task_metrics.json files already present.")
490+
else:
491+
print(
492+
f"\n Metrics extraction: {generated_metrics}/{missing_metrics} generated"
493+
)
494+
if metric_errors:
495+
print(f" {RED}Extraction errors:{RESET} {len(metric_errors)}")
496+
for err in metric_errors[:5]:
497+
print(f" - {err}")
498+
if len(metric_errors) > 5:
499+
print(f" ... {len(metric_errors) - 5} more")
500+
if not force:
501+
print(
502+
f"\n {RED}BLOCKED:{RESET} Failed to generate all task_metrics.json files."
503+
)
504+
print(" Use --force to bypass.")
505+
skipped.append(run_dir.name)
506+
continue
507+
elif missing_metrics > 0:
508+
print(
509+
f"\n {BOLD}DRY RUN:{RESET} would generate {missing_metrics} missing task_metrics.json file(s) before move."
510+
)
511+
339512
if execute:
340513
print(f"\n Promoting: {run_dir.name}")
341514
shutil.move(str(run_dir), str(official_dest))

0 commit comments

Comments
 (0)