Commit 38092ed

sjarmak and claude committed

feat: US-020 - Report generator integration for retrieval metrics

- Add collect_retrieval_data() to ccb_metrics/discovery.py: walks the runs dir and collects retrieval_metrics.json files from task output directories
- Export collect_retrieval_data from ccb_metrics/__init__.py
- Add 4 MCP Retrieval Performance table builders to generate_eval_report.py: per-task coverage/timing, per-suite aggregates, baseline vs MCP-Full comparison, and MCP tool discovery breakdown
- Backwards-compatible: section omitted when no retrieval_metrics.json found
- All py_compile checks pass; repo health gate passes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

1 parent b758c58 commit 38092ed

File tree

5 files changed: +313 −3 lines


ralph-mcp-unique/prd.json

Lines changed: 1 addition & 1 deletion
@@ -603,7 +603,7 @@
         "python3 scripts/generate_eval_report.py runs without errors"
       ],
       "priority": 20,
-      "passes": false,
+      "passes": true,
       "notes": "The report should clearly show that baseline has lower oracle_coverage on mcp_only repos."
     },
     {

ralph-mcp-unique/progress.txt

Lines changed: 27 additions & 0 deletions
@@ -513,3 +513,30 @@
 - DIR_PREFIX_TO_SUITE run dir prefix: `ccb_mcp_crossrepo_tracing_` (with trailing underscore) maps to suite name `ccb_mcp_crossrepo_tracing`
 - mcp_benefit_score assigned by difficulty: easy=0.70, medium=0.80-0.85, hard=0.90-0.95 (DS variants = 0.95)
 ---
+[2026-02-20 21:39:55 UTC] Iteration 10 no story markers found
+[2026-02-20 21:39:55 UTC] Iteration 10 complete
+[2026-02-20 21:39:57 UTC] Iteration 11 started
+
+## 2026-02-20 - US-020: Report generator integration for retrieval metrics
+- Extended `scripts/generate_eval_report.py` to include 'MCP Retrieval Performance' section
+- Added `collect_retrieval_data(runs_dir)` to `scripts/ccb_metrics/discovery.py` — walks same dir structure as `discover_runs()`, collects `retrieval_metrics.json` from each task output directory
+- Exported `collect_retrieval_data` from `scripts/ccb_metrics/__init__.py`
+- Added 4 new table builder functions to `generate_eval_report.py`:
+  - `_build_retrieval_per_task()` — oracle_coverage, time-to-first-hit, repos/orgs per task
+  - `_build_retrieval_per_suite()` — mean coverage/repos per suite aggregate
+  - `_build_retrieval_comparison()` — baseline vs MCP-Full delta per task
+  - `_build_retrieval_tool_breakdown()` — which MCP tools drive discovery, aggregated per suite
+- Section is backwards-compatible: omitted when no `retrieval_metrics.json` files found
+- `generate_report()` collects retrieval data and passes it to the table builders
+- All 4 table builders return None when no data — they are guarded by `if _has_retrieval_data(retrieval_data)`
+- Quality checks: `python3 -m py_compile` passes for all changed files
+- `python3 scripts/generate_eval_report.py --help` runs without errors
+- Repo health check: PASSED
+
+- Files changed: `scripts/ccb_metrics/discovery.py`, `scripts/ccb_metrics/__init__.py`, `scripts/generate_eval_report.py`, `ralph-mcp-unique/prd.json`
+- **Learnings for future iterations:**
+  - `collect_retrieval_data` uses same skip/dedup patterns as `discover_runs` — latest batch wins
+  - Retrieval comparison table uses heuristic: baseline = config without "sourcegraph"/"mcp" in name; mcp = config with "sourcegraph_full"/"mcp_full"
+  - Table builders are None-returning optional functions — consistent with other optional tables (swebench_partial, search_patterns, etc.)
+  - Collecting retrieval data is a separate pass over the runs dir (not integrated into task discovery) to avoid schema changes to TaskMetrics
+---
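For reference, the fields the new tables read from each `retrieval_metrics.json` can be sketched as a sample payload. This is a hedged reconstruction: the field names come from this diff's table builders, while the values and tool names (`sg_search`, `sg_read_file`) are invented for illustration.

```python
import json

# Hypothetical retrieval_metrics.json payload. Field names match what the
# new table builders read; values and tool names are made up.
sample = {
    "oracle_coverage": 0.75,               # fraction of oracle files discovered
    "time_to_first_oracle_hit_ms": 12345,  # may be absent/None if never hit
    "unique_repos_touched": 3,
    "unique_orgs_touched": 2,
    "mcp_tool_counts": {"sg_search": 7, "sg_read_file": 4},
}

# Round-trip through JSON, as the collector would when reading from disk
parsed = json.loads(json.dumps(sample, indent=2))
print(parsed["oracle_coverage"])
```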

scripts/ccb_metrics/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,7 @@
 """CCB Metrics — data models and extractors for CodeContextBench evaluation."""
 
 from .models import TaskMetrics, RunMetrics, EvalReport
-from .discovery import discover_runs
+from .discovery import discover_runs, collect_retrieval_data
 from .extractors import extract_run_config
 from .task_selection import (
     load_selected_tasks,
@@ -15,6 +15,7 @@
     "RunMetrics",
     "EvalReport",
     "discover_runs",
+    "collect_retrieval_data",
     "extract_run_config",
     "load_selected_tasks",
     "build_task_index",

scripts/ccb_metrics/discovery.py

Lines changed: 65 additions & 0 deletions
@@ -405,3 +405,68 @@ def discover_runs(runs_dir: str | Path) -> list[RunMetrics]:
         results.append(run)
 
     return results
+
+
+def collect_retrieval_data(
+    runs_dir: str | Path,
+) -> dict[tuple[str, str, str], dict]:
+    """Collect retrieval_metrics.json files from all task output directories.
+
+    Walks the same directory structure as :func:`discover_runs` and collects
+    ``retrieval_metrics.json`` files written by
+    ``scripts/ccb_metrics/retrieval.py``.
+
+    Args:
+        runs_dir: Path to the runs/official/ (or staging) directory.
+
+    Returns:
+        Dict mapping ``(benchmark, config_name, task_id)`` to the parsed
+        retrieval metrics dict. Empty dict if no files are found.
+        When the same task appears in multiple batch directories, the latest
+        batch's data is kept (same dedup policy as :func:`discover_runs`).
+    """
+    runs_dir = Path(runs_dir)
+    result: dict[tuple[str, str, str], dict] = {}
+
+    if not runs_dir.is_dir():
+        return result
+
+    _SKIP_PATTERNS = (
+        "archive", "__broken", "__duplicate", "__all_errored", "__partial", "__integrated"
+    )
+
+    for run_dir in sorted(runs_dir.iterdir()):
+        if not run_dir.is_dir():
+            continue
+        run_name = run_dir.name
+        if any(pat in run_name for pat in _SKIP_PATTERNS):
+            continue
+        benchmark = normalize_benchmark_name(_infer_benchmark(run_name))
+
+        for config_dir in sorted(run_dir.iterdir()):
+            if not config_dir.is_dir():
+                continue
+            config_name = config_dir.name
+
+            for batch_dir in sorted(config_dir.iterdir()):
+                if not batch_dir.is_dir() or not _is_batch_dir(batch_dir):
+                    continue
+
+                for task_dir in sorted(batch_dir.iterdir()):
+                    if not _is_task_dir(task_dir):
+                        continue
+
+                    ret_path = task_dir / "retrieval_metrics.json"
+                    if not ret_path.is_file():
+                        continue
+
+                    task_id = _extract_task_id(task_dir.name)
+                    try:
+                        data = json.loads(ret_path.read_text())
+                    except (OSError, json.JSONDecodeError):
+                        continue
+
+                    # Latest batch wins (same dedup policy as discover_runs)
+                    result[(benchmark, config_name, task_id)] = data
+
+    return result
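The walk-and-dedup behavior can be exercised standalone. The sketch below is a simplified stand-in, not the real function: it omits the benchmark/config inference and the `_is_batch_dir`/`_is_task_dir` helpers (which are not shown in this diff), but it demonstrates the "latest batch wins" policy that `sorted()` plus dict overwrite produces.

```python
import json
import tempfile
from pathlib import Path


def collect_latest(runs_dir: Path) -> dict[str, dict]:
    """Simplified sketch of the collect_retrieval_data walk: batch dirs are
    visited in sorted order, so a later batch overwrites an earlier one for
    the same task name ("latest batch wins")."""
    result: dict[str, dict] = {}
    for batch_dir in sorted(runs_dir.iterdir()):  # sorted => latest batch last
        if not batch_dir.is_dir():
            continue
        for task_dir in sorted(batch_dir.iterdir()):
            ret_path = task_dir / "retrieval_metrics.json"
            if not ret_path.is_file():
                continue
            try:
                result[task_dir.name] = json.loads(ret_path.read_text())
            except (OSError, json.JSONDecodeError):
                continue  # unreadable file: skip, same as the real collector
    return result


# Build a tiny runs tree: the same task appears in two batches.
with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    for batch, cov in [("batch_01", 0.4), ("batch_02", 0.9)]:
        task = root / batch / "task_a"
        task.mkdir(parents=True)
        (task / "retrieval_metrics.json").write_text(
            json.dumps({"oracle_coverage": cov})
        )
    data = collect_latest(root)

print(data["task_a"]["oracle_coverage"])  # batch_02 wins: 0.9
```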

scripts/generate_eval_report.py

Lines changed: 218 additions & 1 deletion
@@ -32,7 +32,7 @@
 if str(_SCRIPT_DIR) not in sys.path:
     sys.path.insert(0, str(_SCRIPT_DIR))
 
-from ccb_metrics import discover_runs, EvalReport, RunMetrics
+from ccb_metrics import discover_runs, collect_retrieval_data, EvalReport, RunMetrics
 from ccb_metrics.task_selection import (
     load_selected_tasks,
     build_task_index,
@@ -515,6 +515,193 @@ def _build_swebench_partial(runs: list[RunMetrics]) -> Optional[tuple[list[str],
     return headers, rows
 
 
+# ---------------------------------------------------------------------------
+# MCP Retrieval Performance tables
+# ---------------------------------------------------------------------------
+
+# Type alias: (benchmark, config_name, task_id) -> retrieval metrics dict
+_RetrievalData = dict[tuple[str, str, str], dict]
+
+
+def _has_retrieval_data(retrieval_data: _RetrievalData) -> bool:
+    return bool(retrieval_data)
+
+
+def _build_retrieval_per_task(
+    runs: list[RunMetrics],
+    retrieval_data: _RetrievalData,
+) -> Optional[tuple[list[str], list[list[str]]]]:
+    """Table: per-task oracle coverage, time-to-first-hit, repos/orgs touched."""
+    # Collect rows for any task that has retrieval data
+    rows = []
+    for r in sorted(runs, key=lambda x: (x.benchmark, x.config_name)):
+        for t in sorted(r.tasks, key=lambda x: x.task_id):
+            key = (r.benchmark, r.config_name, t.task_id)
+            m = retrieval_data.get(key)
+            if m is None:
+                continue
+            ttfh = m.get("time_to_first_oracle_hit_ms")
+            rows.append([
+                r.benchmark,
+                r.config_name,
+                t.task_id,
+                _fmt(m.get("oracle_coverage")),
+                f"{int(ttfh):,}" if ttfh is not None else "-",
+                str(m.get("unique_repos_touched", 0)),
+                str(m.get("unique_orgs_touched", 0)),
+            ])
+
+    if not rows:
+        return None
+
+    headers = [
+        "Suite", "Config", "Task",
+        "Oracle Coverage", "Time-to-First-Hit (ms)",
+        "Repos Touched", "Orgs Touched",
+    ]
+    return headers, rows
+
+
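`_fmt` is defined elsewhere in `generate_eval_report.py` and is not part of this diff. A plausible stand-in (an assumption, not the repo's actual helper), together with the thousands-separator expression the per-task builder uses for time-to-first-hit, behaves like this:

```python
def fmt(value, digits: int = 2) -> str:
    """Hypothetical stand-in for the report's _fmt helper: fixed-decimal
    formatting, with "-" for missing values."""
    if value is None:
        return "-"
    return f"{value:.{digits}f}"


def fmt_ms(ttfh) -> str:
    # Same expression the per-task builder uses for time-to-first-hit
    return f"{int(ttfh):,}" if ttfh is not None else "-"


print(fmt(0.756))     # 0.76
print(fmt(None))      # -
print(fmt_ms(12345))  # 12,345
```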
+def _build_retrieval_per_suite(
+    runs: list[RunMetrics],
+    retrieval_data: _RetrievalData,
+) -> Optional[tuple[list[str], list[list[str]]]]:
+    """Table: per-suite aggregate retrieval metrics."""
+    # Group by (benchmark, config_name)
+    agg: dict[tuple[str, str], list[dict]] = {}
+    for r in runs:
+        for t in r.tasks:
+            key = (r.benchmark, r.config_name, t.task_id)
+            m = retrieval_data.get(key)
+            if m is None:
+                continue
+            gkey = (r.benchmark, r.config_name)
+            agg.setdefault(gkey, []).append(m)
+
+    if not agg:
+        return None
+
+    headers = [
+        "Suite", "Config", "Tasks",
+        "Mean Coverage", "Mean Repos Touched", "Mean Orgs Touched",
+    ]
+    rows = []
+    for (bench, config) in sorted(agg.keys()):
+        items = agg[(bench, config)]
+        n = len(items)
+        mean_cov = _safe_mean([m.get("oracle_coverage") for m in items])
+        mean_repos = _safe_mean([m.get("unique_repos_touched") for m in items])
+        mean_orgs = _safe_mean([m.get("unique_orgs_touched") for m in items])
+        rows.append([
+            bench,
+            config,
+            str(n),
+            _fmt(mean_cov),
+            _fmt(mean_repos, 1),
+            _fmt(mean_orgs, 1),
+        ])
+    return headers, rows
+
+
+def _build_retrieval_comparison(
+    runs: list[RunMetrics],
+    retrieval_data: _RetrievalData,
+) -> Optional[tuple[list[str], list[list[str]]]]:
+    """Table: baseline vs MCP-Full oracle coverage comparison per task."""
+    # Identify baseline and mcp configs
+    configs = sorted({r.config_name for r in runs})
+    # Heuristic: baseline has no "mcp" or "sourcegraph" in name; sg_full has "sourcegraph_full"
+    baseline_configs = [c for c in configs if "sourcegraph" not in c.lower() and "mcp" not in c.lower()]
+    mcp_configs = [c for c in configs if "sourcegraph_full" in c.lower() or "mcp_full" in c.lower()]
+
+    if not baseline_configs or not mcp_configs:
+        return None
+
+    # Build (benchmark, task_id) -> {config -> metrics} lookup
+    lookup: dict[tuple[str, str], dict[str, dict]] = {}
+    for r in runs:
+        for t in r.tasks:
+            key = (r.benchmark, r.config_name, t.task_id)
+            m = retrieval_data.get(key)
+            if m is None:
+                continue
+            task_key = (r.benchmark, t.task_id)
+            lookup.setdefault(task_key, {})[r.config_name] = m
+
+    rows = []
+    for (bench, task_id) in sorted(lookup.keys()):
+        cmap = lookup[(bench, task_id)]
+        for bl_config in baseline_configs:
+            for mcp_config in mcp_configs:
+                bl = cmap.get(bl_config)
+                mcp = cmap.get(mcp_config)
+                if bl is None and mcp is None:
+                    continue
+                bl_cov = bl.get("oracle_coverage") if bl else None
+                mcp_cov = mcp.get("oracle_coverage") if mcp else None
+                delta = (mcp_cov - bl_cov) if (bl_cov is not None and mcp_cov is not None) else None
+                bl_orgs = str(bl.get("unique_orgs_touched", 0)) if bl else "-"
+                mcp_orgs = str(mcp.get("unique_orgs_touched", 0)) if mcp else "-"
+                rows.append([
+                    bench,
+                    task_id,
+                    _fmt(bl_cov),
+                    _fmt(mcp_cov),
+                    _fmt(delta) if delta is not None else "-",
+                    bl_orgs,
+                    mcp_orgs,
+                ])
+
+    if not rows:
+        return None
+
+    headers = [
+        "Suite", "Task",
+        "Baseline Coverage", "MCP-Full Coverage", "Delta",
+        "Baseline Orgs", "MCP Orgs",
+    ]
+    return headers, rows
+
+
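The baseline/MCP pairing heuristic from the comparison builder can be isolated in a small sketch. The config names used here are invented for illustration; only the substring logic comes from the diff.

```python
def classify_configs(configs: list[str]) -> tuple[list[str], list[str]]:
    """Split run configs with the comparison table's heuristic:
    baseline = no "sourcegraph"/"mcp" anywhere in the name,
    mcp      = name contains "sourcegraph_full" or "mcp_full"."""
    baseline = [
        c for c in configs
        if "sourcegraph" not in c.lower() and "mcp" not in c.lower()
    ]
    mcp = [
        c for c in configs
        if "sourcegraph_full" in c.lower() or "mcp_full" in c.lower()
    ]
    return baseline, mcp


# Hypothetical config names: note "sourcegraph_search_only" is neither
# baseline nor MCP-Full under this heuristic, so it is excluded.
baseline, mcp = classify_configs(
    ["claude_baseline", "sourcegraph_full", "sourcegraph_search_only"]
)
print(baseline)  # ['claude_baseline']
print(mcp)       # ['sourcegraph_full']
```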
+def _build_retrieval_tool_breakdown(
+    runs: list[RunMetrics],
+    retrieval_data: _RetrievalData,
+) -> Optional[tuple[list[str], list[list[str]]]]:
+    """Table: which MCP tools drive oracle discovery, aggregated per suite."""
+    # Aggregate mcp_tool_counts across all tasks with retrieval data
+    # Key: (benchmark, config_name, tool_name) -> total_calls
+    tool_agg: dict[tuple[str, str, str], int] = {}
+    found_any = False
+
+    for r in runs:
+        for t in r.tasks:
+            key = (r.benchmark, r.config_name, t.task_id)
+            m = retrieval_data.get(key)
+            if m is None:
+                continue
+            mcp_counts = m.get("mcp_tool_counts") or {}
+            for tool, count in mcp_counts.items():
+                found_any = True
+                agg_key = (r.benchmark, r.config_name, tool)
+                tool_agg[agg_key] = tool_agg.get(agg_key, 0) + count
+
+    if not found_any:
+        return None
+
+    # Sort by (benchmark, config, count desc)
+    sorted_items = sorted(
+        tool_agg.items(),
+        key=lambda x: (x[0][0], x[0][1], -x[1]),
+    )
+
+    headers = ["Suite", "Config", "MCP Tool", "Total Calls"]
+    rows = [
+        [bench, config, tool, str(count)]
+        for (bench, config, tool), count in sorted_items
+    ]
+    return headers, rows
+
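The tool-breakdown sort key (suite and config ascending, call count descending) is worth seeing in isolation. A minimal sketch with invented suite, config, and tool names:

```python
# Hypothetical aggregated counts: (benchmark, config, tool) -> total_calls
tool_agg = {
    ("suite_a", "mcp_full", "sg_search"): 3,
    ("suite_a", "mcp_full", "sg_read_file"): 9,
    ("suite_b", "mcp_full", "sg_search"): 5,
}

# Same key as the builder: ascending (benchmark, config), descending count
sorted_items = sorted(tool_agg.items(), key=lambda x: (x[0][0], x[0][1], -x[1]))

rows = [[b, c, t, str(n)] for (b, c, t), n in sorted_items]
print(rows[0])  # ['suite_a', 'mcp_full', 'sg_read_file', '9']
```

Negating the count inside the tuple avoids a second sort pass with `reverse=True`, which would also flip the suite/config ordering.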
 # ---------------------------------------------------------------------------
 # Report generation
 # ---------------------------------------------------------------------------
@@ -585,6 +772,14 @@ def generate_report(
     hc_path.write_text(json.dumps(harness_configs, indent=2) + "\n")
     print(f"Written: {hc_path}")
 
+    # Collect MCP retrieval data (backwards-compatible: empty dict if no files found)
+    print(f"Collecting retrieval metrics from: {runs_dir}")
+    retrieval_data = collect_retrieval_data(runs_dir)
+    if retrieval_data:
+        print(f"Found retrieval_metrics.json for {len(retrieval_data)} task(s).")
+    else:
+        print("No retrieval_metrics.json found — MCP Retrieval Performance section will be omitted.")
+
     # Build all tables
     tables: list[tuple[str, str, list[str], list[list[str]]]] = []
@@ -649,6 +844,28 @@ def generate_report(
         h, r = mcp_corr
         tables.append(("Performance by MCP Benefit Score", "mcp_benefit_correlation", h, r))
 
+    # MCP Retrieval Performance section (only when retrieval_metrics.json data exists)
+    if _has_retrieval_data(retrieval_data):
+        ret_per_task = _build_retrieval_per_task(runs, retrieval_data)
+        if ret_per_task:
+            h, r = ret_per_task
+            tables.append(("MCP Retrieval Performance — Per Task", "retrieval_per_task", h, r))
+
+        ret_per_suite = _build_retrieval_per_suite(runs, retrieval_data)
+        if ret_per_suite:
+            h, r = ret_per_suite
+            tables.append(("MCP Retrieval Performance — Per Suite", "retrieval_per_suite", h, r))
+
+        ret_cmp = _build_retrieval_comparison(runs, retrieval_data)
+        if ret_cmp:
+            h, r = ret_cmp
+            tables.append(("MCP Retrieval Performance — Baseline vs MCP-Full", "retrieval_comparison", h, r))
+
+        ret_tools = _build_retrieval_tool_breakdown(runs, retrieval_data)
+        if ret_tools:
+            h, r = ret_tools
+            tables.append(("MCP Retrieval Performance — Tool Discovery Breakdown", "retrieval_tool_breakdown", h, r))
+
     # Write REPORT.md
     md_lines = [
         "# CodeContextBench Evaluation Report",
