|
32 | 32 | if str(_SCRIPT_DIR) not in sys.path: |
33 | 33 | sys.path.insert(0, str(_SCRIPT_DIR)) |
34 | 34 |
|
35 | | -from ccb_metrics import discover_runs, EvalReport, RunMetrics |
| 35 | +from ccb_metrics import discover_runs, collect_retrieval_data, EvalReport, RunMetrics |
36 | 36 | from ccb_metrics.task_selection import ( |
37 | 37 | load_selected_tasks, |
38 | 38 | build_task_index, |
@@ -515,6 +515,193 @@ def _build_swebench_partial(runs: list[RunMetrics]) -> Optional[tuple[list[str], |
515 | 515 | return headers, rows |
516 | 516 |
|
517 | 517 |
|
| 518 | +# --------------------------------------------------------------------------- |
| 519 | +# MCP Retrieval Performance tables |
| 520 | +# --------------------------------------------------------------------------- |
| 521 | + |
| 522 | +# Type alias: (benchmark, config_name, task_id) -> retrieval metrics dict |
| 523 | +_RetrievalData = dict[tuple[str, str, str], dict] |
| 524 | + |
| 525 | + |
| 526 | +def _has_retrieval_data(retrieval_data: _RetrievalData) -> bool: |
| 527 | + return bool(retrieval_data) |
| 528 | + |
| 529 | + |
def _build_retrieval_per_task(
    runs: list[RunMetrics],
    retrieval_data: _RetrievalData,
) -> Optional[tuple[list[str], list[list[str]]]]:
    """Table: per-task oracle coverage, time-to-first-hit, repos/orgs touched.

    Emits one row per task that has retrieval metrics, ordered by
    (benchmark, config) and then task id. Returns (headers, rows), or
    None when no task has any retrieval data.
    """
    table_rows: list[list[str]] = []
    for run in sorted(runs, key=lambda x: (x.benchmark, x.config_name)):
        for task in sorted(run.tasks, key=lambda x: x.task_id):
            metrics = retrieval_data.get((run.benchmark, run.config_name, task.task_id))
            if metrics is None:
                continue
            first_hit_ms = metrics.get("time_to_first_oracle_hit_ms")
            # Thousands-separated integer when present, "-" placeholder otherwise.
            first_hit_cell = f"{int(first_hit_ms):,}" if first_hit_ms is not None else "-"
            table_rows.append([
                run.benchmark,
                run.config_name,
                task.task_id,
                _fmt(metrics.get("oracle_coverage")),
                first_hit_cell,
                str(metrics.get("unique_repos_touched", 0)),
                str(metrics.get("unique_orgs_touched", 0)),
            ])

    if not table_rows:
        return None

    headers = [
        "Suite", "Config", "Task",
        "Oracle Coverage", "Time-to-First-Hit (ms)",
        "Repos Touched", "Orgs Touched",
    ]
    return headers, table_rows
| 563 | + |
| 564 | + |
def _build_retrieval_per_suite(
    runs: list[RunMetrics],
    retrieval_data: _RetrievalData,
) -> Optional[tuple[list[str], list[list[str]]]]:
    """Table: per-suite aggregate retrieval metrics.

    Groups each task's retrieval metrics under (benchmark, config_name)
    and reports per-group means. Returns (headers, rows), or None when
    there is nothing to aggregate.
    """
    grouped: dict[tuple[str, str], list[dict]] = {}
    for run in runs:
        for task in run.tasks:
            metrics = retrieval_data.get((run.benchmark, run.config_name, task.task_id))
            if metrics is not None:
                grouped.setdefault((run.benchmark, run.config_name), []).append(metrics)

    if not grouped:
        return None

    headers = [
        "Suite", "Config", "Tasks",
        "Mean Coverage", "Mean Repos Touched", "Mean Orgs Touched",
    ]
    rows = []
    # Group keys are unique, so sorting items() never compares the list values.
    for (suite, config), items in sorted(grouped.items()):
        rows.append([
            suite,
            config,
            str(len(items)),
            _fmt(_safe_mean([m.get("oracle_coverage") for m in items])),
            _fmt(_safe_mean([m.get("unique_repos_touched") for m in items]), 1),
            _fmt(_safe_mean([m.get("unique_orgs_touched") for m in items]), 1),
        ])
    return headers, rows
| 604 | + |
| 605 | + |
def _build_retrieval_comparison(
    runs: list[RunMetrics],
    retrieval_data: _RetrievalData,
) -> Optional[tuple[list[str], list[list[str]]]]:
    """Table: baseline vs MCP-Full oracle coverage comparison per task.

    Pairs every baseline config with every MCP-Full config per task and
    reports both coverages plus the delta. Returns None when either side
    has no matching configs or no rows result.
    """
    all_configs = sorted({run.config_name for run in runs})
    # Heuristic split: a baseline name mentions neither "mcp" nor
    # "sourcegraph"; an MCP-Full name contains the "_full" variant.
    baselines = [
        c for c in all_configs
        if "sourcegraph" not in c.lower() and "mcp" not in c.lower()
    ]
    full_configs = [
        c for c in all_configs
        if "sourcegraph_full" in c.lower() or "mcp_full" in c.lower()
    ]
    if not baselines or not full_configs:
        return None

    # Map (benchmark, task_id) -> {config_name: metrics} for joining sides.
    per_task: dict[tuple[str, str], dict[str, dict]] = {}
    for run in runs:
        for task in run.tasks:
            metrics = retrieval_data.get((run.benchmark, run.config_name, task.task_id))
            if metrics is None:
                continue
            per_task.setdefault((run.benchmark, task.task_id), {})[run.config_name] = metrics

    rows: list[list[str]] = []
    for (suite, task_id) in sorted(per_task):
        by_config = per_task[(suite, task_id)]
        for base_name in baselines:
            for full_name in full_configs:
                base = by_config.get(base_name)
                full = by_config.get(full_name)
                if base is None and full is None:
                    continue
                base_cov = base.get("oracle_coverage") if base else None
                full_cov = full.get("oracle_coverage") if full else None
                # Delta only when both sides reported a coverage value.
                if base_cov is not None and full_cov is not None:
                    delta = full_cov - base_cov
                else:
                    delta = None
                rows.append([
                    suite,
                    task_id,
                    _fmt(base_cov),
                    _fmt(full_cov),
                    _fmt(delta) if delta is not None else "-",
                    str(base.get("unique_orgs_touched", 0)) if base else "-",
                    str(full.get("unique_orgs_touched", 0)) if full else "-",
                ])

    if not rows:
        return None

    headers = [
        "Suite", "Task",
        "Baseline Coverage", "MCP-Full Coverage", "Delta",
        "Baseline Orgs", "MCP Orgs",
    ]
    return headers, rows
| 664 | + |
| 665 | + |
| 666 | +def _build_retrieval_tool_breakdown( |
| 667 | + runs: list[RunMetrics], |
| 668 | + retrieval_data: _RetrievalData, |
| 669 | +) -> Optional[tuple[list[str], list[list[str]]]]: |
| 670 | + """Table: which MCP tools drive oracle discovery, aggregated per suite.""" |
| 671 | + # Aggregate mcp_tool_counts across all tasks with retrieval data |
| 672 | + # Key: (benchmark, config_name, tool_name) -> total_calls |
| 673 | + tool_agg: dict[tuple[str, str, str], int] = {} |
| 674 | + found_any = False |
| 675 | + |
| 676 | + for r in runs: |
| 677 | + for t in r.tasks: |
| 678 | + key = (r.benchmark, r.config_name, t.task_id) |
| 679 | + m = retrieval_data.get(key) |
| 680 | + if m is None: |
| 681 | + continue |
| 682 | + mcp_counts = m.get("mcp_tool_counts") or {} |
| 683 | + for tool, count in mcp_counts.items(): |
| 684 | + found_any = True |
| 685 | + agg_key = (r.benchmark, r.config_name, tool) |
| 686 | + tool_agg[agg_key] = tool_agg.get(agg_key, 0) + count |
| 687 | + |
| 688 | + if not found_any: |
| 689 | + return None |
| 690 | + |
| 691 | + # Sort by (benchmark, config, count desc) |
| 692 | + sorted_items = sorted( |
| 693 | + tool_agg.items(), |
| 694 | + key=lambda x: (x[0][0], x[0][1], -x[1]), |
| 695 | + ) |
| 696 | + |
| 697 | + headers = ["Suite", "Config", "MCP Tool", "Total Calls"] |
| 698 | + rows = [ |
| 699 | + [bench, config, tool, str(count)] |
| 700 | + for (bench, config, tool), count in sorted_items |
| 701 | + ] |
| 702 | + return headers, rows |
| 703 | + |
| 704 | + |
518 | 705 | # --------------------------------------------------------------------------- |
519 | 706 | # Report generation |
520 | 707 | # --------------------------------------------------------------------------- |
@@ -585,6 +772,14 @@ def generate_report( |
585 | 772 | hc_path.write_text(json.dumps(harness_configs, indent=2) + "\n") |
586 | 773 | print(f"Written: {hc_path}") |
587 | 774 |
|
| 775 | + # Collect MCP retrieval data (backwards-compatible: empty dict if no files found) |
| 776 | + print(f"Collecting retrieval metrics from: {runs_dir}") |
| 777 | + retrieval_data = collect_retrieval_data(runs_dir) |
| 778 | + if retrieval_data: |
| 779 | + print(f"Found retrieval_metrics.json for {len(retrieval_data)} task(s).") |
| 780 | + else: |
| 781 | + print("No retrieval_metrics.json found — MCP Retrieval Performance section will be omitted.") |
| 782 | + |
588 | 783 | # Build all tables |
589 | 784 | tables: list[tuple[str, str, list[str], list[list[str]]]] = [] |
590 | 785 |
|
@@ -649,6 +844,28 @@ def generate_report( |
649 | 844 | h, r = mcp_corr |
650 | 845 | tables.append(("Performance by MCP Benefit Score", "mcp_benefit_correlation", h, r)) |
651 | 846 |
|
| 847 | + # MCP Retrieval Performance section (only when retrieval_metrics.json data exists) |
| 848 | + if _has_retrieval_data(retrieval_data): |
| 849 | + ret_per_task = _build_retrieval_per_task(runs, retrieval_data) |
| 850 | + if ret_per_task: |
| 851 | + h, r = ret_per_task |
| 852 | + tables.append(("MCP Retrieval Performance — Per Task", "retrieval_per_task", h, r)) |
| 853 | + |
| 854 | + ret_per_suite = _build_retrieval_per_suite(runs, retrieval_data) |
| 855 | + if ret_per_suite: |
| 856 | + h, r = ret_per_suite |
| 857 | + tables.append(("MCP Retrieval Performance — Per Suite", "retrieval_per_suite", h, r)) |
| 858 | + |
| 859 | + ret_cmp = _build_retrieval_comparison(runs, retrieval_data) |
| 860 | + if ret_cmp: |
| 861 | + h, r = ret_cmp |
| 862 | + tables.append(("MCP Retrieval Performance — Baseline vs MCP-Full", "retrieval_comparison", h, r)) |
| 863 | + |
| 864 | + ret_tools = _build_retrieval_tool_breakdown(runs, retrieval_data) |
| 865 | + if ret_tools: |
| 866 | + h, r = ret_tools |
| 867 | + tables.append(("MCP Retrieval Performance — Tool Discovery Breakdown", "retrieval_tool_breakdown", h, r)) |
| 868 | + |
652 | 869 | # Write REPORT.md |
653 | 870 | md_lines = [ |
654 | 871 | "# CodeContextBench Evaluation Report", |
|
0 commit comments