Add baseline-vs-MCP precision/recall/F1 IR tables

sjarmak · sjarmak · commit 0447be5f0e22 · 2026-03-03T21:17:03.000Z
diff --git a/docs/BLOG_POST.md b/docs/BLOG_POST.md
@@ -157,10 +157,10 @@ The refreshed retrieval pipeline run confirms moderate retrieval quality overall
 
 On the computable subset, aggregated baseline vs MCP retrieval metrics are:
 
-| Config Type | n | File Recall | MRR | MAP | Context Efficiency |
-|-------------|---|-------------|-----|-----|--------------------|
-| baseline | 132 | 0.330 | 0.346 | 0.231 | 0.184 |
-| mcp | 179 | 0.556 | 0.378 | 0.267 | 0.204 |
+| Config Type | n | File Recall | Precision@5 | Recall@5 | F1@5 | MRR |
+|-------------|---|-------------|-------------|----------|------|-----|
+| baseline | 132 | 0.330 | 0.212 | 0.237 | 0.185 | 0.346 |
+| mcp | 179 | 0.556 | 0.215 | 0.248 | 0.200 | 0.378 |
 
 But better retrieval doesn't always mean better outcomes. This is still under investigation, but most likely finding the right files is necessary but not sufficient. The agent still has to correctly apply what it finds, and the local code modification step is where removing local code availability from the MCP run environment hurts, more in some tasks than in others.
 
diff --git a/docs/analysis/analysis_refresh_tables_20260303.json b/docs/analysis/analysis_refresh_tables_20260303.json
@@ -440,14 +440,32 @@
       "file_recall": 0.3295,
       "mrr": 0.3462,
       "map_score": 0.2307,
-      "context_efficiency": 0.1843
+      "context_efficiency": 0.1843,
+      "precision@1": 0.2727,
+      "recall@1": 0.0769,
+      "f1@1": 0.1048,
+      "precision@5": 0.2121,
+      "recall@5": 0.2368,
+      "f1@5": 0.185,
+      "precision@10": 0.1424,
+      "recall@10": 0.2848,
+      "f1@10": 0.1579
     },
     "mcp": {
       "n": 179,
       "file_recall": 0.5558,
       "mrr": 0.3778,
       "map_score": 0.2667,
-      "context_efficiency": 0.2043
+      "context_efficiency": 0.2043,
+      "precision@1": 0.3073,
+      "recall@1": 0.1005,
+      "f1@1": 0.1309,
+      "precision@5": 0.2145,
+      "recall@5": 0.2476,
+      "f1@5": 0.2001,
+      "precision@10": 0.1419,
+      "recall@10": 0.3227,
+      "f1@10": 0.1724
     }
   },
   "ir_by_config": {
diff --git a/docs/analysis/analysis_set_metrics_20260303.json b/docs/analysis/analysis_set_metrics_20260303.json
@@ -440,14 +440,32 @@
       "file_recall": 0.3295,
       "mrr": 0.3462,
       "map_score": 0.2307,
-      "context_efficiency": 0.1843
+      "context_efficiency": 0.1843,
+      "precision@1": 0.2727,
+      "recall@1": 0.0769,
+      "f1@1": 0.1048,
+      "precision@5": 0.2121,
+      "recall@5": 0.2368,
+      "f1@5": 0.185,
+      "precision@10": 0.1424,
+      "recall@10": 0.2848,
+      "f1@10": 0.1579
     },
     "mcp": {
       "n": 179,
       "file_recall": 0.5558,
       "mrr": 0.3778,
       "map_score": 0.2667,
-      "context_efficiency": 0.2043
+      "context_efficiency": 0.2043,
+      "precision@1": 0.3073,
+      "recall@1": 0.1005,
+      "f1@1": 0.1309,
+      "precision@5": 0.2145,
+      "recall@5": 0.2476,
+      "f1@5": 0.2001,
+      "precision@10": 0.1419,
+      "recall@10": 0.3227,
+      "f1@10": 0.1724
     }
   },
   "ir_by_config": {
diff --git a/docs/technical_reports/TECHNICAL_REPORT_V2.md b/docs/technical_reports/TECHNICAL_REPORT_V2.md
@@ -942,10 +942,10 @@ This indicates retrieval quality remains moderate on computable tasks, but groun
 
 **IR aggregates by configuration type (baseline vs MCP):**
 
-| Config Type | n | File Recall | MRR | MAP | Context Efficiency |
-|-------------|---|-------------|-----|-----|--------------------|
-| baseline | 132 | 0.3295 | 0.3462 | 0.2307 | 0.1843 |
-| mcp | 179 | 0.5558 | 0.3778 | 0.2667 | 0.2043 |
+| Config Type | n | File Recall | Precision@5 | Recall@5 | F1@5 | MRR |
+|-------------|---|-------------|-------------|----------|------|-----|
+| baseline | 132 | 0.3295 | 0.2121 | 0.2368 | 0.1850 | 0.3462 |
+| mcp | 179 | 0.5558 | 0.2145 | 0.2476 | 0.2001 | 0.3778 |
 
 MCP runs show higher file recall and slightly higher Precision@5, Recall@5, F1@5, and MRR on computable retrieval tasks; context efficiency (0.2043 vs 0.1843) is also slightly higher.