fix: oracle provenance check + add oracle IR analysis scripts

sjarmak · claude · sjarmak · commit d08f984e93fb · 2026-02-21T16:24:58.000Z
- Fix oracle_checks.py provenance: include structured JSON (chain, files)
  in answer_text so citations in structured data are found by substring
  matching. Fixes dep-trace-004 MCP false negative (0.875 → 1.0).
  Applied to all 12 MCP-unique task oracle_checks.py files.

- Add scripts/oracle_ir_analysis.py: oracle-based IR metrics (MRR, nDCG,
  MAP, P@K, R@K, context efficiency) for MCP-unique tasks. Handles
  Harbor JSONL transcript format, sg_list_files repo-from-input fallback,
  baseline suffix matching for steps_to_first_relevant.

- Add scripts/analyze_mcp_unique_haiku.py: comprehensive trace analysis
  (reward comparison, token efficiency, MCP tool distribution).

- Add scripts/oracle_retrieval_analysis.py: oracle-based retrieval quality
  per check type with root cause analysis.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/benchmarks/ccb_mcp_crossorg/ccx-crossorg-061/tests/oracle_checks.py b/benchmarks/ccb_mcp_crossorg/ccx-crossorg-061/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_crossorg/ccx-crossorg-066/tests/oracle_checks.py b/benchmarks/ccb_mcp_crossorg/ccx-crossorg-066/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_crossrepo_tracing/ccx-config-trace-010/tests/oracle_checks.py b/benchmarks/ccb_mcp_crossrepo_tracing/ccx-config-trace-010/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-001/tests/oracle_checks.py b/benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-001/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-004/tests/oracle_checks.py b/benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-004/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_incident/ccx-incident-031/tests/oracle_checks.py b/benchmarks/ccb_mcp_incident/ccx-incident-031/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_onboarding/ccx-explore-042-ds/tests/oracle_checks.py b/benchmarks/ccb_mcp_onboarding/ccx-explore-042-ds/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_onboarding/ccx-onboard-041/tests/oracle_checks.py b/benchmarks/ccb_mcp_onboarding/ccx-onboard-041/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_onboarding/ccx-onboard-050-ds/tests/oracle_checks.py b/benchmarks/ccb_mcp_onboarding/ccx-onboard-050-ds/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_platform/ccx-explore-091-ds/tests/oracle_checks.py b/benchmarks/ccb_mcp_platform/ccx-explore-091-ds/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_security/ccx-vuln-remed-011/tests/oracle_checks.py b/benchmarks/ccb_mcp_security/ccx-vuln-remed-011/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/benchmarks/ccb_mcp_security/ccx-vuln-remed-014/tests/oracle_checks.py b/benchmarks/ccb_mcp_security/ccx-vuln-remed-014/tests/oracle_checks.py
@@ -400,10 +400,16 @@ def run_all_checks(
     oracle = spec.get("artifacts", {}).get("oracle", {})
     eval_checks = spec.get("evaluation", {}).get("checks", [])
 
-    # If answer is a dict with "text" key, extract the text for text-based checks
+    # If answer is a dict with "text" key, extract the text for text-based checks.
+    # Also include the full JSON serialization so that provenance citations in
+    # structured fields (e.g. chain[].repo) are found by substring matching.
+    # This prevents penalizing agents that correctly cite repos in structured
+    # data but use natural language (e.g. "Loki") in the narrative text.
     answer_text = ""
     if isinstance(answer_data, dict):
-        answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
+        narrative = answer_data.get("text", answer_data.get("answer", ""))
+        full_json = json.dumps(answer_data)
+        answer_text = f"{narrative}\n{full_json}" if narrative else full_json
     elif isinstance(answer_data, str):
         answer_text = answer_data
 
diff --git a/scripts/analyze_mcp_unique_haiku.py b/scripts/analyze_mcp_unique_haiku.py
diff --git a/scripts/oracle_ir_analysis.py b/scripts/oracle_ir_analysis.py
diff --git a/scripts/oracle_retrieval_analysis.py b/scripts/oracle_retrieval_analysis.py