Skip to content

Commit d08f984

Browse files
sjarmak and claude committed
fix: oracle provenance check + add oracle IR analysis scripts
- Fix oracle_checks.py provenance: include structured JSON (chain, files) in answer_text so citations in structured data are found by substring matching. Fixes dep-trace-004 MCP false negative (0.875 → 1.0). Applied to all 12 MCP-unique task oracle_checks.py files.
- Add scripts/oracle_ir_analysis.py: oracle-based IR metrics (MRR, nDCG, MAP, P@K, R@K, context efficiency) for MCP-unique tasks. Handles Harbor JSONL transcript format, sg_list_files repo-from-input fallback, baseline suffix matching for steps_to_first_relevant.
- Add scripts/analyze_mcp_unique_haiku.py: comprehensive trace analysis (reward comparison, token efficiency, MCP tool distribution).
- Add scripts/oracle_retrieval_analysis.py: oracle-based retrieval quality per check type with root cause analysis.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2fef9e2 commit d08f984

File tree

15 files changed

+2719
-24
lines changed

15 files changed

+2719
-24
lines changed

benchmarks/ccb_mcp_crossorg/ccx-crossorg-061/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

benchmarks/ccb_mcp_crossorg/ccx-crossorg-066/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

benchmarks/ccb_mcp_crossrepo_tracing/ccx-config-trace-010/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-001/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-004/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

benchmarks/ccb_mcp_incident/ccx-incident-031/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

benchmarks/ccb_mcp_onboarding/ccx-explore-042-ds/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

benchmarks/ccb_mcp_onboarding/ccx-onboard-041/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

benchmarks/ccb_mcp_onboarding/ccx-onboard-050-ds/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

benchmarks/ccb_mcp_platform/ccx-explore-091-ds/tests/oracle_checks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,16 @@ def run_all_checks(
400400
oracle = spec.get("artifacts", {}).get("oracle", {})
401401
eval_checks = spec.get("evaluation", {}).get("checks", [])
402402

403-
# If answer is a dict with "text" key, extract the text for text-based checks
403+
# If answer is a dict with "text" key, extract the text for text-based checks.
404+
# Also include the full JSON serialization so that provenance citations in
405+
# structured fields (e.g. chain[].repo) are found by substring matching.
406+
# This prevents penalizing agents that correctly cite repos in structured
407+
# data but use natural language (e.g. "Loki") in the narrative text.
404408
answer_text = ""
405409
if isinstance(answer_data, dict):
406-
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
410+
narrative = answer_data.get("text", answer_data.get("answer", ""))
411+
full_json = json.dumps(answer_data)
412+
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
407413
elif isinstance(answer_data, str):
408414
answer_text = answer_data
409415

0 commit comments

Comments
 (0)