Skip to content

Commit 520cc1c

Browse files
sjarmak and claude committed
fix: improve _extract_files for SDLC ground truth formats
- Fix duplicate branch bug (both checked "files" key)
- Handle root_cause_files and dependency_chain keys
- Add docstring documenting all supported formats

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 96fd502 commit 520cc1c

File tree

1 file changed

+35
-16
lines changed

1 file changed

+35
-16
lines changed

scripts/cross_validate_oracles.py

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -251,37 +251,56 @@ def discover_comparison_pairs(
251251

252252

253253
def _extract_files(data: Dict[str, Any]) -> List[Dict]:
254-
"""Extract file list from various oracle/ground_truth formats."""
255-
# oracle_answer.json format
256-
if "files" in data and isinstance(data["files"], list):
257-
return data["files"]
254+
"""Extract file list from various oracle/ground_truth formats.
255+
256+
Handles:
257+
- oracle_answer.json: {"files": [{"repo": ..., "path": ...}]}
258+
- ground_truth.json (fix/feature/refactor): {"files": ["path", ...], "repo": "..."}
259+
- ground_truth.json (debug/design/understand): {"file_references": [...]}
260+
- ground_truth.json (root_cause_files / dependency_chain): {"root_cause_files": ["path", ...]} or {"dependency_chain": ["path", ...]}
261+
- ground_truth_agent.json: same as oracle_answer.json format
262+
"""
263+
result = []
258264

259-
# ground_truth.json format (ccb_fix/feature/refactor)
265+
# "files" key — could be dicts or plain strings
260266
if "files" in data and isinstance(data["files"], list):
261-
# Files might be plain strings
262-
result = []
263267
repo = data.get("repo", "")
264268
for f in data["files"]:
265-
if isinstance(f, str):
266-
result.append({"repo": repo, "path": f})
267-
elif isinstance(f, dict):
269+
if isinstance(f, dict):
268270
result.append(f)
269-
return result
271+
elif isinstance(f, str):
272+
result.append({"repo": repo, "path": f})
273+
if result:
274+
return result
270275

271-
# ground_truth.json format (ccb_debug/design/understand)
276+
# "file_references" key (ccb_debug/design/understand)
272277
if "file_references" in data:
273278
refs = data["file_references"]
274-
result = []
275279
for ref in refs:
276280
if isinstance(ref, dict):
277281
path = ref.get("file", ref.get("path", ""))
278282
if path:
279283
result.append({"repo": "", "path": path})
280284
elif isinstance(ref, str):
281285
result.append({"repo": "", "path": ref})
282-
return result
283-
284-
return []
286+
if result:
287+
return result
288+
289+
# Fallback keys "root_cause_files" / "dependency_chain" (ccb_fix) — reached only when "files" and "file_references" yielded nothing
290+
for key in ("root_cause_files", "dependency_chain"):
291+
items = data.get(key, [])
292+
if isinstance(items, list):
293+
repo = data.get("repo", "")
294+
for f in items:
295+
if isinstance(f, str):
296+
entry = {"repo": repo, "path": f}
297+
if entry not in result:
298+
result.append(entry)
299+
elif isinstance(f, dict):
300+
if f not in result:
301+
result.append(f)
302+
303+
return result
285304

286305

287306
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)