|
35 | 35 | Status, |
36 | 36 | get_criteria_for_suite, |
37 | 37 | ) |
| 38 | +from prompt_hygiene import audit_paths |
38 | 39 |
|
39 | 40 |
|
40 | 41 | # --------------------------------------------------------------------------- |
@@ -290,53 +291,36 @@ def check_t4_git_sha(tasks: list[Path]) -> CriterionResult: |
290 | 291 |
|
291 | 292 |
|
def check_t5_no_solution_leak(tasks: list[Path]) -> CriterionResult:
    """T.5: instructions do not leak code locations, solution strategies, or verifier details.

    Gathers ``instruction.md`` and ``instruction_mcp.md`` from each task
    directory (when present) and hands the paths to ``audit_paths``.

    Args:
        tasks: Task directories to inspect.

    Returns:
        A PASS result when the audit report's ``"files"`` list is empty.
        Otherwise a WARN result whose evidence names up to ten files together
        with the kinds of finding in each, and whose ``details`` is the full
        audit report.
    """
    candidates = (
        task_dir / name
        for task_dir in tasks
        for name in ("instruction.md", "instruction_mcp.md")
    )
    prompt_paths: list[Path] = [path for path in candidates if path.is_file()]

    report = audit_paths(prompt_paths)
    # NOTE(review): assumes report["files"] only lists files that have
    # findings, since an empty list is treated as a pass -- confirm against
    # prompt_hygiene.audit_paths.
    files = report["files"]
    if not files:
        return CriterionResult(
            criterion_id="T.5", status=Status.PASS,
            evidence="No prompt-hygiene findings across instruction.md or instruction_mcp.md",
        )

    # Human-readable names for known issue types; unknown types are shown
    # verbatim.
    issue_labels = {
        "code_location_hint": "code-location hints",
        "solution_leakage": "solution leakage",
        "scoring_leakage": "verifier leakage",
    }
    evidence_lines = []
    for file_entry in files[:10]:  # Keep the evidence summary short.
        file_path = Path(file_entry["file"])
        try:
            shown_path = file_path.relative_to(PROJECT_ROOT)
        except ValueError:
            # The audited file is not under PROJECT_ROOT (for example a
            # relative or external task directory); report the path as given
            # rather than aborting the whole check.
            shown_path = file_path
        kinds = sorted({issue["type"] for issue in file_entry["issues"]})
        labels = ", ".join(issue_labels.get(kind, kind) for kind in kinds)
        evidence_lines.append(f"{shown_path}: {labels}")

    return CriterionResult(
        criterion_id="T.5", status=Status.WARN,
        evidence="\n".join(evidence_lines),
        remediation="Remove code-location guidance, prescribed fix steps, and verifier/scoring details from prompts.",
        details=report,
    )
341 | 325 |
|
342 | 326 |
|
@@ -747,12 +731,12 @@ def check_r2_no_contamination(tasks: list[Path]) -> CriterionResult: |
747 | 731 | if not issues: |
748 | 732 | return CriterionResult( |
749 | 733 | criterion_id="R.2", status=Status.PASS, |
750 | | - evidence="No MCP/Sourcegraph tool guidance in baseline instructions", |
| 734 | + evidence="No MCP/Sourcegraph tool guidance in baseline instruction.md files", |
751 | 735 | ) |
752 | 736 | return CriterionResult( |
753 | 737 | criterion_id="R.2", status=Status.FAIL, |
754 | 738 | evidence="\n".join(issues[:10]), |
755 | | - remediation="Remove MCP/Sourcegraph tool guidance from baseline instructions", |
| 739 | + remediation="Remove MCP/Sourcegraph tool guidance from baseline instruction.md files and keep MCP-specific instructions in runtime injection or instruction_mcp.md only.", |
756 | 740 | details={"issue_count": len(issues), "issues": issues[:20]}, |
757 | 741 | ) |
758 | 742 |
|
|
0 commit comments