sourcegraph
diff --git a/configs/repo_health.json b/configs/repo_health.json
Lines changed: 6 additions & 0 deletions
diff --git a/docs/START_HERE_BY_TASK.md b/docs/START_HERE_BY_TASK.md
Lines changed: 2 additions & 0 deletions
diff --git a/docs/ops/QA_PROCESS.md b/docs/ops/QA_PROCESS.md
Lines changed: 14 additions & 0 deletions
diff --git a/docs/ops/SCRIPT_INDEX.md b/docs/ops/SCRIPT_INDEX.md
Lines changed: 1 addition & 0 deletions
diff --git a/docs/ops/WORKFLOWS.md b/docs/ops/WORKFLOWS.md
Lines changed: 3 additions & 2 deletions
diff --git a/docs/ops/task_routes.json b/docs/ops/task_routes.json
Lines changed: 2 additions & 0 deletions
diff --git a/scripts/abc_audit.py b/scripts/abc_audit.py
Lines changed: 26 additions & 42 deletions
diff --git a/scripts/abc_criteria.py b/scripts/abc_criteria.py
Lines changed: 4 additions & 4 deletions
diff --git a/scripts/generate_csb_org_tasks.py b/scripts/generate_csb_org_tasks.py
Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,12 @@
       "required": true,
       "description": "Task definitions valid (instruction length, test.sh, no placeholders)"
     },
+    "prompt_hygiene": {
+      "script": "scripts/prompt_hygiene.py",
+      "args": ["--git-modified", "--include-mcp", "--fail-on-findings"],
+      "required": true,
+      "description": "Modified prompt files must not leak code locations, solution strategies, or verifier details"
+    },
     "selection_file": {
       "script": null,
       "required": true,
 
@@ -53,6 +53,7 @@ python3 scripts/mcp_audit.py --run <run_dir>
 ### Key Commands
 ```bash
 python3 scripts/validate_task_run.py --run <run_dir>
+python3 scripts/prompt_hygiene.py --scan-root benchmarks/<suite_or_task_dir> --include-mcp
 python3 scripts/rerun_failed.py --help
 ```
 
@@ -101,5 +102,6 @@ python3 scripts/package_submission.py --help
 ### Key Commands
 ```bash
 python3 scripts/validate_tasks_preflight.py --task <task_dir> --smoke-runtime
+python3 scripts/prompt_hygiene.py --scan-root <task_dir> --include-mcp
 python3 scripts/sync_task_metadata.py --help
 ```
@@ -35,6 +35,7 @@ Runs before any benchmark execution to catch task definition errors that would w
 - **Language/difficulty mismatch** -- Cross-references `task.toml` fields against `selected_benchmark_tasks.json`
 - **Missing test scripts** -- Verifies `tests/test.sh` is present and executable
 - **Missing tasks** -- Detects tasks in the selection registry that have no corresponding benchmark directory
+- **Prompt hygiene on modified prompts** -- `instruction.md` and `instruction_mcp.md` files changed in the current worktree must not leak code locations, solution strategies, or verifier details
 
 ### Runtime Smoke (No Agent)
 
@@ -92,6 +93,9 @@ python3 scripts/validate_tasks_preflight.py --suite csb_sdlc_feature
 # Validate a single task
 python3 scripts/validate_tasks_preflight.py --task benchmarks/csb_sdlc_feature/envoy-grpc-server-impl-001
 
+# Audit modified prompt files for hygiene issues
+python3 scripts/prompt_hygiene.py --git-modified --include-mcp --fail-on-findings
+
 # Runtime smoke for a single task (no agent)
 python3 scripts/validate_tasks_preflight.py --task benchmarks/csb_sdlc_understand/terraform-plan-pipeline-qa-001 --smoke-runtime
 
@@ -236,6 +240,16 @@ Periodic full audits use a 6-dimension framework to ensure benchmark integrity:
 
 Checks whether `instruction.md` files contain references to MCP tools or Sourcegraph that would leak context into baseline (no-tool) runs. Any MCP-specific instructions should be injected at runtime via the agent harness, not baked into the task definition.
 
+Prompt-hygiene review is broader than MCP contamination. For both `instruction.md` and `instruction_mcp.md`, also investigate:
+- code-location hints such as exact source paths, directories to inspect, or named files/classes to open first
+- solution leakage such as prescribed helper names, replacement APIs, or step-by-step implementation plans
+- verifier leakage such as ground-truth diff counts, scoring formulas, or closed-world oracle wording
+
+Recommended investigation command:
+```bash
+python3 scripts/prompt_hygiene.py --scan-root benchmarks/<suite_or_task_dir> --include-mcp
+```
+
 ### Dimension 2: Reproducibility
 
 Verifies that task environments produce deterministic results:
 
@@ -242,6 +242,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
 - `scripts/promote_agent_oracles.py` - Utility script for promote agent oracles.
 - `scripts/promote_blocked.py` - Utility script for promote blocked.
 - `scripts/promoted_verifier.py` - Utility script for promoted verifier.
+- `scripts/prompt_hygiene.py` - Utility script for prompt hygiene.
 - `scripts/push_base_images_ghcr.sh` - Utility script for push base images ghcr.
 - `scripts/regenerate_artifact_dockerfiles.py` - Utility script for regenerate artifact dockerfiles.
 - `scripts/rehost_sweap_images.py` - Utility script for rehost sweap images.
 
@@ -32,8 +32,9 @@
 ## Triage Workflow
 1. Confirm error class via `docs/ERROR_CATALOG.md` and status fingerprints.
 2. Check run outputs and trajectories.
-3. Isolate task-level fix or rerun scope.
-4. Avoid blind reruns; document root cause or limitation.
+3. Inspect prompt hygiene when the failure may be caused by prompt leakage or prompt wiring drift (`python3 scripts/prompt_hygiene.py --scan-root benchmarks/<suite_or_task_dir> --include-mcp`).
+4. Isolate task-level fix or rerun scope.
+5. Avoid blind reruns; document root cause or limitation.
 
 ## ContextBench Calibration Workflow
 1. Ensure `claude` CLI is installed, authenticated, and `SOURCEGRAPH_ACCESS_TOKEN` is set.
 
@@ -48,6 +48,7 @@
       ],
       "commands": [
         "python3 scripts/validate_task_run.py --run <run_dir>",
+        "python3 scripts/prompt_hygiene.py --scan-root benchmarks/<suite_or_task_dir> --include-mcp",
         "python3 scripts/rerun_failed.py --help"
       ]
     },
@@ -90,6 +91,7 @@
       ],
       "commands": [
         "python3 scripts/validate_tasks_preflight.py --task <task_dir> --smoke-runtime",
+        "python3 scripts/prompt_hygiene.py --scan-root <task_dir> --include-mcp",
         "python3 scripts/sync_task_metadata.py --help"
       ]
     }
 
@@ -35,6 +35,7 @@
     Status,
     get_criteria_for_suite,
 )
+from prompt_hygiene import audit_paths
 
 
 # ---------------------------------------------------------------------------
@@ -290,53 +291,36 @@ def check_t4_git_sha(tasks: list[Path]) -> CriterionResult:
 
 
 def check_t5_no_solution_leak(tasks: list[Path]) -> CriterionResult:
-    """T.5: instruction.md doesn't leak solution content."""
-    issues = []
+    """T.5: instructions do not leak code locations, solution strategies, or verifier details."""
+    prompt_paths: list[Path] = []
     for task_dir in tasks:
-        instruction = task_dir / "instruction.md"
-        if not instruction.is_file():
-            continue
-
-        inst_text = instruction.read_text(errors="replace").lower()
-
-        # Check against solve.sh
-        solve_sh = task_dir / "solve.sh"
-        if solve_sh.is_file():
-            solve_text = solve_sh.read_text(errors="replace")
-            # Extract meaningful code lines (not comments/blank)
-            solve_lines = [
-                l.strip() for l in solve_text.splitlines()
-                if l.strip() and not l.strip().startswith("#") and len(l.strip()) > 15
-            ]
-            for line in solve_lines:
-                if line.lower() in inst_text:
-                    issues.append(f"{task_dir.name}: instruction contains solve.sh line: {line[:60]}")
-                    break
-
-        # Check against expected.diff
-        for diff_path in [task_dir / "expected.diff", task_dir / "tests" / "expected.diff"]:
-            if diff_path.is_file():
-                diff_text = diff_path.read_text(errors="replace")
-                # Extract added lines from diff
-                added_lines = [
-                    l[1:].strip() for l in diff_text.splitlines()
-                    if l.startswith("+") and not l.startswith("+++") and len(l.strip()) > 20
-                ]
-                for line in added_lines[:20]:  # Sample first 20
-                    if line.lower() in inst_text:
-                        issues.append(f"{task_dir.name}: instruction contains diff content: {line[:60]}")
-                        break
-
-    if not issues:
+        for name in ("instruction.md", "instruction_mcp.md"):
+            path = task_dir / name
+            if path.is_file():
+                prompt_paths.append(path)
+
+    report = audit_paths(prompt_paths)
+    files = report["files"]
+    if not files:
         return CriterionResult(
             criterion_id="T.5", status=Status.PASS,
-            evidence="No solution content found in instructions",
+            evidence="No prompt-hygiene findings across instruction.md or instruction_mcp.md",
         )
+    issue_labels = {
+        "code_location_hint": "code-location hints",
+        "solution_leakage": "solution leakage",
+        "scoring_leakage": "verifier leakage",
+    }
+    issues = []
+    for file_entry in files[:10]:
+        rel_path = Path(file_entry["file"]).relative_to(PROJECT_ROOT)
+        kinds = sorted({issue["type"] for issue in file_entry["issues"]})
+        issues.append(f"{rel_path}: {', '.join(issue_labels.get(kind, kind) for kind in kinds)}")
     return CriterionResult(
         criterion_id="T.5", status=Status.WARN,
         evidence="\n".join(issues[:10]),
-        remediation="Review instructions to ensure they don't contain solution code",
-        details={"issues": issues},
+        remediation="Remove code-location guidance, prescribed fix steps, and verifier/scoring details from prompts.",
+        details=report,
     )
 
 
@@ -747,12 +731,12 @@ def check_r2_no_contamination(tasks: list[Path]) -> CriterionResult:
     if not issues:
         return CriterionResult(
             criterion_id="R.2", status=Status.PASS,
-            evidence="No MCP/Sourcegraph tool guidance in baseline instructions",
+            evidence="No MCP/Sourcegraph tool guidance in baseline instruction.md files",
         )
     return CriterionResult(
         criterion_id="R.2", status=Status.FAIL,
         evidence="\n".join(issues[:10]),
-        remediation="Remove MCP/Sourcegraph tool guidance from baseline instructions",
+        remediation="Remove MCP/Sourcegraph tool guidance from baseline instruction.md files and keep MCP-specific instructions in runtime injection or instruction_mcp.md only.",
         details={"issue_count": len(issues), "issues": issues[:20]},
     )
 
 
@@ -252,8 +252,8 @@ def to_table(self) -> str:
     ABCCriterion(
         id="T.5",
         dimension=Dimension.TASK_VALIDITY,
-        title="instruction.md doesn't leak solution content",
-        description="Instructions must not contain solution code or test answers.",
+        title="Instructions do not leak code locations, solution strategy, or verifier details",
+        description="instruction.md and instruction_mcp.md must stay discovery-oriented and must not reveal where to look, how to fix it, or how the verifier scores.",
         severity=Severity.CRITICAL,
         automation=Automation.AUTOMATED,
     ),
@@ -388,8 +388,8 @@ def to_table(self) -> str:
     ABCCriterion(
         id="R.2",
         dimension=Dimension.REPORTING,
-        title="No MCP/Sourcegraph contamination in instruction.md",
-        description="Baseline instructions must not reference MCP, Sourcegraph, or Deep Search.",
+        title="Baseline instruction.md has no MCP/Sourcegraph contamination",
+        description="Baseline instructions must not reference MCP, Sourcegraph, or Deep Search. Prompt-hygiene review should also confirm the prompt stays tool-neutral and avoids code-location guidance.",
         severity=Severity.CRITICAL,
         automation=Automation.AUTOMATED,
     ),
 
@@ -23,6 +23,8 @@
 from string import Template
 from typing import Dict, List, Optional
 
+from prompt_hygiene import sanitize_instruction_text
+
 
 # ---------------------------------------------------------------------------
 # Constants and mappings
@@ -381,6 +383,8 @@ def generate_task(
 
     def write_rendered(fname: str, out_path: Path) -> None:
         content = render_template(templates_dir, fname, variables)
+        if out_path.name in {"instruction.md", "instruction_mcp.md"}:
+            content = sanitize_instruction_text(content)
         out_path.write_text(content)
         if verbose:
             logging.debug("  Wrote %s", out_path)
Original file line number	Diff line number	Diff line change
`@@ -48,6 +48,7 @@`
`48`	`48`	`],`
`49`	`49`	`"commands": [`
`50`	`50`	`"python3 scripts/validate_task_run.py --run <run_dir>",`
	`51`	`+ "python3 scripts/prompt_hygiene.py --scan-root benchmarks/<suite_or_task_dir> --include-mcp",`
`51`	`52`	`"python3 scripts/rerun_failed.py --help"`
`52`	`53`	`]`
`53`	`54`	`},`
`@@ -90,6 +91,7 @@`
`90`	`91`	`],`
`91`	`92`	`"commands": [`
`92`	`93`	`"python3 scripts/validate_tasks_preflight.py --task <task_dir> --smoke-runtime",`
	`94`	`+ "python3 scripts/prompt_hygiene.py --scan-root <task_dir> --include-mcp",`
`93`	`95`	`"python3 scripts/sync_task_metadata.py --help"`
`94`	`96`	`]`
`95`	`97`	`}`