feat: precision improvements — tighter prompt, pruning pass, parallel execution

sjarmak · claude · sjarmak · commit af296756c305 · 2026-03-02T03:13:35.000Z
- Add precision guidelines to SYSTEM_PROMPT_SUFFIX (exclude test files,
  docs, tangential code; aim for 1-5 files on simple bugs)
- Add prune_oracle_cli() that runs a haiku pruning pass to filter
  irrelevant files from agent output
- Add --prune flag to validate_on_contextbench.py
- Add --parallel N flag with ThreadPoolExecutor for concurrent tasks
- Refactor main loop into process_one_task() worker function

Phase 1 baseline: composite=0.6426, recall=0.90, precision=0.30
Target: improve precision to push composite above the 0.65 threshold.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/scripts/context_retrieval_agent.py b/scripts/context_retrieval_agent.py
@@ -980,6 +980,16 @@ def _extract_clone_urls(dockerfile_content: str) -> List[Dict[str, str]]:
 }
 
 SYSTEM_PROMPT_SUFFIX = """
+## Precision Guidelines
+- Include ONLY source files that would need to be **read or modified** to address the task.
+- Do NOT include test files, documentation, or configuration files unless the task \
+explicitly asks about testing, docs, or configuration.
+- Do NOT include files that merely import or reference the relevant code — only files \
+that contain the logic central to the task.
+- When in doubt, ask: "Would a developer need to open this file to fix/understand the issue?" \
+If no, exclude it.
+- Aim for 1-5 files for simple bugs, 3-10 for moderate tasks, 10+ only for large refactors.
+
 ## Output
 When you have identified all relevant files, output a JSON object with:
 ```json
@@ -1264,6 +1274,119 @@ def _cli_error_metadata(model: str, backend: str, start_time: float) -> Dict[str
     }
 
 
+# Prompt template for the optional pruning pass. prune_oracle_cli() fills it in
+# with str.format(), supplying `task_description` and `file_list`; the literal
+# braces of the JSON example are doubled ({{ }}) so format() emits single braces.
+PRUNE_PROMPT = """\
+You are a precision filter for code context retrieval. Given a task description \
+and a list of predicted files, remove files that are NOT directly relevant.
+
+## Rules
+- Keep ONLY files that a developer would need to **read or modify** to address the task.
+- Remove test files unless the task is specifically about testing.
+- Remove documentation files unless the task is about docs.
+- Remove files that merely import or reference the relevant code.
+- When unsure, keep the file (recall > precision).
+
+## Task
+{task_description}
+
+## Predicted Files
+{file_list}
+
+## Output
+Return a JSON object with the filtered file list:
+```json
+{{
+  "files": [
+    {{"repo": "repo-name", "path": "relative/path/to/file"}}
+  ],
+  "pruned_count": <number of files removed>,
+  "text": "Brief explanation of what was removed and why."
+}}
+```
+"""
+
+
+def prune_oracle_cli(
+    oracle: Dict[str, Any],
+    ctx: Dict[str, Any],
+    prune_model: str = "claude-haiku-4-5-20251001",
+    verbose: bool = False,
+) -> Dict[str, Any]:
+    """Run a pruning pass on the oracle output using a cheap model via CLI.
+
+    Asks a fast model to remove irrelevant files from the agent's output.
+    The pass is best-effort: whenever it cannot produce a usable answer the
+    original ``oracle`` is returned untouched.
+
+    Args:
+        oracle: Agent output; only its ``"files"`` list (dicts read via their
+            ``repo`` and ``path`` keys) is used. The input dict is not mutated.
+        ctx: Task context. The task description is taken from
+            ``"seed_prompt"``, falling back to ``"instruction"``, and is
+            truncated to 3000 characters before being put in the prompt.
+        prune_model: Model name passed to ``claude --model``.
+        verbose: When true, log skip/progress/result details at INFO level.
+
+    Returns:
+        ``oracle`` itself when there are 3 or fewer files or when pruning
+        fails (timeout, CLI cannot be started, non-zero exit, unparseable or
+        unusable output). Otherwise a shallow copy whose ``"files"`` holds
+        the surviving original entries, plus a ``"_prune_metadata"`` dict
+        with ``original_count``, ``pruned_count``, ``prune_model`` and
+        ``prune_cost_usd``.
+    """
+    files = oracle.get("files") or []
+    if len(files) <= 3:
+        # Too few to prune — skip
+        if verbose:
+            log.info("  Prune: skipping (%d files, <= 3)", len(files))
+        return oracle
+
+    # `or ""` guards against an explicit None value, which cannot be sliced.
+    task_desc = ctx.get("seed_prompt", "") or ctx.get("instruction", "") or ""
+    file_list = "\n".join(
+        f"- {f.get('repo', '?')}: {f.get('path', '?')}" for f in files
+    )
+
+    prompt = PRUNE_PROMPT.format(
+        task_description=task_desc[:3000],
+        file_list=file_list,
+    )
+
+    cmd = [
+        "claude", "-p", prompt,
+        "--output-format", "json",
+        "--model", prune_model,
+        "--dangerously-skip-permissions",
+    ]
+    # The child process gets the current environment minus CLAUDECODE.
+    env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+
+    if verbose:
+        log.info("  Prune: %d files -> calling %s", len(files), prune_model)
+
+    try:
+        result = subprocess.run(
+            cmd, capture_output=True, text=True, env=env, timeout=60,
+        )
+    except subprocess.TimeoutExpired:
+        log.warning("  Prune: timeout, keeping original")
+        return oracle
+    except OSError as exc:
+        # Raised when the process cannot be started at all, e.g. the `claude`
+        # executable is not on PATH.
+        log.warning("  Prune: could not run CLI (%s), keeping original", exc)
+        return oracle
+
+    if result.returncode != 0:
+        log.warning("  Prune: CLI failed (rc=%d), keeping original", result.returncode)
+        return oracle
+
+    try:
+        cli_output = json.loads(result.stdout)
+    except (json.JSONDecodeError, ValueError):
+        log.warning("  Prune: failed to parse CLI output, keeping original")
+        return oracle
+    if not isinstance(cli_output, dict):
+        # Valid JSON, but not the object envelope the code below reads from.
+        log.warning("  Prune: unexpected CLI output shape, keeping original")
+        return oracle
+
+    result_text = cli_output.get("result", "")
+    pruned = _extract_json_from_text(result_text)
+    kept = pruned.get("files") if isinstance(pruned, dict) else None
+    if not isinstance(kept, list):
+        log.warning("  Prune: no valid JSON in output, keeping original")
+        return oracle
+
+    # A pruning pass may only remove files. Map the model's answer back onto
+    # the original entries (matched by path, using the same '?' placeholder the
+    # prompt showed) so invented paths are dropped and any extra keys on the
+    # original entries survive. Matching on path alone errs towards keeping.
+    kept_paths = {
+        entry["path"]
+        for entry in kept
+        if isinstance(entry, dict) and isinstance(entry.get("path"), str)
+    }
+    pruned_files = [f for f in files if f.get("path", "?") in kept_paths]
+    if kept and not pruned_files:
+        # The model returned files, but none correspond to the input list.
+        log.warning("  Prune: output matches none of the input files, keeping original")
+        return oracle
+
+    # A null/missing cost must not break the log format or the caller's sum.
+    prune_cost = cli_output.get("total_cost_usd") or 0.0
+
+    if verbose:
+        log.info("  Prune: %d -> %d files ($%.4f)",
+                 len(files), len(pruned_files), prune_cost)
+
+    # Merge: keep pruned files but preserve original symbols/chain/text
+    result_oracle = dict(oracle)
+    result_oracle["files"] = pruned_files
+    result_oracle["_prune_metadata"] = {
+        "original_count": len(files),
+        "pruned_count": len(files) - len(pruned_files),
+        "prune_model": prune_model,
+        "prune_cost_usd": prune_cost,
+    }
+    return result_oracle
+
+
 def build_user_message(
     ctx: Dict[str, Any], repo_paths: Dict[str, Path]
 ) -> str:
diff --git a/scripts/validate_on_contextbench.py b/scripts/validate_on_contextbench.py
@@ -39,6 +39,7 @@
 """
 
 import argparse
+import concurrent.futures
 import json
 import logging
 import os
@@ -732,6 +733,14 @@ def main() -> int:
         "--max-tasks", type=int, default=0,
         help="Process at most N tasks (0 = all)",
     )
+    parser.add_argument(
+        "--parallel", type=int, default=1,
+        help="Number of tasks to run in parallel (default: 1)",
+    )
+    parser.add_argument(
+        "--prune", action="store_true",
+        help="Run a pruning pass with haiku to remove irrelevant files",
+    )
     args = parser.parse_args()
     use_cli = not args.use_sdk
 
@@ -844,46 +853,33 @@ def main() -> int:
             from context_retrieval_agent import SourcegraphClient
             sg = SourcegraphClient()
 
-    total_cost = 0.0
-    trajectories = []
-    evaluated_tasks = []
-
-    for i, task in enumerate(tasks):
-        if args.max_tasks > 0 and i >= args.max_tasks:
-            log.info("Max tasks limit reached (%d)", args.max_tasks)
-            break
-        if args.max_cost > 0 and total_cost >= args.max_cost:
-            log.warning("Cost limit reached ($%.2f)", total_cost)
-            break
-
-        instance_id = task.get("instance_id", f"task_{i}")
+    # -- Per-task worker function (can run in parallel) --
+    def process_one_task(task_tuple):
+        idx, task = task_tuple
+        instance_id = task.get("instance_id", f"task_{idx}")
         repo_url = task.get("repo_url", "")
         if not repo_url:
-            # Construct from repo field (org/repo -> full URL)
             repo_slug = task.get("repo", "")
             if repo_slug and "/" in repo_slug:
                 repo_url = f"https://github.com/{repo_slug}"
         commit = task.get("base_commit", task.get("commit", "HEAD"))
 
-        log.info("[%d/%d] %s", i + 1, len(tasks), instance_id)
-
         if not repo_url:
-            # Try to reconstruct from instance_id
             parts = instance_id.rsplit("-", 1)
             org_repo = parts[0].replace("__", "/") if parts else ""
             repo_url = f"https://github.com/{org_repo}" if org_repo else ""
 
         if not repo_url:
-            log.warning("  No repo URL, skipping")
-            continue
+            log.warning("[%d] No repo URL, skipping %s", idx + 1, instance_id)
+            return None
+
+        log.info("[%d/%d] %s", idx + 1, len(tasks), instance_id)
 
-        # Clone repo
         repo_path = clone_for_contextbench(repo_url, commit)
         if not repo_path:
-            log.warning("  Clone failed, skipping")
-            continue
+            log.warning("[%d] Clone failed, skipping %s", idx + 1, instance_id)
+            return None
 
-        # Run agent
         try:
             result = run_retrieval_agent_on_cb_task(
                 task, repo_path, client,
@@ -892,24 +888,75 @@ def main() -> int:
                 use_cli=use_cli,
             )
         except Exception as e:
-            log.error("  Agent failed: %s", e)
-            continue
+            log.error("[%d] Agent failed for %s: %s", idx + 1, instance_id, e)
+            return None
+
+        # Optional pruning pass
+        if args.prune:
+            from context_retrieval_agent import prune_oracle_cli
+            ctx_for_prune = {
+                "seed_prompt": task.get("problem_statement", ""),
+                "instruction": task.get("problem_statement", ""),
+            }
+            result["oracle"] = prune_oracle_cli(
+                result["oracle"], ctx_for_prune, verbose=args.verbose,
+            )
+            prune_meta = result["oracle"].get("_prune_metadata", {})
+            result["metadata"]["prune_cost_usd"] = prune_meta.get("prune_cost_usd", 0)
+            result["metadata"]["cost_usd"] = (
+                result["metadata"].get("cost_usd", 0) + prune_meta.get("prune_cost_usd", 0)
+            )
 
-        total_cost += result["metadata"].get("cost_usd", 0)
+        n_files = len(result["oracle"].get("files", []))
+        log.info("[%d] %s -> %d files, $%.4f",
+                 idx + 1, instance_id, n_files, result["metadata"]["cost_usd"])
 
-        # Convert to trajectory
         traj = convert_to_trajectory(
             instance_id, result["oracle"],
             model_patch=task.get("patch", ""),
         )
-        trajectories.append(traj)
-        evaluated_tasks.append(task)
+        return {"task": task, "traj": traj, "result": result}
 
-        n_files = len(result["oracle"].get("files", []))
-        log.info(
-            "  -> %d files, $%.4f",
-            n_files, result["metadata"]["cost_usd"],
-        )
+    # -- Apply limits --
+    run_tasks = tasks
+    if args.max_tasks > 0:
+        run_tasks = tasks[:args.max_tasks]
+
+    # -- Execute tasks (parallel or sequential) --
+    total_cost = 0.0
+    trajectories = []
+    evaluated_tasks = []
+
+    task_tuples = list(enumerate(run_tasks))
+    n_parallel = max(1, args.parallel)
+
+    if n_parallel > 1 and len(task_tuples) > 1:
+        log.info("Running %d tasks with %d workers", len(task_tuples), n_parallel)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=n_parallel) as executor:
+            futures = {executor.submit(process_one_task, t): t for t in task_tuples}
+            for future in concurrent.futures.as_completed(futures):
+                outcome = future.result()
+                if outcome is None:
+                    continue
+                total_cost += outcome["result"]["metadata"].get("cost_usd", 0)
+                if args.max_cost > 0 and total_cost >= args.max_cost:
+                    log.warning("Cost limit reached ($%.2f), cancelling remaining", total_cost)
+                    for f in futures:
+                        f.cancel()
+                    break
+                trajectories.append(outcome["traj"])
+                evaluated_tasks.append(outcome["task"])
+    else:
+        for tt in task_tuples:
+            if args.max_cost > 0 and total_cost >= args.max_cost:
+                log.warning("Cost limit reached ($%.2f)", total_cost)
+                break
+            outcome = process_one_task(tt)
+            if outcome is None:
+                continue
+            total_cost += outcome["result"]["metadata"].get("cost_usd", 0)
+            trajectories.append(outcome["traj"])
+            evaluated_tasks.append(outcome["task"])
 
     if not trajectories:
         log.error("No tasks completed")