sourcegraph
diff --git a/‎agents/claude_baseline_agent.py‎
Lines changed: 18 additions & 4 deletions b/‎agents/claude_baseline_agent.py‎
Lines changed: 18 additions & 4 deletions
diff --git a/‎benchmarks/ccb_test/aspnetcore-code-review-001/environment/Dockerfile‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/ccb_test/aspnetcore-code-review-001/environment/Dockerfile‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/ccb_test/aspnetcore-code-review-001/environment/Dockerfile.artifact_only‎
Lines changed: 45 additions & 0 deletions b/‎benchmarks/ccb_test/aspnetcore-code-review-001/environment/Dockerfile.artifact_only‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎benchmarks/ccb_test/aspnetcore-code-review-001/environment/Dockerfile.sg_only‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/ccb_test/aspnetcore-code-review-001/environment/Dockerfile.sg_only‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/ccb_test/aspnetcore-code-review-001/instruction.md‎
Lines changed: 14 additions & 11 deletions b/‎benchmarks/ccb_test/aspnetcore-code-review-001/instruction.md‎
Lines changed: 14 additions & 11 deletions
diff --git a/‎benchmarks/ccb_test/aspnetcore-code-review-001/tests/artifact_verifier_lib.sh‎
Lines changed: 200 additions & 0 deletions b/‎benchmarks/ccb_test/aspnetcore-code-review-001/tests/artifact_verifier_lib.sh‎
Lines changed: 200 additions & 0 deletions
@@ -390,7 +390,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
         repo_display = self._get_repo_display()
 
         # For hybrid MCP modes, prepend V4 preamble to the instruction text.
-        if mcp_type in ("sourcegraph_full", "sourcegraph_base"):
+        if mcp_type in ("sourcegraph_full", "sourcegraph_base", "artifact_full"):
             # --- V4 Preamble ---
             # Skill-style guidance: tool selection, scoping, context-aware behavior.
             # No mandatory workflow mandates — teaches effective MCP usage by example.
@@ -411,6 +411,20 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
             mcp_preamble = V4_PREAMBLE_TEMPLATE.format(repo_scope=repo_scope)
             instruction = mcp_preamble + instruction
 
+            # Artifact-full: append guidance about expressing changes as diffs
+            if mcp_type == "artifact_full":
+                artifact_guidance = """
+
+## Artifact-Only Evaluation
+
+You are in **artifact-only mode**. Your workspace is empty — all code discovery
+must go through Sourcegraph MCP tools. Express all code changes as **unified
+diffs** in your output artifact (e.g., `fix_patch` fields in review.json, or a
+standalone `solution.patch` file). Do NOT attempt to edit source files directly
+— there are no source files in your workspace.
+"""
+                instruction = instruction + artifact_guidance
+
         elif mcp_type == "sourcegraph_isolated":
             # Isolated mode: agent has only the target package locally (via sparse checkout).
             # All cross-package discovery MUST go through Sourcegraph MCP.
@@ -543,7 +557,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
 before retrying."""
             system_prompt_append = EVALUATION_CONTEXT_PROMPT + "\n\n---\n\n" + mcp_system_prompt
 
-        elif mcp_type in ("sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated"):
+        elif mcp_type in ("sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated", "artifact_full"):
             # V4 system prompt: lightweight reinforcement (detailed guidance is in the instruction preamble).
             if repo_display != "the codebase":
                 repo_filter_system = f"Sourcegraph repository: github.com/{repo_display}\nFor keyword_search: repo:^github.com/{repo_display}$ YourSearchTerm"
@@ -672,7 +686,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
 
                 # For hybrid mode and pure baseline: no tool restrictions
                 # For forced MCP modes (sourcegraph, deepsearch): apply tool restrictions
-                if mcp_type in ["sourcegraph_full", "sourcegraph_isolated", "deepsearch_hybrid", "none"]:
+                if mcp_type in ["sourcegraph_full", "sourcegraph_isolated", "artifact_full", "deepsearch_hybrid", "none"]:
                     # Hybrid mode and pure baseline: No tool restrictions
                     # Don't add --tools flag at all - let Claude use all available tools
                     # Skip debug flag - it causes massive bundled JS output to stdout
@@ -966,7 +980,7 @@ async def setup(self, environment: BaseEnvironment) -> None:
 
         if mcp_type == "sourcegraph":
             await self._setup_sourcegraph_mcp(environment)
-        elif mcp_type in ("sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated"):
+        elif mcp_type in ("sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated", "artifact_full"):
             await self._setup_sourcegraph_full_mcp(environment, mcp_type=mcp_type)
         elif mcp_type == "deepsearch":
             await self._setup_deepsearch_mcp(environment)
 
@@ -3,6 +3,7 @@ FROM ubuntu:22.04
 # Install common tools (python3 needed for inject_defects.sh)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
+    ca-certificates \
     curl \
     python3 \
     ripgrep \
 
@@ -0,0 +1,45 @@
+# aspnetcore-code-review-001 — artifact_only variant
+# Full repo backed up to /repo_full, workspace cleared for agent.
+# Verifier applies agent patches to /repo_full copy for scoring.
+
+FROM ubuntu:22.04
+
+# Install common tools (python3 needed for inject_defects.sh).
+# `patch` is listed explicitly: tests/artifact_verifier_lib.sh falls back to
+# `patch -p1 --fuzz=3`, and git only *recommends* patch, so it is not pulled in
+# when --no-install-recommends is used (same reason ca-certificates is explicit).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    ca-certificates \
+    curl \
+    patch \
+    python3 \
+    ripgrep \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Node.js (for Claude Code CLI), only if it is not already present.
+# RUN (shell form) executes under /bin/sh, which is dash on Ubuntu and has no
+# `&>` operator: `cmd &> /dev/null` is parsed as `cmd &` followed by an empty
+# redirect, so the guard would always succeed. Use the POSIX redirect form.
+RUN if ! command -v node > /dev/null 2>&1; then \
+    curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
+    apt-get install -y --no-install-recommends nodejs; \
+    fi
+
+# Clone dotnet/aspnetcore at pinned commit (PR #64636 merge commit)
+# --filter=blob:none --no-checkout makes this a partial clone: file contents are
+# fetched when the pinned commit is checked out rather than for all history.
+# The git identity is set repo-locally (no --global).
+RUN git clone --filter=blob:none --no-checkout https://github.com/dotnet/aspnetcore.git /workspace && \
+    cd /workspace && \
+    git checkout 875255737993775850f1f3650c10ddb43ef00ced && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent"
+
+# Inject defects into the codebase
+# The script is copied in, executed once, and removed within the same layer.
+COPY inject_defects.sh /tmp/inject_defects.sh
+RUN chmod +x /tmp/inject_defects.sh && /tmp/inject_defects.sh && rm /tmp/inject_defects.sh
+
+# Create directories for verifier (tests uploaded at runtime by Harbor verifier)
+# NOTE(review): /workspace/tests is created here but /workspace is deleted and
+# recreated empty further down, so only the copy under /repo_full/tests
+# survives — confirm whether the verifier needs /workspace/tests to exist.
+RUN mkdir -p /workspace/tests /logs/verifier
+
+WORKDIR /workspace
+
+# --- artifact_only: backup full repo, then clear workspace for agent ---
+# /repo_full is the pristine (defect-injected) tree that
+# tests/artifact_verifier_lib.sh copies to /tmp/verify_repo before applying
+# the agent's patches.
+RUN cp -a /workspace /repo_full
+RUN rm -rf /workspace && mkdir -p /workspace
+# /tmp/.artifact_only_mode is the marker file artifact_verifier_lib.sh checks to
+# enable artifact-only behavior. /tmp/.artifact_only_workdir records the
+# workspace path; presumably read by the harness — TODO confirm the consumer.
+RUN touch /tmp/.artifact_only_mode && echo '/workspace' > /tmp/.artifact_only_workdir
+
+# Re-declared after /workspace was removed and recreated above.
+WORKDIR /workspace
+
+# Empty exec-form ENTRYPOINT: no entrypoint is set for this image.
+ENTRYPOINT []
@@ -7,6 +7,7 @@ FROM ubuntu:22.04
 # Install common tools (python3 needed for inject_defects.sh)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
+    ca-certificates \
     curl \
     python3 \
     ripgrep \
 
@@ -9,7 +9,7 @@
 
 You are reviewing a recently merged pull request that adds a `DisplayName<TValue>` component to Blazor. This component reads `[Display]` and `[DisplayName]` attributes from model properties and renders the display name in forms. The PR introduces the component class, an expression member accessor helper with caching, and updates to project templates. However, several defects were introduced during the merge — both functional bugs and compliance violations.
 
-Your task is to **find the defects, fix them in the code, and produce a structured review report**.
+Your task is to **find the defects and produce a structured review report with proposed fixes**.
 
 ## Context
 
@@ -20,21 +20,21 @@ The DisplayName feature spans two core C# source files:
 
 ## Task
 
-YOU MUST IMPLEMENT CODE CHANGES to complete this task.
-
 Review the two files listed above for the following types of defects:
 
 - **Functional bugs**: Logic errors that cause incorrect behavior (e.g., wrong attribute precedence, missing null checks, broken cache invalidation).
 - **Compliance violations**: Deviations from ASP.NET Core component conventions (e.g., unnecessary re-rendering, missing render optimization).
 
 For each defect you find:
 
-1. **Fix the code** by editing the affected file in `/workspace/`.
-2. **Record the defect** in your review report.
+1. **Describe the defect** in your review report.
+2. **Write a fix** as a unified diff in the `fix_patch` field.
+
+**Do NOT edit source files directly.** Express all fixes as unified diffs in your review report. The evaluation system will apply your patches and verify correctness.
 
 ### Expected Output
 
-After completing your review, write a JSON file at `/workspace/review.json` containing an array of defect objects:
+Write a JSON file at `/workspace/review.json` containing an array of defect objects:
 
 ```json
 [
@@ -43,7 +43,7 @@ After completing your review, write a JSON file at `/workspace/review.json` cont
     "line": 60,
     "severity": "critical",
     "description": "Brief description of what is wrong and why",
-    "fix_applied": true
+    "fix_patch": "--- a/src/Components/Web/src/Forms/ExpressionMemberAccessor.cs\n+++ b/src/Components/Web/src/Forms/ExpressionMemberAccessor.cs\n@@ -58,1 +58,1 @@\n-    old line\n+    new line\n"
   }
 ]
 ```
@@ -53,13 +53,16 @@ Each entry must include:
 - `line` — Approximate line number where the defect occurs
 - `severity` — One of: `critical`, `high`, `medium`, `low`
 - `description` — What the defect is and what impact it has
-- `fix_applied` — Boolean indicating whether you committed a fix
+- `fix_patch` — Unified diff showing the proposed fix (use `--- a/` and `+++ b/` prefix format)
 
 ## Evaluation
 
-Your review will be evaluated on detection accuracy and fix quality.
+Your review will be evaluated on:
+- **Detection accuracy** (50%): Precision and recall of reported defects
+- **Fix quality** (50%): Whether your proposed patches correctly resolve the defects
 
-## Testing
+## Constraints
 
 - **Time limit**: 1200 seconds
-- Run `bash /workspace/tests/test.sh` to verify your changes
+- Do NOT edit source files directly — express fixes only in `fix_patch`
+- Do NOT run tests — the evaluation system handles verification
@@ -0,0 +1,200 @@
+#!/bin/bash
+# Artifact-only verifier helper: apply patches from agent artifacts to /repo_full copy.
+#
+# Source this at the TOP of test.sh for artifact-only mode. It detects
+# /tmp/.artifact_only_mode and:
+#   1. Copies /repo_full to /tmp/verify_repo (clean scoring copy)
+#   2. Exports VERIFY_REPO for downstream fix-pattern checks
+#   3. Provides apply_patches_from_review_json() to apply fix_patch fields
+#   4. Provides apply_patch_file() to apply standalone .patch files
+#
+# For non-artifact-only runs (legacy or sg_only), this script is a no-op.
+#
+# Usage in test.sh:
+#   #!/bin/bash
+#   set -e
+#   # Legacy sg_only support (no-op if not in sg_only mode)
+#   [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ] && source /tests/sgonly_verifier_wrapper.sh
+#   # Artifact-only support
+#   [ -f /tmp/.artifact_only_mode ] && [ -f /tests/artifact_verifier_lib.sh ] && source /tests/artifact_verifier_lib.sh
+#   # ... rest of test.sh uses $VERIFY_REPO for file checks ...
+
+if [ ! -f /tmp/.artifact_only_mode ]; then
+    # Not in artifact-only mode — export VERIFY_REPO as /workspace for backward compat
+    # (an already-set VERIFY_REPO is respected).
+    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
+    export ARTIFACT_ONLY=false
+    # `return` only succeeds when this file is sourced. If it is executed
+    # directly, exit instead of falling through into the artifact-only setup
+    # below (which `|| true` would have allowed).
+    return 0 2>/dev/null || exit 0
+fi
+
+echo "[artifact_verifier] Detected artifact-only mode"
+export ARTIFACT_ONLY=true
+
+# Create clean scoring copy from /repo_full
+if [ -d /repo_full ]; then
+    echo "[artifact_verifier] Copying /repo_full -> /tmp/verify_repo..."
+    # Remove any copy left by a previous run so patches start from a clean tree.
+    rm -rf /tmp/verify_repo
+    cp -a /repo_full /tmp/verify_repo
+    export VERIFY_REPO="/tmp/verify_repo"
+    # Ensure git works in the copy
+    # NOTE: this changes the working directory of the sourcing script.
+    cd /tmp/verify_repo
+    # Mark the copy as a safe directory so git does not reject it over file
+    # ownership; failures are ignored (best effort).
+    git config --global --add safe.directory /tmp/verify_repo 2>/dev/null || true
+    echo "[artifact_verifier] Clean scoring repo ready at $VERIFY_REPO"
+else
+    echo "[artifact_verifier] WARNING: /repo_full not found. Using /workspace as fallback."
+    export VERIFY_REPO="/workspace"
+fi
+
+# ── Patch application functions ──────────────────────────────
+
+# Apply a single unified diff string to VERIFY_REPO.
+# Returns 0 on success, 1 on failure.
+apply_single_patch() {
+    local patch_text="$1"
+    local patch_file="/tmp/artifact_patch_$$.patch"
+
+    echo "$patch_text" > "$patch_file"
+
+    # Try git apply first (strictest)
+    if cd "$VERIFY_REPO" && git apply --allow-empty "$patch_file" 2>/dev/null; then
+        echo "[artifact_verifier] Patch applied via git apply"
+        rm -f "$patch_file"
+        return 0
+    fi
+
+    # Fallback: patch with fuzz
+    if cd "$VERIFY_REPO" && patch -p1 --fuzz=3 -i "$patch_file" 2>/dev/null; then
+        echo "[artifact_verifier] Patch applied via patch -p1 --fuzz=3"
+        rm -f "$patch_file"
+        return 0
+    fi
+
+    # Fallback: git apply with less strict options
+    if cd "$VERIFY_REPO" && git apply --allow-empty --3way "$patch_file" 2>/dev/null; then
+        echo "[artifact_verifier] Patch applied via git apply --3way"
+        rm -f "$patch_file"
+        return 0
+    fi
+
+    echo "[artifact_verifier] WARNING: Patch failed to apply"
+    rm -f "$patch_file"
+    return 1
+}
+
+# Read a standalone .patch file and hand its contents to apply_single_patch,
+# which applies them to VERIFY_REPO.
+# Usage: apply_patch_file /workspace/solution.patch
+# Returns 1 when the file does not exist; otherwise the exit status of
+# apply_single_patch.
+apply_patch_file() {
+    local src="$1"
+    local body
+
+    if [ -f "$src" ]; then
+        body="$(cat "$src")"
+        apply_single_patch "$body"
+    else
+        echo "[artifact_verifier] Patch file not found: $src"
+        return 1
+    fi
+}
+
+# Extract and apply fix_patch fields from a review.json file.
+# Usage: apply_patches_from_review_json /workspace/review.json
+# Returns the number of successfully applied patches.
+apply_patches_from_review_json() {
+    local review_path="$1"
+    if [ ! -f "$review_path" ]; then
+        echo "[artifact_verifier] review.json not found: $review_path"
+        echo "0"
+        return 1
+    fi
+
+    # Use Python to parse JSON and extract/apply patches
+    python3 - "$review_path" "$VERIFY_REPO" <<'PYEOF'
+import json, sys, os, subprocess, tempfile, re
+
+review_path = sys.argv[1]
+verify_repo = sys.argv[2]
+
+# Parse review.json with nested-object fallback
+try:
+    with open(review_path) as f:
+        raw = f.read()
+    # Strip markdown code fences
+    m = re.search(r'```(?:json)?\s*\n(.*?)```', raw, re.DOTALL)
+    if m:
+        raw = m.group(1).strip()
+    reported = json.loads(raw)
+
+    # Handle nested objects: {"defects": [...]}, {"review": {"defects": [...]}}
+    if isinstance(reported, dict):
+        for key in ("defects", "findings", "issues", "review"):
+            val = reported.get(key, None)
+            if isinstance(val, list):
+                reported = val
+                break
+            elif isinstance(val, dict):
+                for k2 in ("defects", "findings", "issues"):
+                    v2 = val.get(k2, None)
+                    if isinstance(v2, list):
+                        reported = v2
+                        break
+                if isinstance(reported, list):
+                    break
+    if not isinstance(reported, list):
+        reported = []
+except Exception as e:
+    print(f"[artifact_verifier] Failed to parse review.json: {e}", file=sys.stderr)
+    reported = []
+
+applied = 0
+failed = 0
+
+for entry in reported:
+    patch_text = entry.get("fix_patch", "")
+    if not patch_text or not patch_text.strip():
+        continue
+
+    # Write patch to temp file
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.patch', delete=False, dir='/tmp') as pf:
+        pf.write(patch_text)
+        pf.flush()
+        pf_path = pf.name
+
+    # Try git apply
+    result = subprocess.run(
+        ["git", "apply", "--allow-empty", pf_path],
+        cwd=verify_repo, capture_output=True, text=True
+    )
+    if result.returncode == 0:
+        applied += 1
+        os.unlink(pf_path)
+        continue
+
+    # Fallback: patch -p1 --fuzz=3
+    result = subprocess.run(
+        ["patch", "-p1", "--fuzz=3", "-i", pf_path],
+        cwd=verify_repo, capture_output=True, text=True
+    )
+    if result.returncode == 0:
+        applied += 1
+        os.unlink(pf_path)
+        continue
+
+    # Fallback: git apply with 3way
+    result = subprocess.run(
+        ["git", "apply", "--allow-empty", "--3way", pf_path],
+        cwd=verify_repo, capture_output=True, text=True
+    )
+    if result.returncode == 0:
+        applied += 1
+        os.unlink(pf_path)
+        continue
+
+    failed += 1
+    file_name = entry.get("file", "unknown")
+    print(f"[artifact_verifier] Patch for {file_name} failed to apply", file=sys.stderr)
+    os.unlink(pf_path)
+
+print(f"[artifact_verifier] Patches applied: {applied}, failed: {failed}", file=sys.stderr)
+print(applied)
+PYEOF
+}
+
+echo "[artifact_verifier] Helper functions loaded"