Skip to content

Commit 87151a3

Browse files
sjarmak and claude committed
feat: artifact-only evaluation for ccb_test (20 tasks)
Both configs now produce a single artifact in one shot. Verifier scores only the artifact applied to a clean repo copy — neither config benefits from iterative test-fix cycles. Steps 1-5 (other session): Updated 8 code review verifiers + 8 code review instructions + 3 perf instructions + 3 perf verifiers + 6 testing instructions with artifact-only guards, fix_patch schema, and "Do NOT edit/run" constraints. Steps 6-8 (this session): - scripts/generate_artifact_only_dockerfiles.py: generates Dockerfile.artifact_only for 17 tasks (8 code-review, 3 perf, 6 test) - 17 Dockerfile.artifact_only: code-review/perf backup to /repo_full then clear workspace; testing tasks get minimal images - 17 artifact_verifier_lib.sh copies in tests/ dirs - agents/claude_baseline_agent.py: recognize artifact_full MCP type (same MCP as sourcegraph_full + artifact guidance preamble) - configs/sdlc_suite_2config.sh: FULL_CONFIG variable for Dockerfile swap (artifact_only parallel to sg_only) - configs/_common.sh: run_paired_configs uses FULL_CONFIG - configs/test_artifact_2config.sh: thin wrapper setting FULL_CONFIG=artifact_full Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 25f8931 commit 87151a3

File tree

84 files changed

+5537
-467
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

84 files changed

+5537
-467
lines changed

agents/claude_baseline_agent.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
390390
repo_display = self._get_repo_display()
391391

392392
# For hybrid MCP modes, prepend V4 preamble to the instruction text.
393-
if mcp_type in ("sourcegraph_full", "sourcegraph_base"):
393+
if mcp_type in ("sourcegraph_full", "sourcegraph_base", "artifact_full"):
394394
# --- V4 Preamble ---
395395
# Skill-style guidance: tool selection, scoping, context-aware behavior.
396396
# No mandatory workflow mandates — teaches effective MCP usage by example.
@@ -411,6 +411,20 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
411411
mcp_preamble = V4_PREAMBLE_TEMPLATE.format(repo_scope=repo_scope)
412412
instruction = mcp_preamble + instruction
413413

414+
# Artifact-full: append guidance about expressing changes as diffs
415+
if mcp_type == "artifact_full":
416+
artifact_guidance = """
417+
418+
## Artifact-Only Evaluation
419+
420+
You are in **artifact-only mode**. Your workspace is empty — all code discovery
421+
must go through Sourcegraph MCP tools. Express all code changes as **unified
422+
diffs** in your output artifact (e.g., `fix_patch` fields in review.json, or a
423+
standalone `solution.patch` file). Do NOT attempt to edit source files directly
424+
— there are no source files in your workspace.
425+
"""
426+
instruction = instruction + artifact_guidance
427+
414428
elif mcp_type == "sourcegraph_isolated":
415429
# Isolated mode: agent has only the target package locally (via sparse checkout).
416430
# All cross-package discovery MUST go through Sourcegraph MCP.
@@ -543,7 +557,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
543557
before retrying."""
544558
system_prompt_append = EVALUATION_CONTEXT_PROMPT + "\n\n---\n\n" + mcp_system_prompt
545559

546-
elif mcp_type in ("sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated"):
560+
elif mcp_type in ("sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated", "artifact_full"):
547561
# V4 system prompt: lightweight reinforcement (detailed guidance is in the instruction preamble).
548562
if repo_display != "the codebase":
549563
repo_filter_system = f"Sourcegraph repository: github.com/{repo_display}\nFor keyword_search: repo:^github.com/{repo_display}$ YourSearchTerm"
@@ -672,7 +686,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
672686

673687
# For hybrid mode and pure baseline: no tool restrictions
674688
# For forced MCP modes (sourcegraph, deepsearch): apply tool restrictions
675-
if mcp_type in ["sourcegraph_full", "sourcegraph_isolated", "deepsearch_hybrid", "none"]:
689+
if mcp_type in ["sourcegraph_full", "sourcegraph_isolated", "artifact_full", "deepsearch_hybrid", "none"]:
676690
# Hybrid mode and pure baseline: No tool restrictions
677691
# Don't add --tools flag at all - let Claude use all available tools
678692
# Skip debug flag - it causes massive bundled JS output to stdout
@@ -966,7 +980,7 @@ async def setup(self, environment: BaseEnvironment) -> None:
966980

967981
if mcp_type == "sourcegraph":
968982
await self._setup_sourcegraph_mcp(environment)
969-
elif mcp_type in ("sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated"):
983+
elif mcp_type in ("sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated", "artifact_full"):
970984
await self._setup_sourcegraph_full_mcp(environment, mcp_type=mcp_type)
971985
elif mcp_type == "deepsearch":
972986
await self._setup_deepsearch_mcp(environment)

benchmarks/ccb_test/aspnetcore-code-review-001/environment/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ FROM ubuntu:22.04
33
# Install common tools (python3 needed for inject_defects.sh)
44
RUN apt-get update && apt-get install -y --no-install-recommends \
55
git \
6+
ca-certificates \
67
curl \
78
python3 \
89
ripgrep \
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# aspnetcore-code-review-001 — artifact_only variant
# Full repo backed up to /repo_full, workspace cleared for agent.
# Verifier applies agent patches to /repo_full copy for scoring.

FROM ubuntu:22.04

# Install common tools (python3 needed for inject_defects.sh).
# `patch` is listed explicitly: --no-install-recommends drops git's recommended
# packages, and the artifact verifier falls back to `patch -p1 --fuzz=3`.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    ca-certificates \
    curl \
    patch \
    python3 \
    ripgrep \
    && rm -rf /var/lib/apt/lists/*

# Install Node.js (for Claude Code CLI).
# RUN uses /bin/sh (dash on Ubuntu), so use the POSIX redirect form; the bash-only
# `&>` would background the check and make the condition unconditionally true.
RUN if ! command -v node > /dev/null 2>&1; then \
        curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
        apt-get install -y --no-install-recommends nodejs; \
    fi

# Clone dotnet/aspnetcore at pinned commit (PR #64636 merge commit)
RUN git clone --filter=blob:none --no-checkout https://github.com/dotnet/aspnetcore.git /workspace && \
    cd /workspace && \
    git checkout 875255737993775850f1f3650c10ddb43ef00ced && \
    git config user.email "agent@example.com" && \
    git config user.name "Agent"

# Inject defects into the codebase
COPY inject_defects.sh /tmp/inject_defects.sh
RUN chmod +x /tmp/inject_defects.sh && /tmp/inject_defects.sh && rm /tmp/inject_defects.sh

# Create directories for verifier (tests uploaded at runtime by Harbor verifier)
RUN mkdir -p /workspace/tests /logs/verifier

WORKDIR /workspace

# --- artifact_only: backup full repo, then clear workspace for agent ---
RUN cp -a /workspace /repo_full
RUN rm -rf /workspace && mkdir -p /workspace
# Marker files read by the verifier library to detect artifact-only mode.
RUN touch /tmp/.artifact_only_mode && echo '/workspace' > /tmp/.artifact_only_workdir

WORKDIR /workspace

ENTRYPOINT []

benchmarks/ccb_test/aspnetcore-code-review-001/environment/Dockerfile.sg_only

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ FROM ubuntu:22.04
77
# Install common tools (python3 needed for inject_defects.sh)
88
RUN apt-get update && apt-get install -y --no-install-recommends \
99
git \
10+
ca-certificates \
1011
curl \
1112
python3 \
1213
ripgrep \

benchmarks/ccb_test/aspnetcore-code-review-001/instruction.md

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
You are reviewing a recently merged pull request that adds a `DisplayName<TValue>` component to Blazor. This component reads `[Display]` and `[DisplayName]` attributes from model properties and renders the display name in forms. The PR introduces the component class, an expression member accessor helper with caching, and updates to project templates. However, several defects were introduced during the merge — both functional bugs and compliance violations.
1111

12-
Your task is to **find the defects, fix them in the code, and produce a structured review report**.
12+
Your task is to **find the defects and produce a structured review report with proposed fixes**.
1313

1414
## Context
1515

@@ -20,21 +20,21 @@ The DisplayName feature spans two core C# source files:
2020

2121
## Task
2222

23-
YOU MUST IMPLEMENT CODE CHANGES to complete this task.
24-
2523
Review the two files listed above for the following types of defects:
2624

2725
- **Functional bugs**: Logic errors that cause incorrect behavior (e.g., wrong attribute precedence, missing null checks, broken cache invalidation).
2826
- **Compliance violations**: Deviations from ASP.NET Core component conventions (e.g., unnecessary re-rendering, missing render optimization).
2927

3028
For each defect you find:
3129

32-
1. **Fix the code** by editing the affected file in `/workspace/`.
33-
2. **Record the defect** in your review report.
30+
1. **Describe the defect** in your review report.
31+
2. **Write a fix** as a unified diff in the `fix_patch` field.
32+
33+
**Do NOT edit source files directly.** Express all fixes as unified diffs in your review report. The evaluation system will apply your patches and verify correctness.
3434

3535
### Expected Output
3636

37-
After completing your review, write a JSON file at `/workspace/review.json` containing an array of defect objects:
37+
Write a JSON file at `/workspace/review.json` containing an array of defect objects:
3838

3939
```json
4040
[
@@ -43,7 +43,7 @@ After completing your review, write a JSON file at `/workspace/review.json` cont
4343
"line": 60,
4444
"severity": "critical",
4545
"description": "Brief description of what is wrong and why",
46-
"fix_applied": true
46+
"fix_patch": "--- a/src/Components/Web/src/Forms/ExpressionMemberAccessor.cs\n+++ b/src/Components/Web/src/Forms/ExpressionMemberAccessor.cs\n@@ -58,5 +58,5 @@\n- old line\n+ new line\n"
4747
}
4848
]
4949
```
@@ -53,13 +53,16 @@ Each entry must include:
5353
- `line` — Approximate line number where the defect occurs
5454
- `severity` — One of: `critical`, `high`, `medium`, `low`
5555
- `description` — What the defect is and what impact it has
56-
- `fix_applied` — Boolean indicating whether you committed a fix
56+
- `fix_patch` — Unified diff showing the proposed fix (use `--- a/` and `+++ b/` prefix format)
5757

5858
## Evaluation
5959

60-
Your review will be evaluated on detection accuracy and fix quality.
60+
Your review will be evaluated on:
61+
- **Detection accuracy** (50%): Precision and recall of reported defects
62+
- **Fix quality** (50%): Whether your proposed patches correctly resolve the defects
6163

62-
## Testing
64+
## Constraints
6365

6466
- **Time limit**: 1200 seconds
65-
- Run `bash /workspace/tests/test.sh` to verify your changes
67+
- Do NOT edit source files directly — express fixes only in `fix_patch`
68+
- Do NOT run tests — the evaluation system handles verification
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
#!/bin/bash
2+
# Artifact-only verifier helper: apply patches from agent artifacts to /repo_full copy.
3+
#
4+
# Source this at the TOP of test.sh for artifact-only mode. It detects
5+
# /tmp/.artifact_only_mode and:
6+
# 1. Copies /repo_full to /tmp/verify_repo (clean scoring copy)
7+
# 2. Exports VERIFY_REPO for downstream fix-pattern checks
8+
# 3. Provides apply_patches_from_review_json() to apply fix_patch fields
9+
# 4. Provides apply_patch_file() to apply standalone .patch files
10+
#
11+
# For non-artifact-only runs (legacy or sg_only), this script is a no-op.
12+
#
13+
# Usage in test.sh:
14+
# #!/bin/bash
15+
# set -e
16+
# # Legacy sg_only support (no-op if not in sg_only mode)
17+
# [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ] && source /tests/sgonly_verifier_wrapper.sh
18+
# # Artifact-only support
19+
# [ -f /tmp/.artifact_only_mode ] && [ -f /tests/artifact_verifier_lib.sh ] && source /tests/artifact_verifier_lib.sh
20+
# # ... rest of test.sh uses $VERIFY_REPO for file checks ...
21+
22+
# Mode detection. This file is designed to be sourced from test.sh.
if [ ! -f /tmp/.artifact_only_mode ]; then
    # Not in artifact-only mode — export VERIFY_REPO as /workspace for backward compat
    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
    export ARTIFACT_ONLY=false
    # `return` succeeds only when this file is sourced. If it was executed
    # directly, exit instead of falling through into the artifact-only setup.
    return 0 2>/dev/null || exit 0
fi

echo "[artifact_verifier] Detected artifact-only mode"
export ARTIFACT_ONLY=true

# Create clean scoring copy from /repo_full so every run starts from the
# pristine (defect-injected) tree, never from anything the agent touched.
if [ -d /repo_full ]; then
    echo "[artifact_verifier] Copying /repo_full -> /tmp/verify_repo..."
    rm -rf /tmp/verify_repo
    cp -a /repo_full /tmp/verify_repo
    export VERIFY_REPO="/tmp/verify_repo"
    # Ensure git works in the copy. Note: this changes the working directory
    # of the sourcing script to the scoring repo.
    cd /tmp/verify_repo
    git config --global --add safe.directory /tmp/verify_repo 2>/dev/null || true
    echo "[artifact_verifier] Clean scoring repo ready at $VERIFY_REPO"
else
    echo "[artifact_verifier] WARNING: /repo_full not found. Using /workspace as fallback."
    export VERIFY_REPO="/workspace"
fi
46+
47+
# ── Patch application functions ──────────────────────────────
48+
49+
# Apply a single unified diff string to VERIFY_REPO.
#
# Arguments:
#   $1 - patch text (unified diff using a/ and b/ path prefixes)
# Returns 0 on success (an empty patch is a successful no-op), 1 on failure.
# Side effect: leaves the caller's working directory at $VERIFY_REPO.
#
# Strategies, tried in order:
#   1. git apply                 (strictest)
#   2. patch -p1 --fuzz=3        (only after a clean --dry-run)
#   3. git apply --3way
apply_single_patch() {
    local patch_text="$1"
    local patch_file
    local method=""

    # Empty input is handled here rather than with `git apply --allow-empty`:
    # that flag needs Git >= 2.35 and is rejected as an unknown option by the
    # Git 2.34 shipped in ubuntu:22.04, which made every git attempt fail.
    if [ -z "${patch_text//[[:space:]]/}" ]; then
        echo "[artifact_verifier] Empty patch, nothing to apply"
        return 0
    fi

    if ! cd "$VERIFY_REPO"; then
        echo "[artifact_verifier] WARNING: Patch failed to apply"
        return 1
    fi

    # mktemp avoids the predictable /tmp/artifact_patch_$$ name.
    patch_file="$(mktemp /tmp/artifact_patch_XXXXXX)" || return 1
    # printf is used instead of echo so the text is written verbatim.
    printf '%s\n' "$patch_text" > "$patch_file"

    if git apply "$patch_file" 2>/dev/null; then
        method="git apply"
    # Dry-run first: a failing `patch` would otherwise leave some hunks applied
    # (plus .rej files) and hand a dirty tree to the next strategy.
    elif command -v patch > /dev/null 2>&1 \
        && patch -p1 --fuzz=3 --dry-run -i "$patch_file" > /dev/null 2>&1 \
        && patch -p1 --fuzz=3 -i "$patch_file" 2>/dev/null; then
        method="patch -p1 --fuzz=3"
    elif git apply --3way "$patch_file" 2>/dev/null; then
        method="git apply --3way"
    fi

    rm -f "$patch_file"

    if [ -n "$method" ]; then
        echo "[artifact_verifier] Patch applied via $method"
        return 0
    fi

    echo "[artifact_verifier] WARNING: Patch failed to apply"
    return 1
}
82+
83+
# Apply a standalone .patch file to VERIFY_REPO.
# Usage: apply_patch_file /workspace/solution.patch
#
# Returns 1 when the file does not exist; otherwise returns whatever
# apply_single_patch returns for the file's contents.
apply_patch_file() {
    local src="$1"
    local body

    # Guard clause: nothing to do without a patch file.
    [ -f "$src" ] || {
        echo "[artifact_verifier] Patch file not found: $src"
        return 1
    }

    body="$(cat "$src")"
    apply_single_patch "$body"
}
96+
97+
# Extract and apply fix_patch fields from a review.json file.
# Usage: applied=$(apply_patches_from_review_json /workspace/review.json)
#
# Output contract:
#   stdout - exactly one line: the number of successfully applied patches
#   stderr - all diagnostics, so they never pollute a captured count
# Exit status: 1 when review.json is missing, otherwise python3's exit status.
#
# Accepted review.json shapes: a top-level list of defect objects, or a dict
# wrapping such a list under "defects"/"findings"/"issues" (optionally nested
# inside "review"). A surrounding markdown code fence is stripped first.
apply_patches_from_review_json() {
    local review_path="$1"
    if [ ! -f "$review_path" ]; then
        echo "[artifact_verifier] review.json not found: $review_path" >&2
        echo "0"
        return 1
    fi

    # Use Python to parse JSON and extract/apply patches
    python3 - "$review_path" "$VERIFY_REPO" <<'PYEOF'
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile

review_path, verify_repo = sys.argv[1], sys.argv[2]

CONTAINER_KEYS = ("defects", "findings", "issues")
# `patch` is only a recommended package of git and may be absent from the image.
HAVE_PATCH = shutil.which("patch") is not None


def log(message):
    """Write a tagged diagnostic to stderr (stdout is reserved for the count)."""
    print(f"[artifact_verifier] {message}", file=sys.stderr)


def load_entries(path):
    """Return the list of defect entries found in the review file at *path*."""
    with open(path, encoding="utf-8") as fh:
        raw = fh.read()
    # Strip markdown code fences
    fenced = re.search(r'```(?:json)?\s*\n(.*?)```', raw, re.DOTALL)
    if fenced:
        raw = fenced.group(1).strip()
    data = json.loads(raw)

    if isinstance(data, list):
        return data
    if not isinstance(data, dict):
        return []
    # Handle nested objects: {"defects": [...]}, {"review": {"defects": [...]}}
    for key in CONTAINER_KEYS + ("review",):
        value = data.get(key)
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            for inner_key in CONTAINER_KEYS:
                inner = value.get(inner_key)
                if isinstance(inner, list):
                    return inner
    return []


def succeeds(cmd):
    """Run *cmd* inside the scoring repo; True only on exit status 0."""
    try:
        proc = subprocess.run(
            cmd,
            cwd=verify_repo,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError as exc:
        # Missing executable or unusable cwd: a failed attempt, not a crash.
        log(f"Could not run {cmd[0]}: {exc}")
        return False
    return proc.returncode == 0


def apply_patch(patch_path):
    """Try each strategy in turn, strictest first; True if one applied."""
    # No --allow-empty: it needs Git >= 2.35 and is rejected as an unknown
    # option by older Git. Empty patches are filtered out before this point.
    if succeeds(["git", "apply", patch_path]):
        return True
    patch_cmd = ["patch", "-p1", "--fuzz=3", "-i", patch_path]
    # Dry-run first so a failing patch never leaves hunks half-applied.
    if HAVE_PATCH and succeeds(patch_cmd + ["--dry-run"]) and succeeds(patch_cmd):
        return True
    return succeeds(["git", "apply", "--3way", patch_path])


try:
    entries = load_entries(review_path)
except (OSError, ValueError) as exc:
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
    log(f"Failed to parse review.json: {exc}")
    entries = []

applied = 0
failed = 0

for entry in entries:
    # Skip malformed entries instead of crashing the whole verifier.
    if not isinstance(entry, dict):
        continue
    patch_text = entry.get("fix_patch")
    if not isinstance(patch_text, str) or not patch_text.strip():
        continue
    # git apply reports a patch without a final newline as corrupt.
    if not patch_text.endswith("\n"):
        patch_text += "\n"

    # Write patch to temp file
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".patch", delete=False, dir="/tmp", encoding="utf-8"
    ) as pf:
        pf.write(patch_text)
        pf_path = pf.name

    try:
        ok = apply_patch(pf_path)
    finally:
        os.unlink(pf_path)

    if ok:
        applied += 1
    else:
        failed += 1
        file_name = entry.get("file", "unknown")
        log(f"Patch for {file_name} failed to apply")

log(f"Patches applied: {applied}, failed: {failed}")
print(applied)
PYEOF
}
199+
200+
echo "[artifact_verifier] Helper functions loaded"

0 commit comments

Comments
 (0)