Skip to content

Commit d055a6b

Browse files
committed
Close canonical artifact-mode coverage gaps
1 parent a227d2e commit d055a6b

File tree

128 files changed

+15326
-507
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

128 files changed

+15326
-507
lines changed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# tidb-query-plan-regression-debug-001 — artifact_only variant (build-requiring)
# Repos cloned for baseline agent to read locally.
# MCP agent deletes source files at runtime via agent startup script.
# Verifier applies patches from answer.json to /repo_full copy for scoring.

FROM ubuntu:22.04

ENV DEBIAN_FRONTEND=noninteractive

# `patch` is needed by the verifier: answer_json_verifier_lib.sh falls back to
# `patch -p1 --fuzz=3` whenever a strict `git apply` rejects a diff.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    ca-certificates \
    python3 \
    curl \
    patch \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Clone repo at pinned version
RUN git clone --depth 1 https://github.com/sg-evals/tidb--v8.5.0.git . && \
    git config user.email "agent@benchmark.local" && \
    git config user.name "Agent"

# Create logs directory for agent output
RUN mkdir -p /logs/agent /logs/verifier

# Pre-create claude user and set ownership at build time
RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
    for d in /workspace /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true

# --- artifact_only: backup full repo for verifier scoring ---
# Source stays in /workspace (readable by baseline agent).
# MCP agent deletes source files at runtime via agent startup script.
# `cp -a` preserves the claude:claude ownership set above, so mode 700 alone
# would still leave the backup fully accessible to the claude user. Hand the
# copy to root so the 700 mode actually restricts it to the verifier.
# NOTE(review): assumes the verifier runs as root — confirm against the harness.
RUN cp -a /workspace /repo_full && \
    chown -R root:root /repo_full && \
    chmod 700 /repo_full
RUN touch /tmp/.artifact_only_mode && echo '/workspace' > /tmp/.artifact_only_workdir

ENTRYPOINT []
Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
#!/bin/bash
2+
# answer_json_verifier_lib.sh — Unified answer.json verifier for artifact configs.
3+
#
4+
# Source this at the TOP of test.sh. It detects /tmp/.artifact_only_mode and:
5+
# 1. Validates /workspace/answer.json exists and is valid JSON
6+
#   2. Extracts analysis.summary + analysis.reasoning → $ANALYSIS_TEXT_FILE (for keyword/pattern scoring)
7+
# 3. Extracts analysis.files_examined → $ANALYSIS_FILES_FILE (for IR metrics)
8+
# 4. If changes[] has diffs: applies diffs directly to /repo_full (zero-copy, container is ephemeral)
9+
# 5. Exports VERIFY_REPO, ARTIFACT_ONLY, ANALYSIS_TEXT_FILE, etc.
10+
#
11+
# For non-artifact-only runs, this script is a no-op that sets safe defaults.
12+
#
13+
# Usage in test.sh:
14+
# #!/bin/bash
15+
# set -e
16+
# # Artifact mode: parse answer.json, apply patches, export analysis
17+
# if [ -f /tmp/.artifact_only_mode ] && [ -f /tests/answer_json_verifier_lib.sh ]; then
18+
# source /tests/answer_json_verifier_lib.sh
19+
# fi
20+
# # ... rest of test.sh uses $VERIFY_REPO, $ANALYSIS_TEXT_FILE, etc. ...
21+
22+
# Outside artifact-only mode: publish inert defaults so callers can reference
# every variable unconditionally, then stop processing the library.
if [[ ! -f /tmp/.artifact_only_mode ]]; then
  # Keep a caller-provided VERIFY_REPO; otherwise default to /workspace.
  : "${VERIFY_REPO:=/workspace}"
  export VERIFY_REPO
  export ARTIFACT_ONLY=false ANSWER_JSON_MISSING=false ANSWER_JSON_NO_CHANGES=false
  export ANALYSIS_TEXT_FILE="" ANALYSIS_FILES_FILE="" ANSWER_JSON=""
  # `return` only succeeds when this file is sourced; if it is executed
  # directly the failure is silenced and execution continues below.
  return 0 2>/dev/null || true
fi
33+
34+
# Artifact-only mode is active: announce it and publish the fixed locations
# and initial flag values used by the rest of the library.
echo "[answer_json_verifier] Detected artifact-only mode"
export ARTIFACT_ONLY=true \
  ANSWER_JSON="/workspace/answer.json" \
  ANALYSIS_TEXT_FILE="/tmp/analysis.txt" \
  ANALYSIS_FILES_FILE="/tmp/analysis_files.json" \
  ANSWER_JSON_MISSING=false \
  ANSWER_JSON_NO_CHANGES=false
41+
42+
#######################################
# Score the run 0.0 and terminate the calling shell when, in artifact-only
# mode, answer.json was not produced. Does nothing otherwise.
# Globals:   ARTIFACT_ONLY (read), ANSWER_JSON_MISSING (read)
# Outputs:   writes /logs/verifier/reward.txt; status line on stdout
# Returns:   0 when no action is needed; otherwise exits the shell with 0
#######################################
answer_json_fail_closed_if_missing() {
  [ "${ARTIFACT_ONLY:-false}" = "true" ] || return 0
  [ "${ANSWER_JSON_MISSING:-false}" = "true" ] || return 0

  mkdir -p /logs/verifier
  printf '0.0\n' > /logs/verifier/reward.txt
  echo "[answer_json_verifier] Scored 0.0 because answer.json is missing"
  exit 0
}
50+
51+
#######################################
# Score the run 0.0 and terminate the calling shell when, in artifact-only
# mode, answer.json is missing or carried no changes. Does nothing otherwise.
# Globals:   ARTIFACT_ONLY, ANSWER_JSON_MISSING, ANSWER_JSON_NO_CHANGES (read)
# Outputs:   writes /logs/verifier/reward.txt; status line on stdout
# Returns:   0 when no action is needed; otherwise exits the shell with 0
#######################################
answer_json_fail_closed_if_missing_or_no_changes() {
  [ "${ARTIFACT_ONLY:-false}" = "true" ] || return 0
  if [ "${ANSWER_JSON_MISSING:-false}" != "true" ] \
    && [ "${ANSWER_JSON_NO_CHANGES:-false}" != "true" ]; then
    return 0
  fi

  mkdir -p /logs/verifier
  printf '0.0\n' > /logs/verifier/reward.txt
  echo "[answer_json_verifier] Scored 0.0 because answer.json has no usable artifact payload"
  exit 0
}
61+
62+
#######################################
# Copy the extracted analysis text to a verifier-specific location, creating
# the destination directory first. No-op unless artifact-only mode is active
# and the analysis text file exists.
# Globals:   ARTIFACT_ONLY (read), ANALYSIS_TEXT_FILE (read)
# Arguments: $1 - destination file path
# Outputs:   confirmation line on stdout when a copy is made
#######################################
answer_json_copy_analysis_text() {
  local dest="$1"

  [ "${ARTIFACT_ONLY:-false}" = "true" ] || return 0
  [ -f "${ANALYSIS_TEXT_FILE:-}" ] || return 0

  mkdir -p "$(dirname "$dest")"
  cp "$ANALYSIS_TEXT_FILE" "$dest"
  echo "[answer_json_verifier] Copied analysis text to $dest"
}
70+
71+
# Drop marker files that an earlier run may have left behind.
rm -f /tmp/.answer_json_no_changes /tmp/.answer_json_verify_repo

# ── Validate answer.json ──────────────────────────────────────────────────

# Without the artifact there is nothing to parse: flag it as missing and stop
# so that test.sh can score the run as 0.
if ! [ -f "$ANSWER_JSON" ]; then
  printf '%s\n' \
    "[answer_json_verifier] ERROR: /workspace/answer.json not found" \
    "[answer_json_verifier] Agent did not produce required artifact"
  : "${VERIFY_REPO:=/workspace}"
  export VERIFY_REPO ANSWER_JSON_MISSING=true
  # `return` only succeeds when sourced; the failure is silenced otherwise.
  return 0 2>/dev/null || true
fi
83+
84+
# Validate JSON and extract fields using Python
85+
#######################################
# Parse an answer.json artifact and materialise what the verifiers consume:
# the analysis text, the examined-files list, a synthetic review.json, any
# new files described by diffs, fault_localization_result.json, and the
# diffs themselves applied in place to /repo_full.
# Globals:   ANALYSIS_TEXT_FILE, ANALYSIS_FILES_FILE (read from environment)
# Arguments: $1 - path to answer.json
# Outputs:   progress on stdout, warnings/errors on stderr; leaves either
#            /tmp/.answer_json_no_changes or /tmp/.answer_json_verify_repo
#            behind as a marker for the calling shell
# Returns:   exit status of the embedded Python program
#######################################
answer_json_extract_and_apply() {
  python3 - "$1" <<'PYEOF'
import json, os, re, subprocess, sys, tempfile

answer_path = sys.argv[1]
analysis_text_file = os.environ.get("ANALYSIS_TEXT_FILE", "/tmp/analysis.txt")
analysis_files_file = os.environ.get("ANALYSIS_FILES_FILE", "/tmp/analysis_files.json")


def as_text(value):
    """Return value as text: str unchanged, "" for falsy, JSON for anything else."""
    if isinstance(value, str):
        return value
    if not value:
        return ""
    return json.dumps(value)


# ── Parse answer.json ─────────────────────────────────────────────────────
try:
    with open(answer_path) as f:
        raw = f.read()

    # Strip markdown code fences if agent wrapped JSON in ```json blocks
    m = re.search(r'```(?:json)?\s*\n(.*?)```', raw, re.DOTALL)
    if m:
        raw = m.group(1).strip()

    answer = json.loads(raw)
    if not isinstance(answer, dict):
        print("[answer_json_verifier] WARNING: answer.json is not a JSON object", file=sys.stderr)
        answer = {}
except (json.JSONDecodeError, ValueError) as e:
    print(f"[answer_json_verifier] ERROR: Failed to parse answer.json: {e}", file=sys.stderr)
    answer = {}
except FileNotFoundError:
    print("[answer_json_verifier] ERROR: answer.json not found", file=sys.stderr)
    answer = {}

# ── Extract analysis fields ───────────────────────────────────────────────
analysis = answer.get("analysis", {})
if not isinstance(analysis, dict):
    analysis = {}

# Build analysis text from summary + reasoning (what verifiers will grep).
# Non-string values are serialised instead of crashing the join below.
summary = as_text(analysis.get("summary", ""))
reasoning = as_text(analysis.get("reasoning", ""))
analysis_text = "\n\n".join(part for part in (summary, reasoning) if part)

with open(analysis_text_file, "w") as f:
    f.write(analysis_text)
print(f"[answer_json_verifier] Wrote analysis text ({len(analysis_text)} chars) to {analysis_text_file}")

# Extract files_examined for IR metrics
files_examined = analysis.get("files_examined", [])
if not isinstance(files_examined, list):
    files_examined = []
with open(analysis_files_file, "w") as f:
    json.dump(files_examined, f, indent=2)
print(f"[answer_json_verifier] Wrote {len(files_examined)} examined files to {analysis_files_file}")

# ── Extract and apply diffs from changes[] ────────────────────────────────
changes = answer.get("changes", [])
if not isinstance(changes, list):
    changes = []

# Only JSON objects can carry file/description/diff fields; anything else
# would raise AttributeError on .get() further down, so drop it here.
usable_changes = [c for c in changes if isinstance(c, dict)]
if len(usable_changes) != len(changes):
    print(
        f"[answer_json_verifier] WARNING: Ignored {len(changes) - len(usable_changes)} "
        "changes[] entries that are not JSON objects",
        file=sys.stderr,
    )
changes = usable_changes

if not changes:
    print("[answer_json_verifier] No changes[] in answer.json (analysis-only task)")
    # Signal no patches needed
    with open("/tmp/.answer_json_no_changes", "w") as f:
        f.write("1")

# ── Generate synthetic review.json for code-review verifiers ──────────────
# Code-review verifiers expect /workspace/review.json with [{file, description, fix_patch}]
# Generate this from answer.json changes[] so existing F1 scoring works unchanged.
if changes:
    review_entries = [
        {
            "file": change.get("file", ""),
            "description": change.get("description", ""),
            "fix_patch": change.get("diff", ""),
        }
        for change in changes
    ]
    review_json_path = "/workspace/review.json"
    with open(review_json_path, "w") as f:
        json.dump(review_entries, f, indent=2)
    print(f"[answer_json_verifier] Generated synthetic review.json ({len(review_entries)} entries)")

# ── Extract new-file diffs to /workspace/ ─────────────────────────────────
# For find-and-prove tasks: agent writes regression tests as new-file diffs.
# Extract file content from diffs like "--- /dev/null\n+++ b/regression_test.py"
# and write directly to /workspace/.
# Anchor on the old-side header: a bare "/dev/null" substring would also match
# deletion diffs ("+++ /dev/null") and diffs that merely mention the path.
new_file_header = re.compile(r"^--- /dev/null", re.MULTILINE)
new_files_written = 0
for change in changes:
    diff_text = change.get("diff", "")
    file_path = change.get("file", "")
    if not isinstance(diff_text, str) or not isinstance(file_path, str):
        continue
    if not diff_text or not file_path:
        continue
    if not new_file_header.search(diff_text):
        continue

    # Collect hunk payload: added lines without their "+", plus any
    # unprefixed lines verbatim; removed lines and "\ No newline" are skipped.
    content_lines = []
    in_hunk = False
    for line in diff_text.split("\n"):
        if line.startswith("@@"):
            in_hunk = True
        elif not in_hunk or line.startswith(("-", "\\")):
            continue
        elif line.startswith("+"):
            content_lines.append(line[1:])
        else:
            content_lines.append(line)

    # Files are flattened into /workspace/ by basename; a path with no
    # basename (e.g. ending in "/") has no usable target.
    base_name = os.path.basename(file_path)
    if not content_lines or not base_name:
        continue

    target = os.path.join("/workspace", base_name)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "w") as f:
        f.write("\n".join(content_lines))
        if content_lines[-1] != "":
            f.write("\n")
    new_files_written += 1
    print(f"[answer_json_verifier] Extracted new file: {target}")

if new_files_written > 0:
    print(f"[answer_json_verifier] Extracted {new_files_written} new files to /workspace/")

# ── Generate fault_localization_result.json ────────────────────────────────
# Fault-loc verifiers expect /workspace/fault_localization_result.json with
# {buggy_files, buggy_functions, reasoning, confidence}. Populate from analysis.
if analysis:
    fl_result = {}
    # buggy_files: taken from files_examined. Entries are normally objects
    # with a "path" key; bare path strings are accepted too, and anything
    # else is skipped rather than crashing on .get().
    fl_files = []
    for examined in files_examined:
        path = examined.get("path") if isinstance(examined, dict) else examined
        if isinstance(path, str) and path:
            fl_files.append(path)
    if fl_files:
        fl_result["buggy_files"] = fl_files
    # buggy_functions: look for a "functions" or "buggy_functions" key in analysis
    fl_funcs = analysis.get("buggy_functions", analysis.get("functions", []))
    if isinstance(fl_funcs, list) and fl_funcs:
        fl_result["buggy_functions"] = fl_funcs
    # reasoning: the analysis.reasoning text (summary is not included)
    if reasoning:
        fl_result["reasoning"] = reasoning
    # confidence: look for a confidence key
    confidence = analysis.get("confidence", None)
    if isinstance(confidence, (int, float)):
        fl_result["confidence"] = confidence
    # Only write if we have substantive content, and never overwrite a file
    # that already exists at that path.
    fl_path = "/workspace/fault_localization_result.json"
    if fl_result and not os.path.exists(fl_path):
        with open(fl_path, "w") as f:
            json.dump(fl_result, f, indent=2)
        print("[answer_json_verifier] Generated fault_localization_result.json")

# ── Apply diffs directly to /repo_full (zero-copy) ───────────────────────
if not changes:
    sys.exit(0)

# Apply diffs in-place — container is ephemeral, no need to preserve /repo_full
repo_full = "/repo_full"
verify_repo = repo_full

if not os.path.isdir(repo_full):
    print(f"[answer_json_verifier] WARNING: {repo_full} not found. Cannot apply diffs.")
    with open("/tmp/.answer_json_no_changes", "w") as f:
        f.write("1")
    sys.exit(0)


def run_tool(cmd, cwd=None):
    """Run cmd quietly; return its exit status, or None if the executable is missing."""
    try:
        return subprocess.run(cmd, cwd=cwd, capture_output=True, text=True).returncode
    except FileNotFoundError:
        # A missing tool must not abort the verifier: the remaining strategies
        # and the VERIFY_REPO marker written at the end still have to run.
        print(f"[answer_json_verifier] WARNING: {cmd[0]} is not installed", file=sys.stderr)
        return None


def git_apply(patch_path, *extra):
    """Apply patch_path with `git apply`; True when it applied cleanly."""
    status = run_tool(["git", "apply", "--allow-empty", *extra, patch_path], cwd=verify_repo)
    if status == 129:
        # 129 is git's usage-error status: Git older than 2.35 does not know
        # --allow-empty, so retry without it instead of failing every diff.
        status = run_tool(["git", "apply", *extra, patch_path], cwd=verify_repo)
    return status == 0


def patch_apply(patch_path):
    """Apply patch_path with `patch -p1 --fuzz=3`; True when it applied cleanly."""
    return run_tool(["patch", "-p1", "--fuzz=3", "-i", patch_path], cwd=verify_repo) == 0


# Ensure verifier (root) can write to repo_full
run_tool(["chmod", "-R", "u+w", repo_full])
print(f"[answer_json_verifier] Applying diffs to {repo_full} (in-place, zero-copy)...")
run_tool(["git", "config", "--global", "--add", "safe.directory", repo_full])

# Tried in order, strictest first; the first success wins.
strategies = (
    ("git apply", git_apply),
    ("patch -p1", patch_apply),
    ("git apply --3way", lambda path: git_apply(path, "--3way")),
)

# Apply each diff
applied = 0
failed = 0

for entry in changes:
    diff_text = entry.get("diff", "")
    if not isinstance(diff_text, str) or not diff_text.strip():
        continue

    file_name = entry.get("file", "unknown")

    with tempfile.NamedTemporaryFile(mode="w", suffix=".patch", delete=False, dir="/tmp") as pf:
        pf.write(diff_text)
        pf_path = pf.name

    try:
        method = next((label for label, attempt in strategies if attempt(pf_path)), None)
    finally:
        # Remove the temp patch on every path, including unexpected errors.
        os.unlink(pf_path)

    if method is not None:
        applied += 1
        print(f"[answer_json_verifier] Applied diff for {file_name} ({method})")
    else:
        failed += 1
        print(f"[answer_json_verifier] WARNING: Diff for {file_name} failed to apply", file=sys.stderr)

print(f"[answer_json_verifier] Diffs applied: {applied}, failed: {failed}")

# Write verify_repo path for shell to pick up
with open("/tmp/.answer_json_verify_repo", "w") as f:
    f.write(verify_repo)
PYEOF
}

answer_json_extract_and_apply "$ANSWER_JSON"
321+
322+
# Pick up VERIFY_REPO from Python output
323+
# Translate the marker files left by the Python step into exported state.
if [[ -f /tmp/.answer_json_no_changes ]]; then
  export ANSWER_JSON_NO_CHANGES=true
fi

if [[ -f /tmp/.answer_json_verify_repo ]]; then
  # Diffs were applied: switch to the patched repository.
  export VERIFY_REPO="$(< /tmp/.answer_json_verify_repo)"
  cd "$VERIFY_REPO"
  echo "[answer_json_verifier] VERIFY_REPO set to $VERIFY_REPO"
elif [[ -f /tmp/.answer_json_no_changes && -d /repo_full ]]; then
  # Analysis-only: nothing was patched; prefer the pristine /repo_full backup.
  export VERIFY_REPO="/repo_full"
  echo "[answer_json_verifier] Analysis-only mode, VERIFY_REPO=$VERIFY_REPO"
elif [[ -f /tmp/.answer_json_no_changes ]]; then
  # Analysis-only without a backup: keep the caller's value or use /workspace.
  export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
  echo "[answer_json_verifier] Analysis-only mode, VERIFY_REPO=$VERIFY_REPO"
else
  # Neither marker exists (the Python step did not finish normally).
  export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
  echo "[answer_json_verifier] WARNING: Using fallback VERIFY_REPO=$VERIFY_REPO"
fi

# Clean up temp markers
rm -f /tmp/.answer_json_verify_repo /tmp/.answer_json_no_changes

echo "[answer_json_verifier] Library loaded (ARTIFACT_ONLY=$ARTIFACT_ONLY, VERIFY_REPO=$VERIFY_REPO)"

0 commit comments

Comments
 (0)