Skip to content

Commit 8e9990f

Browse files
committed
benchmarks: migrate diff-similarity verifiers
1 parent a83a435 commit 8e9990f

File tree

9 files changed

+1165
-61
lines changed

9 files changed

+1165
-61
lines changed

benchmarks/csb_sdlc_fix/envoy-dfp-host-leak-fix-001/tests/test.sh

Lines changed: 144 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,155 @@ fi
1111
set -eo pipefail
1212
TASK_WORKDIR="${TASK_WORKDIR:-/workspace}"
1313
TASK_REPO_ROOT="${TASK_REPO_ROOT:-${VERIFY_REPO:-$TASK_WORKDIR}}"
14+
TASK_OUTPUT="${TASK_OUTPUT:-/workspace/answer.json}"
15+
VERIFY_REPO="${VERIFY_REPO:-$TASK_REPO_ROOT}"
16+
ARTIFACT_REQUIRED=false
1417
if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
15-
answer_json_fail_closed_if_missing_or_no_changes
18+
ARTIFACT_REQUIRED=true
1619
fi
17-
VERIFY_REPO="${VERIFY_REPO:-$TASK_REPO_ROOT}"
1820

1921
mkdir -p /logs/verifier
2022
cd "$TASK_REPO_ROOT"
2123
git config --global --add safe.directory "$TASK_REPO_ROOT" 2>/dev/null || true
24+
25+
#######################################
# Record a zero-reward "invalid_output" verdict for this task.
# Globals:
#   TASK_OUTPUT (read)       - reported as output_contract.primary_path
#   ARTIFACT_REQUIRED (read) - "true" marks the artifact as required
# Arguments:
#   $1 - failure/gate code
#   $2 - human-readable failure message
# Outputs:
#   Writes reward.json, validation_result.json and reward.txt under
#   /logs/verifier (the directory must already exist).
#######################################
write_invalid_output() {
  local failure_code="$1"
  local failure_message="$2"
  python3 - "$failure_code" "$failure_message" "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" <<'PYEOF'
import json
import sys


def dump_json(path, document):
    """Serialize `document` to `path` as indented JSON."""
    with open(path, "w") as handle:
        json.dump(document, handle, indent=2)


gate_code, gate_message, answer_path, artifact_flag = sys.argv[1:5]

# Legacy reward payload; also embedded verbatim in the validation result.
legacy_reward = {"reward": 0.0, "gate": gate_code, "error": gate_message}

validation = {
    "schema_version": "validation_result.v1alpha1",
    "status": "invalid_output",
    "scorable": False,
    "scorer_family": "diff_similarity",
    "reward": 0.0,
    "pass_threshold": 0.5,
    "passed": False,
    "output_contract": {
        "mode": "answer_json_bridge",
        "primary_path": answer_path,
        "required_artifact": artifact_flag == "true",
    },
    "sub_scores": {},
    "failure": {
        "code": gate_code,
        "message": gate_message,
        "stage": "output_validation",
    },
    "legacy": {"reward_json": legacy_reward},
}

dump_json("/logs/verifier/reward.json", legacy_reward)
dump_json("/logs/verifier/validation_result.json", validation)
with open("/logs/verifier/reward.txt", "w") as handle:
    handle.write("0.0")
PYEOF
}
70+
71+
#######################################
# Translate /logs/verifier/reward.json into validation_result.json.
# Globals:
#   TASK_OUTPUT (read)       - reported as output_contract.primary_path
#   ARTIFACT_REQUIRED (read) - "true" marks the artifact as required
# Arguments:
#   $1 - status to report when reward.json is usable (e.g. "scored")
# Outputs:
#   Writes /logs/verifier/validation_result.json.
# Notes:
#   Any unusable reward.json (missing, unparseable, not a JSON object,
#   non-numeric or non-finite reward, or carrying an "error" field)
#   yields status "verifier_error" instead of crashing the verifier.
#######################################
write_validation_from_reward_json() {
  local fallback_status="$1"
  python3 - "$fallback_status" "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" <<'PYEOF'
import json
import math
import os
import sys

fallback_status, primary_path, required_artifact = sys.argv[1:4]
required = required_artifact == "true"
status = fallback_status
reward_payload = {}
reward_json_path = "/logs/verifier/reward.json"

if os.path.isfile(reward_json_path):
    try:
        with open(reward_json_path) as f:
            reward_payload = json.load(f)
    except Exception as exc:
        status = "verifier_error"
        reward_payload = {"reward": 0.0, "error": f"Failed to parse reward.json: {exc}"}
    else:
        # Valid JSON that is not an object (list, null, number, ...) has no
        # .get(); treat it as a verifier error rather than raising.
        if not isinstance(reward_payload, dict):
            status = "verifier_error"
            reward_payload = {
                "reward": 0.0,
                "error": "reward.json must contain a JSON object, got "
                + type(reward_payload).__name__,
            }
else:
    status = "verifier_error"
    reward_payload = {"reward": 0.0, "error": "reward.json not written by verifier"}

reward = reward_payload.get("reward", 0.0)
try:
    reward = float(reward)
except (TypeError, ValueError):
    reward = 0.0
    status = "verifier_error"
else:
    # json.load accepts NaN/Infinity; neither is a usable score, and
    # Infinity would otherwise satisfy the pass threshold.
    if not math.isfinite(reward):
        reward = 0.0
        status = "verifier_error"

# An explicit error from the verifier always overrides the fallback status.
if reward_payload.get("error"):
    status = "verifier_error"

sub_scores = {}
for key in ("file_recall", "line_recall", "line_precision"):
    value = reward_payload.get(key)
    if isinstance(value, (int, float)):
        sub_scores[key] = float(value)

details = {}
for key in ("expected_files", "actual_files", "expected_lines_total", "actual_lines_total", "gate", "error"):
    value = reward_payload.get(key)
    if value is not None:
        details[key] = value

failure = None
passed = False
if status == "scored":
    passed = reward >= 0.5
else:
    failure = {
        "code": "verifier_exception" if reward_payload.get("error") else "missing_reward_json",
        "message": str(reward_payload.get("error") or "Verifier did not produce a usable reward payload"),
        "stage": "scoring",
    }

payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": status == "scored",
    "scorer_family": "diff_similarity",
    "reward": reward,
    "pass_threshold": 0.5,
    "passed": passed,
    "output_contract": {
        "mode": "answer_json_bridge",
        "primary_path": primary_path,
        "required_artifact": required,
    },
    "sub_scores": sub_scores,
    "failure": failure,
    "legacy": {
        "reward_json": reward_payload,
    },
}
if details:
    payload["details"] = details

with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
}
154+
155+
# Fail closed in artifact-only mode: when ANSWER_JSON_MISSING or
# ANSWER_JSON_NO_CHANGES is "true" (both default to "false" when unset;
# presumably set by an earlier answer.json helper — confirm), record an
# invalid-output verdict and stop before any scoring runs.
if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
  if [ "${ANSWER_JSON_MISSING:-false}" = "true" ] \
    || [ "${ANSWER_JSON_NO_CHANGES:-false}" = "true" ]; then
    write_invalid_output "missing_required_output" \
      "answer.json missing or has no usable artifact payload at $TASK_OUTPUT"
    # Exit 0: the verifier itself ran to completion; the verdict is in the
    # files written above.
    exit 0
  fi
fi
162+
22163
# Resolve initial commit — mirrors use orphan commits with different SHAs than upstream
23164
PRE_FIX_REV=$(git rev-parse HEAD 2>/dev/null || echo "HEAD")
24165
python3 /tests/verify_diff.py \
@@ -28,6 +169,7 @@ python3 /tests/verify_diff.py \
28169
2>&1 | tee /logs/verifier/verifier.log
29170
REWARD=$(python3 -c "import json; print(json.load(open('/logs/verifier/reward.json')).get('reward', 0.0))" 2>/dev/null || echo "0.0")
30171
echo "$REWARD" > /logs/verifier/reward.txt
172+
write_validation_from_reward_json "scored"
31173
echo "Final reward: $REWARD"
32174
git diff "$PRE_FIX_REV" > /logs/verifier/agent.diff 2>/dev/null || true
33175
git diff "$PRE_FIX_REV" --stat > /logs/verifier/diff.stat 2>/dev/null || true

benchmarks/csb_sdlc_fix/envoy-udp-proxy-cds-fix-001/tests/test.sh

Lines changed: 144 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,155 @@ fi
1111
set -eo pipefail
1212
TASK_WORKDIR="${TASK_WORKDIR:-/workspace}"
1313
TASK_REPO_ROOT="${TASK_REPO_ROOT:-${VERIFY_REPO:-$TASK_WORKDIR}}"
14+
TASK_OUTPUT="${TASK_OUTPUT:-/workspace/answer.json}"
15+
VERIFY_REPO="${VERIFY_REPO:-$TASK_REPO_ROOT}"
16+
ARTIFACT_REQUIRED=false
1417
if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
15-
answer_json_fail_closed_if_missing_or_no_changes
18+
ARTIFACT_REQUIRED=true
1619
fi
17-
VERIFY_REPO="${VERIFY_REPO:-$TASK_REPO_ROOT}"
1820

1921
mkdir -p /logs/verifier
2022
cd "$TASK_REPO_ROOT"
2123
git config --global --add safe.directory "$TASK_REPO_ROOT" 2>/dev/null || true
24+
25+
#######################################
# Record a zero-reward "invalid_output" verdict for this task.
# Globals:
#   TASK_OUTPUT (read)       - reported as output_contract.primary_path
#   ARTIFACT_REQUIRED (read) - "true" marks the artifact as required
# Arguments:
#   $1 - failure/gate code
#   $2 - human-readable failure message
# Outputs:
#   Writes reward.json, validation_result.json and reward.txt under
#   /logs/verifier (the directory must already exist).
#######################################
write_invalid_output() {
  local failure_code="$1"
  local failure_message="$2"
  python3 - "$failure_code" "$failure_message" "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" <<'PYEOF'
import json
import sys


def dump_json(path, document):
    """Serialize `document` to `path` as indented JSON."""
    with open(path, "w") as handle:
        json.dump(document, handle, indent=2)


gate_code, gate_message, answer_path, artifact_flag = sys.argv[1:5]

# Legacy reward payload; also embedded verbatim in the validation result.
legacy_reward = {"reward": 0.0, "gate": gate_code, "error": gate_message}

validation = {
    "schema_version": "validation_result.v1alpha1",
    "status": "invalid_output",
    "scorable": False,
    "scorer_family": "diff_similarity",
    "reward": 0.0,
    "pass_threshold": 0.5,
    "passed": False,
    "output_contract": {
        "mode": "answer_json_bridge",
        "primary_path": answer_path,
        "required_artifact": artifact_flag == "true",
    },
    "sub_scores": {},
    "failure": {
        "code": gate_code,
        "message": gate_message,
        "stage": "output_validation",
    },
    "legacy": {"reward_json": legacy_reward},
}

dump_json("/logs/verifier/reward.json", legacy_reward)
dump_json("/logs/verifier/validation_result.json", validation)
with open("/logs/verifier/reward.txt", "w") as handle:
    handle.write("0.0")
PYEOF
}
70+
71+
#######################################
# Translate /logs/verifier/reward.json into validation_result.json.
# Globals:
#   TASK_OUTPUT (read)       - reported as output_contract.primary_path
#   ARTIFACT_REQUIRED (read) - "true" marks the artifact as required
# Arguments:
#   $1 - status to report when reward.json is usable (e.g. "scored")
# Outputs:
#   Writes /logs/verifier/validation_result.json.
# Notes:
#   Any unusable reward.json (missing, unparseable, not a JSON object,
#   non-numeric or non-finite reward, or carrying an "error" field)
#   yields status "verifier_error" instead of crashing the verifier.
#######################################
write_validation_from_reward_json() {
  local fallback_status="$1"
  python3 - "$fallback_status" "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" <<'PYEOF'
import json
import math
import os
import sys

fallback_status, primary_path, required_artifact = sys.argv[1:4]
required = required_artifact == "true"
status = fallback_status
reward_payload = {}
reward_json_path = "/logs/verifier/reward.json"

if os.path.isfile(reward_json_path):
    try:
        with open(reward_json_path) as f:
            reward_payload = json.load(f)
    except Exception as exc:
        status = "verifier_error"
        reward_payload = {"reward": 0.0, "error": f"Failed to parse reward.json: {exc}"}
    else:
        # Valid JSON that is not an object (list, null, number, ...) has no
        # .get(); treat it as a verifier error rather than raising.
        if not isinstance(reward_payload, dict):
            status = "verifier_error"
            reward_payload = {
                "reward": 0.0,
                "error": "reward.json must contain a JSON object, got "
                + type(reward_payload).__name__,
            }
else:
    status = "verifier_error"
    reward_payload = {"reward": 0.0, "error": "reward.json not written by verifier"}

reward = reward_payload.get("reward", 0.0)
try:
    reward = float(reward)
except (TypeError, ValueError):
    reward = 0.0
    status = "verifier_error"
else:
    # json.load accepts NaN/Infinity; neither is a usable score, and
    # Infinity would otherwise satisfy the pass threshold.
    if not math.isfinite(reward):
        reward = 0.0
        status = "verifier_error"

# An explicit error from the verifier always overrides the fallback status.
if reward_payload.get("error"):
    status = "verifier_error"

sub_scores = {}
for key in ("file_recall", "line_recall", "line_precision"):
    value = reward_payload.get(key)
    if isinstance(value, (int, float)):
        sub_scores[key] = float(value)

details = {}
for key in ("expected_files", "actual_files", "expected_lines_total", "actual_lines_total", "gate", "error"):
    value = reward_payload.get(key)
    if value is not None:
        details[key] = value

failure = None
passed = False
if status == "scored":
    passed = reward >= 0.5
else:
    failure = {
        "code": "verifier_exception" if reward_payload.get("error") else "missing_reward_json",
        "message": str(reward_payload.get("error") or "Verifier did not produce a usable reward payload"),
        "stage": "scoring",
    }

payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": status == "scored",
    "scorer_family": "diff_similarity",
    "reward": reward,
    "pass_threshold": 0.5,
    "passed": passed,
    "output_contract": {
        "mode": "answer_json_bridge",
        "primary_path": primary_path,
        "required_artifact": required,
    },
    "sub_scores": sub_scores,
    "failure": failure,
    "legacy": {
        "reward_json": reward_payload,
    },
}
if details:
    payload["details"] = details

with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
}
154+
155+
# Fail closed in artifact-only mode: when ANSWER_JSON_MISSING or
# ANSWER_JSON_NO_CHANGES is "true" (both default to "false" when unset;
# presumably set by an earlier answer.json helper — confirm), record an
# invalid-output verdict and stop before any scoring runs.
if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
  if [ "${ANSWER_JSON_MISSING:-false}" = "true" ] \
    || [ "${ANSWER_JSON_NO_CHANGES:-false}" = "true" ]; then
    write_invalid_output "missing_required_output" \
      "answer.json missing or has no usable artifact payload at $TASK_OUTPUT"
    # Exit 0: the verifier itself ran to completion; the verdict is in the
    # files written above.
    exit 0
  fi
fi
162+
22163
# Resolve initial commit — mirrors use orphan commits with different SHAs than upstream
23164
PRE_FIX_REV=$(git rev-parse HEAD 2>/dev/null || echo "HEAD")
24165
python3 /tests/verify_diff.py \
@@ -28,6 +169,7 @@ python3 /tests/verify_diff.py \
28169
2>&1 | tee /logs/verifier/verifier.log
29170
REWARD=$(python3 -c "import json; print(json.load(open('/logs/verifier/reward.json')).get('reward', 0.0))" 2>/dev/null || echo "0.0")
30171
echo "$REWARD" > /logs/verifier/reward.txt
172+
write_validation_from_reward_json "scored"
31173
echo "Final reward: $REWARD"
32174
git diff "$PRE_FIX_REV" > /logs/verifier/agent.diff 2>/dev/null || true
33175
git diff "$PRE_FIX_REV" --stat > /logs/verifier/diff.stat 2>/dev/null || true

0 commit comments

Comments
 (0)