Skip to content

Commit 0580afc

Browse files
committed
Add validation results to repo state heuristics
1 parent 8f60e2f commit 0580afc

File tree

55 files changed

+6751
-565
lines changed
  • benchmarks
    • csb_sdlc_debug/tidb-query-plan-regression-debug-001/tests
    • csb_sdlc_design
      • django-pre-validate-signal-design-001/tests
      • django-rate-limit-design-001/tests
      • elasticsearch-shard-alloc-design-001/tests
      • flipt-protobuf-metadata-design-001/tests
    • csb_sdlc_document
      • godot-gdscript-api-docgen-001/tests
      • grpc-channel-api-docgen-001/tests
    • csb_sdlc_feature
      • cilium-policy-audit-logger-feat-001/tests
      • cilium-policy-quota-feat-001/tests
      • curl-http3-priority-feat-001/tests
      • django-rate-limit-middleware-feat-001/tests
      • envoy-custom-header-filter-feat-001/tests
      • numpy-rolling-median-feat-001/tests
      • pandas-merge-asof-indicator-feat-001/tests
      • postgres-copy-csv-header-feat-001/tests
      • prometheus-silence-bulk-api-feat-001/tests
      • pytorch-gradient-noise-feat-001/tests
      • servo-css-container-query-feat-001/tests
      • terraform-compact-diff-fmt-feat-001/tests
      • vscode-custom-fold-region-feat-001/tests
    • csb_sdlc_fix
      • django-modelchoice-fk-fix-001/tests
    • csb_sdlc_refactor
      • beam-pipeline-builder-refac-001/tests
      • cilium-endpoint-manager-refac-001/tests
      • django-request-factory-refac-001/tests
      • envoy-listener-manager-refac-001/tests
      • flipt-dep-refactor-001/tests
      • flipt-flagexists-refactor-001/tests
      • istio-discovery-server-refac-001/tests
      • kubernetes-scheduler-profile-refac-001/tests
      • numpy-array-dispatch-refac-001/tests
      • pandas-index-engine-refac-001/tests
      • prometheus-query-engine-refac-001/tests
      • pytorch-optimizer-foreach-refac-001/tests
      • roslyn-symbol-resolver-refac-001/tests
      • terraform-eval-context-refac-001/tests
    • csb_sdlc_secure
      • ceph-rgw-auth-secure-001/tests
      • django-audit-trail-implement-001/tests
      • django-cross-team-boundary-001/tests
      • django-legacy-dep-vuln-001/tests
      • django-repo-scoped-access-001/tests
      • django-role-based-access-001/tests
      • django-sensitive-file-exclusion-001/tests
      • flipt-degraded-context-fix-001/tests
      • flipt-repo-scoped-access-001/tests
      • typescript-type-narrowing-secure-001/tests
    • csb_sdlc_test
    • csb_sdlc_understand
      • clickhouse-mergetree-arch-understand-001/tests
      • django-composite-field-recover-001/tests
      • django-template-inherit-recall-001/tests
  • configs

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

55 files changed

+6751
-565
lines changed

benchmarks/csb_sdlc_debug/tidb-query-plan-regression-debug-001/tests/test.sh

Lines changed: 120 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,125 @@ SCORE=0
1111
TOTAL=6
1212
WORKSPACE="${VERIFY_REPO:-/workspace}"
1313

14+
# Output-contract configuration: where the agent's primary artifact lives,
# which contract mode the verifier scores under, and the pass threshold.
TASK_OUTPUT="${TASK_OUTPUT:-/workspace/answer.json}"
PASS_THRESHOLD="0.6"
OUTPUT_CONTRACT_MODE="answer_json_bridge"
OUTPUT_PRIMARY_PATH="$TASK_OUTPUT"
ARTIFACT_REQUIRED=false
case "$OUTPUT_CONTRACT_MODE" in
  repo_state)
    # Repo-state scoring inspects the git tree; there is no primary artifact.
    OUTPUT_PRIMARY_PATH=""
    ;;
  *)
    # Artifact-bridged modes require the artifact only in artifact-only runs.
    if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
      ARTIFACT_REQUIRED=true
    fi
    ;;
esac
24+
25+
#######################################
# Record an unscorable run: write a validation_result.json with
# status=invalid_output and a 0.0 reward.
# Globals:   OUTPUT_CONTRACT_MODE, OUTPUT_PRIMARY_PATH, ARTIFACT_REQUIRED,
#            PASS_THRESHOLD (read)
# Arguments: $1 - machine-readable failure code
#            $2 - human-readable failure message
# Outputs:   /logs/verifier/validation_result.json, /logs/verifier/reward.txt
#######################################
write_invalid_output() {
  local code="$1"
  local message="$2"
  # This commit removed the tail-of-script `mkdir -p /logs/verifier`, and this
  # helper can run before any other code creates it — ensure the directory
  # exists so the writes below cannot fail.
  mkdir -p /logs/verifier
  python3 - "$code" "$message" "$OUTPUT_CONTRACT_MODE" "$OUTPUT_PRIMARY_PATH" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import sys

code, message, mode, primary_path, required_artifact, pass_threshold = sys.argv[1:7]
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "invalid_output",
    "scorable": False,
    "scorer_family": "repo_state_heuristic",
    "reward": 0.0,
    "pass_threshold": float(pass_threshold),
    "passed": False,
    "output_contract": {
        "mode": mode,
        "primary_path": primary_path or None,
        "required_artifact": required_artifact == "true",
    },
    "sub_scores": {},
    "failure": {
        "code": code,
        "message": message,
        "stage": "output_validation",
    },
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
  echo "0.0" > /logs/verifier/reward.txt
}
58+
59+
#######################################
# Record a scored run: write validation_result.json and reward.txt.
# Globals:   OUTPUT_CONTRACT_MODE, OUTPUT_PRIMARY_PATH, ARTIFACT_REQUIRED,
#            PASS_THRESHOLD (read); also reads change-detection counters
#            UNSTAGED/STAGED/UNTRACKED/COMMITS (or their *_COUNT variants)
#            and VALIDATION_OUTPUT_PATH when set.
# Arguments: $1 - heuristic score in [0, 1]
#            $2 - optional reason string (recorded under details.reason)
#            $3 - optional number of passed checks
#            $4 - optional total number of checks
# Outputs:   /logs/verifier/validation_result.json, /logs/verifier/reward.txt
#######################################
write_scored_result() {
  local score="$1"
  local reason="${2:-}"
  local passed_checks="${3:-}"
  local total_checks="${4:-}"
  # Ensure the output directory exists regardless of what ran before us
  # (the tail-of-script mkdir was removed in this commit).
  mkdir -p /logs/verifier
  # Pass scalar inputs via env so the quoted heredoc needs no shell expansion.
  env \
    VALIDATION_SCORE="$score" \
    VALIDATION_REASON="$reason" \
    VALIDATION_PASSED_CHECKS="$passed_checks" \
    VALIDATION_TOTAL_CHECKS="$total_checks" \
    CHANGE_UNSTAGED="${UNSTAGED_COUNT:-${UNSTAGED:-0}}" \
    CHANGE_STAGED="${STAGED_COUNT:-${STAGED:-0}}" \
    CHANGE_UNTRACKED="${UNTRACKED_COUNT:-${UNTRACKED:-0}}" \
    CHANGE_COMMITS="${COMMIT_COUNT:-${COMMITS:-0}}" \
    VALIDATION_OUTPUT_PATH="${VALIDATION_OUTPUT_PATH:-}" \
    python3 - "$OUTPUT_CONTRACT_MODE" "$OUTPUT_PRIMARY_PATH" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import os
import sys

mode, primary_path, required_artifact, pass_threshold = sys.argv[1:5]
reward = float(os.environ.get("VALIDATION_SCORE", "0.0") or 0.0)
threshold = float(pass_threshold)
checks = {"heuristic_score": reward}
details = {}
reason = os.environ.get("VALIDATION_REASON")
if reason:
    details["reason"] = reason
passed_checks_raw = os.environ.get("VALIDATION_PASSED_CHECKS", "")
total_checks_raw = os.environ.get("VALIDATION_TOTAL_CHECKS", "")
if passed_checks_raw and total_checks_raw:
    try:
        passed_checks = float(passed_checks_raw)
        total_checks = float(total_checks_raw)
    except ValueError:
        passed_checks = None
        total_checks = None
    if passed_checks is not None and total_checks and total_checks > 0:
        checks["passed_checks_ratio"] = round(passed_checks / total_checks, 4)
        details["passed_checks"] = int(passed_checks) if passed_checks.is_integer() else passed_checks
        details["total_checks"] = int(total_checks) if total_checks.is_integer() else total_checks
change_detection = {
    "unstaged": int(os.environ.get("CHANGE_UNSTAGED", "0") or 0),
    "staged": int(os.environ.get("CHANGE_STAGED", "0") or 0),
    "untracked": int(os.environ.get("CHANGE_UNTRACKED", "0") or 0),
    "commits": int(os.environ.get("CHANGE_COMMITS", "0") or 0),
}
if any(change_detection.values()):
    checks["change_detected"] = 1.0
    details["change_detection"] = change_detection
output_path = os.environ.get("VALIDATION_OUTPUT_PATH")
if output_path:
    details["output_path"] = output_path
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "scored",
    "scorable": True,
    "scorer_family": "repo_state_heuristic",
    "reward": reward,
    "pass_threshold": threshold,
    "passed": reward >= threshold,
    "output_contract": {
        "mode": mode,
        "primary_path": primary_path or None,
        "required_artifact": required_artifact == "true",
    },
    "sub_scores": {"checks": checks},
    "failure": None,
}
if details:
    payload["details"] = details
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
  echo "$score" > /logs/verifier/reward.txt
}
125+
126+
127+
# Fail closed on artifact-only runs whose required answer.json was never
# produced: record an invalid-output result and stop before scoring.
if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
  if [ "${ANSWER_JSON_MISSING:-false}" = "true" ]; then
    write_invalid_output "missing_required_output" \
      "answer.json not found at ${ANSWER_JSON:-$TASK_OUTPUT}"
    exit 0
  fi
fi
14132
if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
15-
answer_json_fail_closed_if_missing
16133
answer_json_copy_analysis_text "$WORKSPACE/debug_report.md"
17134
fi
18135

@@ -67,5 +184,5 @@ fi
67184
echo ""
68185
echo "Score: $SCORE / $TOTAL"
69186

70-
mkdir -p /logs/verifier
71-
python3 -c "print($SCORE / $TOTAL)" > /logs/verifier/reward.txt
187+
FINAL_SCORE=$(python3 -c "print($SCORE / $TOTAL)")
188+
write_scored_result "$FINAL_SCORE" "" "$SCORE" "$TOTAL"

benchmarks/csb_sdlc_design/django-pre-validate-signal-design-001/tests/test.sh

Lines changed: 115 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,119 @@ TASK_WORKDIR="${TASK_WORKDIR:-/workspace}"
88
TASK_REPO_ROOT="${TASK_REPO_ROOT:-${VERIFY_REPO:-$TASK_WORKDIR}}"
99
VERIFY_REPO="${VERIFY_REPO:-$TASK_REPO_ROOT}"
1010

11+
# Output-contract configuration: where the agent's primary artifact lives,
# which contract mode the verifier scores under, and the pass threshold.
TASK_OUTPUT="${TASK_OUTPUT:-/workspace/answer.json}"
PASS_THRESHOLD="0.6"
OUTPUT_CONTRACT_MODE="repo_state"
OUTPUT_PRIMARY_PATH="$TASK_OUTPUT"
ARTIFACT_REQUIRED=false
case "$OUTPUT_CONTRACT_MODE" in
  repo_state)
    # Repo-state scoring inspects the git tree; there is no primary artifact.
    OUTPUT_PRIMARY_PATH=""
    ;;
  *)
    # Artifact-bridged modes require the artifact only in artifact-only runs.
    if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
      ARTIFACT_REQUIRED=true
    fi
    ;;
esac
21+
22+
#######################################
# Record an unscorable run: write a validation_result.json with
# status=invalid_output and a 0.0 reward.
# Globals:   OUTPUT_CONTRACT_MODE, OUTPUT_PRIMARY_PATH, ARTIFACT_REQUIRED,
#            PASS_THRESHOLD (read)
# Arguments: $1 - machine-readable failure code
#            $2 - human-readable failure message
# Outputs:   /logs/verifier/validation_result.json, /logs/verifier/reward.txt
#######################################
write_invalid_output() {
  local code="$1"
  local message="$2"
  # Make the helper self-sufficient: ensure the output directory exists even
  # if this function runs before the script's own mkdir.
  mkdir -p /logs/verifier
  python3 - "$code" "$message" "$OUTPUT_CONTRACT_MODE" "$OUTPUT_PRIMARY_PATH" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import sys

code, message, mode, primary_path, required_artifact, pass_threshold = sys.argv[1:7]
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "invalid_output",
    "scorable": False,
    "scorer_family": "repo_state_heuristic",
    "reward": 0.0,
    "pass_threshold": float(pass_threshold),
    "passed": False,
    "output_contract": {
        "mode": mode,
        "primary_path": primary_path or None,
        "required_artifact": required_artifact == "true",
    },
    "sub_scores": {},
    "failure": {
        "code": code,
        "message": message,
        "stage": "output_validation",
    },
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
  echo "0.0" > /logs/verifier/reward.txt
}
55+
56+
#######################################
# Record a scored run: write validation_result.json and reward.txt.
# Globals:   OUTPUT_CONTRACT_MODE, OUTPUT_PRIMARY_PATH, ARTIFACT_REQUIRED,
#            PASS_THRESHOLD (read); also reads change-detection counters
#            UNSTAGED/STAGED/UNTRACKED/COMMITS (or their *_COUNT variants)
#            and VALIDATION_OUTPUT_PATH when set.
# Arguments: $1 - heuristic score in [0, 1]
#            $2 - optional reason string (recorded under details.reason)
#            $3 - optional number of passed checks
#            $4 - optional total number of checks
# Outputs:   /logs/verifier/validation_result.json, /logs/verifier/reward.txt
#######################################
write_scored_result() {
  local score="$1"
  local reason="${2:-}"
  local passed_checks="${3:-}"
  local total_checks="${4:-}"
  # Make the helper self-sufficient: ensure the output directory exists even
  # if this function runs before the script's own mkdir.
  mkdir -p /logs/verifier
  # Pass scalar inputs via env so the quoted heredoc needs no shell expansion.
  env \
    VALIDATION_SCORE="$score" \
    VALIDATION_REASON="$reason" \
    VALIDATION_PASSED_CHECKS="$passed_checks" \
    VALIDATION_TOTAL_CHECKS="$total_checks" \
    CHANGE_UNSTAGED="${UNSTAGED_COUNT:-${UNSTAGED:-0}}" \
    CHANGE_STAGED="${STAGED_COUNT:-${STAGED:-0}}" \
    CHANGE_UNTRACKED="${UNTRACKED_COUNT:-${UNTRACKED:-0}}" \
    CHANGE_COMMITS="${COMMIT_COUNT:-${COMMITS:-0}}" \
    VALIDATION_OUTPUT_PATH="${VALIDATION_OUTPUT_PATH:-}" \
    python3 - "$OUTPUT_CONTRACT_MODE" "$OUTPUT_PRIMARY_PATH" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import os
import sys

mode, primary_path, required_artifact, pass_threshold = sys.argv[1:5]
reward = float(os.environ.get("VALIDATION_SCORE", "0.0") or 0.0)
threshold = float(pass_threshold)
checks = {"heuristic_score": reward}
details = {}
reason = os.environ.get("VALIDATION_REASON")
if reason:
    details["reason"] = reason
passed_checks_raw = os.environ.get("VALIDATION_PASSED_CHECKS", "")
total_checks_raw = os.environ.get("VALIDATION_TOTAL_CHECKS", "")
if passed_checks_raw and total_checks_raw:
    try:
        passed_checks = float(passed_checks_raw)
        total_checks = float(total_checks_raw)
    except ValueError:
        passed_checks = None
        total_checks = None
    if passed_checks is not None and total_checks and total_checks > 0:
        checks["passed_checks_ratio"] = round(passed_checks / total_checks, 4)
        details["passed_checks"] = int(passed_checks) if passed_checks.is_integer() else passed_checks
        details["total_checks"] = int(total_checks) if total_checks.is_integer() else total_checks
change_detection = {
    "unstaged": int(os.environ.get("CHANGE_UNSTAGED", "0") or 0),
    "staged": int(os.environ.get("CHANGE_STAGED", "0") or 0),
    "untracked": int(os.environ.get("CHANGE_UNTRACKED", "0") or 0),
    "commits": int(os.environ.get("CHANGE_COMMITS", "0") or 0),
}
if any(change_detection.values()):
    checks["change_detected"] = 1.0
    details["change_detection"] = change_detection
output_path = os.environ.get("VALIDATION_OUTPUT_PATH")
if output_path:
    details["output_path"] = output_path
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "scored",
    "scorable": True,
    "scorer_family": "repo_state_heuristic",
    "reward": reward,
    "pass_threshold": threshold,
    "passed": reward >= threshold,
    "output_contract": {
        "mode": mode,
        "primary_path": primary_path or None,
        "required_artifact": required_artifact == "true",
    },
    "sub_scores": {"checks": checks},
    "failure": None,
}
if details:
    payload["details"] = details
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
  echo "$score" > /logs/verifier/reward.txt
}
122+
123+
11124
cd "$TASK_REPO_ROOT"
12125
mkdir -p /logs/verifier
13126
git config --global --add safe.directory "$TASK_REPO_ROOT" 2>/dev/null || true
@@ -27,7 +140,7 @@ for ref in origin/master origin/main origin/HEAD; do
27140
done
28141
if [ "$UNSTAGED" -eq 0 ] && [ "$STAGED" -eq 0 ] && [ "$UNTRACKED" -eq 0 ] && [ "$COMMITS" -eq 0 ]; then
29142
echo "No code changes detected"
30-
echo "0.0" > /logs/verifier/reward.txt
143+
write_scored_result "0.0" "no_code_changes"
31144
exit 0
32145
fi
33146

@@ -117,6 +230,6 @@ elif [ "$SCOPE_OK" = true ]; then
117230
fi
118231

119232
REWARD=$(awk "BEGIN {printf \"%.2f\", $SCORE / 100}")
120-
echo "$REWARD" > /logs/verifier/reward.txt
233+
write_scored_result "$REWARD"
121234
echo ""
122235
echo "Score: $REWARD"

benchmarks/csb_sdlc_design/django-rate-limit-design-001/tests/test.sh

Lines changed: 114 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,119 @@ TASK_WORKDIR="${TASK_WORKDIR:-/workspace}"
1212
TASK_REPO_ROOT="${TASK_REPO_ROOT:-${VERIFY_REPO:-$TASK_WORKDIR}}"
1313
VERIFY_REPO="${VERIFY_REPO:-$TASK_REPO_ROOT}"
1414

15+
# Output-contract configuration: where the agent's primary artifact lives,
# which contract mode the verifier scores under, and the pass threshold.
TASK_OUTPUT="${TASK_OUTPUT:-/workspace/answer.json}"
PASS_THRESHOLD="0.6"
OUTPUT_CONTRACT_MODE="repo_state"
OUTPUT_PRIMARY_PATH="$TASK_OUTPUT"
ARTIFACT_REQUIRED=false
case "$OUTPUT_CONTRACT_MODE" in
  repo_state)
    # Repo-state scoring inspects the git tree; there is no primary artifact.
    OUTPUT_PRIMARY_PATH=""
    ;;
  *)
    # Artifact-bridged modes require the artifact only in artifact-only runs.
    if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
      ARTIFACT_REQUIRED=true
    fi
    ;;
esac
25+
26+
#######################################
# Record an unscorable run: write a validation_result.json with
# status=invalid_output and a 0.0 reward.
# Globals:   OUTPUT_CONTRACT_MODE, OUTPUT_PRIMARY_PATH, ARTIFACT_REQUIRED,
#            PASS_THRESHOLD (read)
# Arguments: $1 - machine-readable failure code
#            $2 - human-readable failure message
# Outputs:   /logs/verifier/validation_result.json, /logs/verifier/reward.txt
#######################################
write_invalid_output() {
  local code="$1"
  local message="$2"
  # Make the helper self-sufficient: ensure the output directory exists even
  # if this function runs before the script's own mkdir.
  mkdir -p /logs/verifier
  python3 - "$code" "$message" "$OUTPUT_CONTRACT_MODE" "$OUTPUT_PRIMARY_PATH" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import sys

code, message, mode, primary_path, required_artifact, pass_threshold = sys.argv[1:7]
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "invalid_output",
    "scorable": False,
    "scorer_family": "repo_state_heuristic",
    "reward": 0.0,
    "pass_threshold": float(pass_threshold),
    "passed": False,
    "output_contract": {
        "mode": mode,
        "primary_path": primary_path or None,
        "required_artifact": required_artifact == "true",
    },
    "sub_scores": {},
    "failure": {
        "code": code,
        "message": message,
        "stage": "output_validation",
    },
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
  echo "0.0" > /logs/verifier/reward.txt
}
59+
60+
#######################################
# Record a scored run: write validation_result.json and reward.txt.
# Globals:   OUTPUT_CONTRACT_MODE, OUTPUT_PRIMARY_PATH, ARTIFACT_REQUIRED,
#            PASS_THRESHOLD (read); also reads change-detection counters
#            UNSTAGED/STAGED/UNTRACKED/COMMITS (or their *_COUNT variants)
#            and VALIDATION_OUTPUT_PATH when set.
# Arguments: $1 - heuristic score in [0, 1]
#            $2 - optional reason string (recorded under details.reason)
#            $3 - optional number of passed checks
#            $4 - optional total number of checks
# Outputs:   /logs/verifier/validation_result.json, /logs/verifier/reward.txt
#######################################
write_scored_result() {
  local score="$1"
  local reason="${2:-}"
  local passed_checks="${3:-}"
  local total_checks="${4:-}"
  # Make the helper self-sufficient: ensure the output directory exists even
  # if this function runs before the script's own mkdir.
  mkdir -p /logs/verifier
  # Pass scalar inputs via env so the quoted heredoc needs no shell expansion.
  env \
    VALIDATION_SCORE="$score" \
    VALIDATION_REASON="$reason" \
    VALIDATION_PASSED_CHECKS="$passed_checks" \
    VALIDATION_TOTAL_CHECKS="$total_checks" \
    CHANGE_UNSTAGED="${UNSTAGED_COUNT:-${UNSTAGED:-0}}" \
    CHANGE_STAGED="${STAGED_COUNT:-${STAGED:-0}}" \
    CHANGE_UNTRACKED="${UNTRACKED_COUNT:-${UNTRACKED:-0}}" \
    CHANGE_COMMITS="${COMMIT_COUNT:-${COMMITS:-0}}" \
    VALIDATION_OUTPUT_PATH="${VALIDATION_OUTPUT_PATH:-}" \
    python3 - "$OUTPUT_CONTRACT_MODE" "$OUTPUT_PRIMARY_PATH" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import os
import sys

mode, primary_path, required_artifact, pass_threshold = sys.argv[1:5]
reward = float(os.environ.get("VALIDATION_SCORE", "0.0") or 0.0)
threshold = float(pass_threshold)
checks = {"heuristic_score": reward}
details = {}
reason = os.environ.get("VALIDATION_REASON")
if reason:
    details["reason"] = reason
passed_checks_raw = os.environ.get("VALIDATION_PASSED_CHECKS", "")
total_checks_raw = os.environ.get("VALIDATION_TOTAL_CHECKS", "")
if passed_checks_raw and total_checks_raw:
    try:
        passed_checks = float(passed_checks_raw)
        total_checks = float(total_checks_raw)
    except ValueError:
        passed_checks = None
        total_checks = None
    if passed_checks is not None and total_checks and total_checks > 0:
        checks["passed_checks_ratio"] = round(passed_checks / total_checks, 4)
        details["passed_checks"] = int(passed_checks) if passed_checks.is_integer() else passed_checks
        details["total_checks"] = int(total_checks) if total_checks.is_integer() else total_checks
change_detection = {
    "unstaged": int(os.environ.get("CHANGE_UNSTAGED", "0") or 0),
    "staged": int(os.environ.get("CHANGE_STAGED", "0") or 0),
    "untracked": int(os.environ.get("CHANGE_UNTRACKED", "0") or 0),
    "commits": int(os.environ.get("CHANGE_COMMITS", "0") or 0),
}
if any(change_detection.values()):
    checks["change_detected"] = 1.0
    details["change_detection"] = change_detection
output_path = os.environ.get("VALIDATION_OUTPUT_PATH")
if output_path:
    details["output_path"] = output_path
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "scored",
    "scorable": True,
    "scorer_family": "repo_state_heuristic",
    "reward": reward,
    "pass_threshold": threshold,
    "passed": reward >= threshold,
    "output_contract": {
        "mode": mode,
        "primary_path": primary_path or None,
        "required_artifact": required_artifact == "true",
    },
    "sub_scores": {"checks": checks},
    "failure": None,
}
if details:
    payload["details"] = details
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
  echo "$score" > /logs/verifier/reward.txt
}
126+
127+
15128
cd "$TASK_REPO_ROOT"
16129

17130
mkdir -p /logs/verifier
@@ -161,6 +274,6 @@ elif [ "$SCOPE_OK" = true ]; then
161274
fi
162275

163276
SCORE=$(awk "BEGIN {printf \"%.2f\", $SCORE_NUMERATOR / 100}")
164-
echo "$SCORE" > /logs/verifier/reward.txt
277+
write_scored_result "$SCORE"
165278
echo ""
166279
echo "[x] Tests completed - Score: $SCORE"

0 commit comments

Comments
 (0)