Skip to content

Commit 1db21ed

Browse files
sjarmak and claude committed
feat: build deterministic SDLC verifiers for 67 promoted Org tasks
For each of the 67 Org tasks selected for SDLC promotion, generate a deterministic test.sh with suite-specific composite weights: - understand (15): file_f1=0.40, symbol=0.25, chain=0.20, keyword=0.15 - design (12): chain=0.40, file_f1=0.25, keyword=0.20, symbol=0.15 - debug (10): file_f1=0.50, keyword=0.30, symbol=0.20 - secure (12): file_f1=0.40, keyword=0.30, symbol=0.20, provenance=0.10 - refactor (10): file_f1=0.40, symbol=0.25, chain=0.20, keyword=0.15 - test (8): file_f1=0.50, keyword=0.30, symbol=0.20 Each verifier has 4+ assertion patterns (answer exists, valid JSON structure, oracle data available, per-check F1 scores), suite-weighted composite scoring via promoted_verifier.py, and detailed validation_result.json output. Original test.sh preserved as test.sh.org_backup. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a946f73 commit 1db21ed

File tree

205 files changed

+25637
-334
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

205 files changed

+25637
-334
lines changed
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
#!/usr/bin/env python3
"""Suite-aware verifier for promoted Org→SDLC tasks.

Wraps oracle_checks.py with suite-specific composite weights and detailed
validation output. Designed to be deployed alongside oracle_checks.py in
each promoted task's tests/ directory.

Stdlib only — no external dependencies.

Usage:
    python3 promoted_verifier.py \
        --answer /workspace/answer.json \
        --spec /tests/task_spec.json \
        --suite csb_sdlc_understand \
        --output /logs/verifier/validation_result.json

Exit codes:
    0 — composite score > 0
    1 — composite score == 0
"""

import argparse
import json
import sys
from pathlib import Path
from typing import Any, Dict

# Import oracle_checks from the same directory (the verifier is deployed
# side by side with it inside each task's tests/ directory).
sys.path.insert(0, str(Path(__file__).parent))
from oracle_checks import (
    check_dependency_chain,
    check_file_set_match,
    check_keyword_presence,
    check_provenance,
    check_symbol_resolution,
    run_all_checks,
)

# ---------------------------------------------------------------------------
# Suite-specific composite weights
# ---------------------------------------------------------------------------
# Each suite emphasizes different oracle dimensions. Weights must sum to 1.0.
# Checks not present in a task's spec get their weight redistributed to the
# remaining checks proportionally (handled in compute_weighted_composite).

SUITE_WEIGHTS: Dict[str, Dict[str, float]] = {
    "csb_sdlc_understand": {
        "file_set_match": 0.40,
        "symbol_resolution": 0.25,
        "dependency_chain": 0.20,
        "keyword_presence": 0.15,
    },
    "csb_sdlc_design": {
        "file_set_match": 0.25,
        "symbol_resolution": 0.15,
        "dependency_chain": 0.40,
        "keyword_presence": 0.20,
    },
    "csb_sdlc_debug": {
        "file_set_match": 0.50,
        "symbol_resolution": 0.20,
        "keyword_presence": 0.30,
    },
    "csb_sdlc_secure": {
        "file_set_match": 0.40,
        "keyword_presence": 0.30,
        "symbol_resolution": 0.20,
        "provenance": 0.10,
    },
    "csb_sdlc_refactor": {
        "file_set_match": 0.40,
        "symbol_resolution": 0.25,
        "dependency_chain": 0.20,
        "keyword_presence": 0.15,
    },
    "csb_sdlc_test": {
        "file_set_match": 0.50,
        "keyword_presence": 0.30,
        "symbol_resolution": 0.20,
    },
}
82+
83+
# Score extraction keys per check type (must match oracle_checks.py)
84+
SCORE_KEYS: Dict[str, str] = {
85+
"file_set_match": "f1", # will prefer weighted_f1 if available
86+
"symbol_resolution": "recall",
87+
"dependency_chain": "chain_recall",
88+
"keyword_presence": "keyword_recall",
89+
"provenance": "provenance_score",
90+
}
91+
92+
93+
def _extract_score(check_result: Dict[str, Any], check_type: str) -> float:
94+
"""Extract the primary score from a check result."""
95+
if check_type == "file_set_match":
96+
return float(check_result.get("weighted_f1", check_result.get("f1", 0)))
97+
key = SCORE_KEYS.get(check_type, "")
98+
val = check_result.get(key, 0)
99+
return float(val) if not isinstance(val, bool) else (1.0 if val else 0.0)
100+
101+
102+
def compute_weighted_composite(
    check_results: Dict[str, Dict[str, Any]],
    target_suite: str,
) -> Dict[str, Any]:
    """Fold per-check oracle results into one suite-weighted composite.

    Unknown suites fall back to the csb_sdlc_understand weight table.
    Weights belonging to checks that produced no result are redistributed
    proportionally across the checks that did. Checks present in the
    results but absent from the weight table are still reported, at zero
    weight, for transparency.

    Returns a dict with composite_score, per-check scores, and weight info.
    """
    suite_table = SUITE_WEIGHTS.get(target_suite, SUITE_WEIGHTS["csb_sdlc_understand"])

    # Keep only weights whose checks actually ran, then renormalize so the
    # surviving weights still sum to 1.0.
    present = {name: w for name, w in suite_table.items() if name in check_results}
    if present:
        scale = sum(present.values())
        effective = {name: w / scale for name, w in present.items()}
    else:
        effective = {}

    breakdown: Dict[str, Dict[str, Any]] = {}
    composite = 0.0
    for name, w in effective.items():
        value = _extract_score(check_results[name], name)
        composite += value * w
        breakdown[name] = {
            "score": round(value, 4),
            "weight": round(w, 4),
            "weighted_contribution": round(value * w, 4),
        }

    # Surface checks outside the weight table (e.g., provenance for
    # non-secure tasks) so they still show up in the report.
    for name, outcome in check_results.items():
        if name in breakdown:
            continue
        breakdown[name] = {
            "score": round(_extract_score(outcome, name), 4),
            "weight": 0.0,
            "weighted_contribution": 0.0,
            "note": "not in suite weight table",
        }

    return {
        "composite_score": round(composite, 4),
        "target_suite": target_suite,
        "weights_used": {name: round(w, 4) for name, w in effective.items()},
        "per_check": breakdown,
    }
150+
151+
152+
def run_promoted_verifier(
    answer_path: str,
    task_spec_path: str,
    target_suite: str,
    output_path: str | None = None,
) -> Dict[str, Any]:
    """Run oracle checks with suite-specific weighting.

    Runs the base oracle checks and, unless they report a top-level error,
    layers the suite-weighted composite on top. The full result dict is
    returned and, when output_path is given, also written there as
    indented JSON (parent directories are created as needed).
    """
    base_result = run_all_checks(answer_path, task_spec_path)

    if "error" in base_result:
        # The oracle could not run at all: score zero and propagate the error.
        result = {
            "composite_score": 0.0,
            "error": base_result["error"],
            "target_suite": target_suite,
            "oracle_checks": base_result,
        }
    else:
        weighted = compute_weighted_composite(
            base_result.get("checks", {}), target_suite
        )
        result = {
            "composite_score": weighted["composite_score"],
            "target_suite": target_suite,
            "weights_used": weighted["weights_used"],
            "per_check": weighted["per_check"],
            "oracle_checks": base_result,
        }

    if output_path:
        destination = Path(output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(json.dumps(result, indent=2))

    return result
193+
194+
195+
def main() -> None:
    """CLI entry point: parse arguments, run the verifier, report the score."""
    parser = argparse.ArgumentParser(
        description="Suite-aware verifier for promoted Org→SDLC tasks."
    )
    parser.add_argument("--answer", required=True, help="Path to answer.json")
    parser.add_argument("--spec", required=True, help="Path to task_spec.json")
    parser.add_argument(
        "--suite", required=True, help="Target SDLC suite (e.g., csb_sdlc_understand)"
    )
    parser.add_argument(
        "--output", default=None, help="Path to write validation_result.json"
    )
    parser.add_argument("--verbose", action="store_true", help="Print detailed results")
    args = parser.parse_args()

    result = run_promoted_verifier(args.answer, args.spec, args.suite, args.output)

    # Full result goes to stderr only when asked for.
    if args.verbose:
        print(json.dumps(result, indent=2), file=sys.stderr)

    # Composite score goes to stdout (matches oracle_checks.py convention).
    score = result["composite_score"]
    print(f"{score:.4f}")

    sys.exit(0 if score > 0 else 1)


if __name__ == "__main__":
    main()
Lines changed: 144 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,148 @@
11
#!/bin/bash
# test.sh — Deterministic SDLC verifier for CCX-compliance-052
# Promoted from csb_org_compliance -> csb_sdlc_secure
#
# Reward: suite-weighted composite (0.0-1.0) via oracle file/symbol/chain/keyword F1
# Multiple assertion patterns: file_set_match + symbol_resolution + keyword_presence
# [+ dependency_chain where oracle defines chains]
#
# Scoring weights (csb_sdlc_secure):
# See promoted_verifier.py SUITE_WEIGHTS for per-check weight allocation.

# sg_only mode guard: restore full repo before verification (no-op for regular runs)
[ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ] && source /tests/sgonly_verifier_wrapper.sh

# NOTE: set -e intentionally NOT used — fallback logic requires graceful failure handling
set -uo pipefail

TASK_ID="CCX-compliance-052"
TARGET_SUITE="csb_sdlc_secure"
ANSWER_PATH="/workspace/answer.json"
TASK_SPEC_PATH="/tests/task_spec.json"
PROMOTED_VERIFIER="/tests/promoted_verifier.py"
ORACLE_CHECKS="/tests/oracle_checks.py"
REWARD_PATH="/logs/verifier/reward.txt"
VALIDATION_RESULT="/logs/verifier/validation_result.json"

mkdir -p /logs/verifier

echo "=== $TASK_ID deterministic verifier ===" >&2
echo "Suite: $TARGET_SUITE (promoted from csb_org_compliance)" >&2
echo "" >&2

# ------------------------------------------------------------------
# Assertion 1: answer.json exists
# ------------------------------------------------------------------
if [ ! -f "$ANSWER_PATH" ]; then
    echo "FAIL: answer.json not found at $ANSWER_PATH" >&2
    echo "0.0" > "$REWARD_PATH"
    echo '{"composite_score": 0.0, "error": "answer.json not found"}' > "$VALIDATION_RESULT"
    exit 1
fi
echo "PASS: answer.json exists" >&2

# ------------------------------------------------------------------
# Assertion 2: answer.json is valid JSON with expected structure
# ------------------------------------------------------------------
# FIX: merge stderr *inside* the command substitution. The original placed
# `2>&1` after the closing paren (`STRUCT_CHECK=$(...) 2>&1`), which redirects
# the bare assignment statement rather than the substituted python process, so
# the diagnostics written to stderr were never captured into STRUCT_CHECK and
# the FAIL message below printed without detail.
STRUCT_CHECK=$(python3 2>&1 << 'PYEOF'
import json, sys
try:
    with open("/workspace/answer.json") as f:
        data = json.load(f)
except (json.JSONDecodeError, OSError) as e:
    print(f"invalid JSON: {e}", file=sys.stderr)
    sys.exit(1)
if not isinstance(data, dict):
    print("answer.json is not a JSON object", file=sys.stderr)
    sys.exit(1)
keys = set(data.keys())
expected = {"files", "symbols", "text", "chain", "dependency_chain", "answer"}
if not keys & expected:
    print(f"answer.json missing expected keys (has: {keys})", file=sys.stderr)
    sys.exit(1)
print("ok")
PYEOF
)

if [ "$STRUCT_CHECK" != "ok" ]; then
    echo "FAIL: answer.json structure check: $STRUCT_CHECK" >&2
    echo "0.0" > "$REWARD_PATH"
    echo '{"composite_score": 0.0, "error": "answer.json invalid structure"}' > "$VALIDATION_RESULT"
    exit 1
fi
echo "PASS: answer.json is valid JSON with expected structure" >&2

# ------------------------------------------------------------------
# Assertion 3: oracle data available
# ------------------------------------------------------------------
# Consistency fix: these failure paths now also write VALIDATION_RESULT,
# as assertions 1 and 2 already do.
if [ ! -f "$TASK_SPEC_PATH" ]; then
    echo "FAIL: task_spec.json not found" >&2
    echo "0.0" > "$REWARD_PATH"
    echo '{"composite_score": 0.0, "error": "task_spec.json not found"}' > "$VALIDATION_RESULT"
    exit 1
fi
if [ ! -f "$ORACLE_CHECKS" ]; then
    echo "FAIL: oracle_checks.py not found" >&2
    echo "0.0" > "$REWARD_PATH"
    echo '{"composite_score": 0.0, "error": "oracle_checks.py not found"}' > "$VALIDATION_RESULT"
    exit 1
fi
echo "PASS: oracle data and checker available" >&2

# ------------------------------------------------------------------
# Assertion 4+: Run suite-weighted oracle checks
# ------------------------------------------------------------------
echo "" >&2
echo "Running suite-weighted oracle checks ($TARGET_SUITE)..." >&2

# NOTE(review): SCORE is the last line of merged stdout+stderr. This relies on
# the verifier printing the score last — confirm verbose output can never
# interleave after it.
if [ -f "$PROMOTED_VERIFIER" ]; then
    # Use promoted_verifier.py for suite-specific weights
    SCORE=$(python3 "$PROMOTED_VERIFIER" \
        --answer "$ANSWER_PATH" \
        --spec "$TASK_SPEC_PATH" \
        --suite "$TARGET_SUITE" \
        --output "$VALIDATION_RESULT" \
        --verbose 2>&1 | tee /dev/stderr | tail -1) || true
else
    # Fallback: use oracle_checks.py directly (equal weights)
    echo "WARNING: promoted_verifier.py not found, using oracle_checks.py directly" >&2
    SCORE=$(python3 "$ORACLE_CHECKS" \
        --answer "$ANSWER_PATH" \
        --spec "$TASK_SPEC_PATH" \
        --verbose 2>&1 | tee /dev/stderr | tail -1) || true
fi

# ------------------------------------------------------------------
# Validate and write reward
# ------------------------------------------------------------------
# Reject anything that does not parse as a float before trusting it below.
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then
    echo "FAIL: verifier did not return a valid score: $SCORE" >&2
    echo "0.0" > "$REWARD_PATH"
    exit 1
fi

echo "" >&2
echo "Composite score: $SCORE" >&2
echo "$SCORE" > "$REWARD_PATH"

# ------------------------------------------------------------------
# Per-check assertion summary (if validation_result.json exists)
# ------------------------------------------------------------------
if [ -f "$VALIDATION_RESULT" ]; then
    python3 << 'PYEOF2' >&2 || true
import json, sys
result = json.load(open("/logs/verifier/validation_result.json"))
per_check = result.get("per_check", {})
print("")
print("Per-check assertions:")
for check_type, info in per_check.items():
    score = info.get("score", 0)
    weight = info.get("weight", 0)
    status = "PASS" if score > 0 else "FAIL"
    print(f"  {status}: {check_type} = {score:.4f} (weight={weight:.2f})")
PYEOF2
fi

echo "" >&2
echo "Reward: $SCORE" >&2

# Exit-code-first (SWE-Factory pattern); $SCORE was float-validated above,
# so interpolating it into python is safe here.
python3 -c "import sys; sys.exit(0 if float('$SCORE') > 0 else 1)"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
# test.sh — Harbor compatibility wrapper
#
# Harbor's task discovery (the TaskPaths.is_valid() check) insists on a
# tests/test.sh being present; the actual evaluation logic lives in eval.sh
# (SWE-Factory exit-code-first pattern), so this script only delegates.

# sg_only_env: restore full repo before verification (no-op for regular runs)
if [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ]; then
    source /tests/sgonly_verifier_wrapper.sh
fi

exec bash "$(dirname "$0")/eval.sh" "$@"

0 commit comments

Comments
 (0)