Skip to content

Commit 52cb402

Browse files
sjarmak and claude committed
chore: archive broken batches, retire SWE-Perf tasks, fix servo verifier
- Archive 6 broken batch runs to runs/official/archive/ (Daytona pull denied, disk exceeded, Docker compose failures, timeout too short)
- Move 3 SWE-Perf tasks (numpy/pandas/sklearn perf) to backups — stub verifiers that can never score > 0 regardless of model
- Fix servo-scrollend-event-feat-001 verifier: convert cargo check from hard gate to soft signal (0.15 weight). Servo cannot compile in the sandbox (OOM on 4GB RAM, needs 30-60 min on 2 vCPU). Structural checks (keyword refs, file changes, WPT tests) now always run.
- ccb_test: 18 → 15 tasks. Total active: 398 → 395.
- Regenerate MANIFEST.json (758 scored tasks, 76 runs)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2198d43 commit 52cb402

File tree

3,515 files changed

+1407
-6961
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

3,515 files changed

+1407
-6961
lines changed

benchmarks/ccb_feature/servo-scrollend-event-feat-001/tests/test.sh

Lines changed: 22 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -69,24 +69,18 @@ fi
6969

7070
echo "Testing scrollend event implementation..."
7171

72-
# ── Compilation check ──────────────────────────────────────────────────
73-
# Run cargo check on the workspace. If the code doesn't compile,
74-
# score is 0 regardless of keyword matches.
75-
echo "Running Rust compilation check (cargo check)..."
76-
BUILD_OK=1
77-
if ! cargo check 2>/logs/verifier/build_errors.txt; then
78-
echo "FAIL: cargo check failed"
79-
BUILD_OK=0
80-
fi
81-
82-
if [ "$BUILD_OK" -eq 0 ]; then
83-
echo "Compilation failed — score set to 0.0"
84-
echo "0.0" > /logs/verifier/reward.txt
85-
echo ""
86-
echo "[ ] Tests completed - Score: 0.0 (build failure)"
87-
exit 0
72+
# ── Compilation check (soft signal) ───────────────────────────────────
73+
# Servo is too large to compile in the sandbox environment (700+ crates,
74+
# needs OpenGL/X11/Wayland/fontconfig). Cargo check is best-effort;
75+
# failure does NOT gate structural checks below.
76+
echo "Running Rust compilation check (cargo check, best-effort)..."
77+
BUILD_OK=0
78+
if timeout 300 cargo check 2>/logs/verifier/build_errors.txt; then
79+
echo "[x] Rust compilation check passed"
80+
BUILD_OK=1
81+
else
82+
echo "NOTE: cargo check failed (expected for large workspaces — scoring on structural signals)"
8883
fi
89-
echo "[x] Rust compilation check passed"
9084

9185
# ── Unit/integration tests (best-effort) ──────────────────────────────
9286
# Run scroll-related tests if they exist; failures reduce score
@@ -163,25 +157,28 @@ if [ -d "tests/wpt" ]; then
163157
fi
164158

165159
# Calculate reward based on implementation (using bash arithmetic, avoiding bc)
166-
# Weights: scrollend keyword=0.2, file changes=0.1, WPT tests=0.1, unit tests pass=0.3, compilation=0.3
167-
# Note: compilation is a gate (already passed if we get here), so its 0.3 is implicit
168-
# Rebalanced weights for remaining signals: scrollend=0.3, changes=0.2, WPT=0.2, unit tests=0.3
160+
# Weights: scrollend keyword=0.3, file changes=0.2, WPT tests=0.2,
161+
# compilation=0.15, unit tests=0.15
162+
# Compilation and unit tests are best-effort (may fail due to environment limits)
169163
SCORE_NUMERATOR=0
170164
if [ "$SCROLLEND_FOUND" -eq 1 ]; then
171-
SCORE_NUMERATOR=$((SCORE_NUMERATOR + 3)) # 0.3 * 10
165+
SCORE_NUMERATOR=$((SCORE_NUMERATOR + 6)) # 0.30 * 20
172166
fi
173167
if [ "$CHANGES_MADE" -eq 1 ]; then
174-
SCORE_NUMERATOR=$((SCORE_NUMERATOR + 2)) # 0.2 * 10
168+
SCORE_NUMERATOR=$((SCORE_NUMERATOR + 4)) # 0.20 * 20
175169
fi
176170
if [ "$WPT_TESTS" -eq 1 ]; then
177-
SCORE_NUMERATOR=$((SCORE_NUMERATOR + 2)) # 0.2 * 10
171+
SCORE_NUMERATOR=$((SCORE_NUMERATOR + 4)) # 0.20 * 20
172+
fi
173+
if [ "$BUILD_OK" -eq 1 ]; then
174+
SCORE_NUMERATOR=$((SCORE_NUMERATOR + 3)) # 0.15 * 20
178175
fi
179176
if [ "$UNIT_TEST_PASS" -eq 1 ]; then
180-
SCORE_NUMERATOR=$((SCORE_NUMERATOR + 3)) # 0.3 * 10
177+
SCORE_NUMERATOR=$((SCORE_NUMERATOR + 3)) # 0.15 * 20
181178
fi
182179

183180
# Convert back to decimal (using awk for portable floating point)
184-
SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE_NUMERATOR / 10}")
181+
SCORE=$(awk "BEGIN {printf \"%.2f\", $SCORE_NUMERATOR / 20}")
185182

186183
echo "$SCORE" > /logs/verifier/reward.txt
187184
echo ""

0 commit comments

Comments
 (0)