Skip to content

Commit 347d552

Browse files
sjarmak and claude committed
fix: prevent eval.sh from exiting before writing reward.txt on score==0
oracle_checks.py exits with code 1 when the composite score == 0. Because eval.sh runs under `set -euo pipefail`, that non-zero exit status propagated through the `SCORE=$(...)` pipeline and made the script exit without writing reward.txt, producing RewardFileNotFoundError in Harbor.

Add `|| true` to the `SCORE=$(...)` line across all 12 MCP-unique eval.sh scripts so the script continues to write reward.txt regardless of the oracle score.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent c7993e2 commit 347d552

File tree

12 files changed

+12
-12
lines changed
  • benchmarks
    • ccb_mcp_crossorg
    • ccb_mcp_crossrepo_tracing
    • ccb_mcp_incident/ccx-incident-031/tests
    • ccb_mcp_onboarding
    • ccb_mcp_platform/ccx-explore-091-ds/tests
    • ccb_mcp_security

12 files changed

+12
-12
lines changed

benchmarks/ccb_mcp_crossorg/ccx-crossorg-061/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_crossorg/ccx-crossorg-066/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_crossrepo_tracing/ccx-config-trace-010/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-001/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-004/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_incident/ccx-incident-031/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_onboarding/ccx-explore-042-ds/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5252
fi
5353

5454
echo "Running oracle checks..."
55-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
55+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5656

5757
# Validate score is a number
5858
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_onboarding/ccx-onboard-041/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_onboarding/ccx-onboard-050-ds/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5252
fi
5353

5454
echo "Running oracle checks..."
55-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
55+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5656

5757
# Validate score is a number
5858
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_platform/ccx-explore-091-ds/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5252
fi
5353

5454
echo "Running oracle checks..."
55-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
55+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5656

5757
# Validate score is a number
5858
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

0 commit comments

Comments
 (0)