30 commits
e779ed4
Switch Phoenix GPU jobs to H200 nodes for faster scheduling
sbryngelson Feb 10, 2026
9cf00d3
Fix bash segfault in monitor_slurm_job.sh from fractional read timeout
sbryngelson Feb 12, 2026
7faf2d6
Merge branch 'master' into ci-fixes
sbryngelson Feb 12, 2026
a59db02
Restore pull_request_review trigger for benchmark workflow
sbryngelson Feb 12, 2026
2efc61e
Auto-retry sporadic test failures in CI
sbryngelson Feb 12, 2026
0658bd3
Preserve exit code for catastrophic test failures
sbryngelson Feb 12, 2026
c6b6f81
Harden SLURM monitor: robust state checks, orphan cleanup
sbryngelson Feb 13, 2026
a82959e
Use parsable sacct flags for robust state parsing
sbryngelson Feb 13, 2026
8022969
Guard squeue/sacct pipelines against set -euo pipefail
sbryngelson Feb 13, 2026
88d19ce
Retry delete_directory on Lustre ENOTEMPTY race
sbryngelson Feb 13, 2026
05d28f3
Remove stale failed_uuids.txt before test run
sbryngelson Feb 13, 2026
9eed0c6
Split benchmark concurrency group by event type
sbryngelson Feb 13, 2026
02c658d
Merge branch 'master' into ci-fixes
sbryngelson Feb 14, 2026
edefc01
Revert Phoenix test jobs to multi-partition GPU scheduling
sbryngelson Feb 15, 2026
2e15ab6
Fix doc lint for generated pages and hyphenated page IDs
sbryngelson Feb 15, 2026
0267756
Merge branch 'master' into ci-fixes
sbryngelson Feb 15, 2026
dfc524c
Add Lustre-safe workspace cleanup for self-hosted runners
sbryngelson Feb 15, 2026
ece1951
Revert Phoenix benchmark jobs to L40S GPU scheduling
sbryngelson Feb 15, 2026
a553a75
Improve Lustre-safe workspace cleanup with dotglob and nullglob
sbryngelson Feb 15, 2026
a149886
Auto-requeue SLURM jobs on preemption
sbryngelson Feb 15, 2026
ed8abd5
Merge branch 'master' into ci-fixes
sbryngelson Feb 15, 2026
273cced
Remove aggressive workspace cleanup from test jobs
sbryngelson Feb 16, 2026
c2e6543
Propagate exit code from test retry command
sbryngelson Feb 16, 2026
c0b1cd1
Restore default checkout clean for test jobs; tune PR reviewer
sbryngelson Feb 16, 2026
7a764d5
Remove aggressive workspace cleanup from bench jobs
sbryngelson Feb 16, 2026
b5dfa1f
Add test sharding for Frontier CI; switch to batch/hackathon partition
sbryngelson Feb 16, 2026
475caa3
Validate --shard argument format and bounds
sbryngelson Feb 16, 2026
ddd95ac
Use nick-fields/retry for Frontier builds; reduce -j to 4
sbryngelson Feb 16, 2026
197813a
Add set -e to Frontier build scripts for fail-fast behavior
sbryngelson Feb 16, 2026
73072bf
Add required timeout_minutes to nick-fields/retry steps
sbryngelson Feb 16, 2026
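Several of these commits revolve around keeping shell pipelines alive under set -euo pipefail (e.g. 8022969). A minimal sketch of the hazard and the guard, with an illustrative job id:

#!/usr/bin/env bash
# Sketch of the set -euo pipefail hazard guarded against in 8022969
# (job id illustrative).
set -euo pipefail

jid=12345

# Unguarded: if squeue exits non-zero (job left the queue, slurmctld
# hiccup), pipefail propagates the failure into the command
# substitution and set -e kills the whole monitor.
#   state=$(squeue -j "$jid" -h -o '%T' | head -n1)

# Guarded: `|| true` absorbs the failure so the caller can react to an
# empty result (fall back to sacct, keep polling) instead of dying.
state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
if [ -z "$state" ]; then
  echo "squeue gave no answer; fall back to sacct or keep polling"
fi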
144 changes: 88 additions & 56 deletions .github/scripts/monitor_slurm_job.sh
@@ -4,11 +4,17 @@

set -euo pipefail

# Cleanup handler to prevent orphaned tail processes
# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
cleanup() {
if [ -n "${tail_pid:-}" ]; then
kill "${tail_pid}" 2>/dev/null || true
fi
# Cancel the SLURM job if the monitor is exiting due to an error
# (e.g., the CI runner is being killed). Don't cancel on success.
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
scancel "$job_id" 2>/dev/null || true
fi
}
trap cleanup EXIT
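The success flag set at the very end of the script is what lets this EXIT trap tell a clean finish from a set -e abort or a killed runner. A standalone sketch of the pattern (job id illustrative):

#!/usr/bin/env bash
# Sketch: success-flag + EXIT-trap cleanup (job id illustrative).
set -euo pipefail

job_id=12345
monitor_success=0

cleanup() {
  if [ "$monitor_success" -ne 1 ]; then
    echo "abnormal exit; cancelling SLURM job $job_id"
    scancel "$job_id" 2>/dev/null || true
  fi
}
trap cleanup EXIT

# ... submit, poll, stream output ...

monitor_success=1   # only reached if everything above succeeded

Because the flag is the last statement, any early exit 1 or set -e failure leaves it at 0, so the trap cancels the job exactly when the script did not finish its work.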

@@ -23,30 +29,78 @@ output_file="$2"
echo "Submitted batch job $job_id"
echo "Monitoring output file: $output_file"

# Wait for file to appear with retry logic for transient squeue failures
# Robustly check SLURM job state using squeue with sacct fallback.
# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
# or "UNKNOWN" if both commands fail.
get_job_state() {
local jid="$1"
local state

# Try squeue first (fast, works for active jobs)
state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
if [ -n "$state" ]; then
echo "$state"
return
fi

# Fallback to sacct (works for completed/historical jobs)
if command -v sacct >/dev/null 2>&1; then
state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
if [ -n "$state" ]; then
echo "$state"
return
fi
fi

echo "UNKNOWN"
}
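The sacct flags in the fallback are doing real work (commit a82959e): -n drops the header line, -X restricts output to the job allocation rather than its .batch/.extern steps, and -P emits pipe-delimited fields immune to column padding and truncation. A sketch of parsing that output (the sample line is illustrative, not captured from a real run):

# Sketch: parsing parsable sacct output (sample value illustrative).
jid=12345
raw=$(sacct -j "$jid" -n -X -P -o State,ExitCode 2>/dev/null | head -n1 || true)
# raw might look like:  CANCELLED by 501|0:15
state=${raw%%|*}      # "CANCELLED by 501"
state=${state%% *}    # first word: "CANCELLED"
code=${raw##*|}       # "0:15" = return-code:signal
echo "state=$state exit_code=$code"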

# Check if a state is terminal (job is done, for better or worse)
is_terminal_state() {
case "$1" in
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE)
return 0 ;;
*)
return 1 ;;
esac
}

# Wait for file to appear, using robust state checking.
# Never give up due to transient squeue/sacct failures — the CI job timeout
# is the ultimate backstop.
echo "Waiting for job to start..."
squeue_retries=0
max_squeue_retries=5
unknown_count=0
while [ ! -f "$output_file" ]; do
# Check if job is still queued/running
if squeue -j "$job_id" &>/dev/null; then
squeue_retries=0 # Reset on success
sleep 5
else
squeue_retries=$((squeue_retries + 1))
if [ $squeue_retries -ge $max_squeue_retries ]; then
# Job not in queue and output file doesn't exist
if [ ! -f "$output_file" ]; then
echo "ERROR: Job $job_id not in queue and output file not created"
state=$(get_job_state "$job_id")

case "$state" in
PENDING|CONFIGURING)
unknown_count=0
sleep 5
;;
RUNNING|COMPLETING)
unknown_count=0
# Job is running but output file not yet visible (NFS delay)
sleep 2
;;
UNKNOWN)
unknown_count=$((unknown_count + 1))
# Only print warning periodically to avoid log spam
if [ $((unknown_count % 12)) -eq 1 ]; then
echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
fi
sleep 5
;;
*)
# Terminal state — job finished without creating output
if is_terminal_state "$state"; then
echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
exit 1
fi
break
fi
# Exponential backoff
sleep_time=$((2 ** squeue_retries))
echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
sleep $sleep_time
fi
# Unrecognized state, keep waiting
sleep 5
;;
esac
done
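The UNKNOWN branch throttles its warning: at a 5-second poll, firing when unknown_count % 12 == 1 logs the first failed probe immediately and then roughly once per minute. A bounded illustration of the same count-based throttle:

# Sketch: count-based log throttling (5 s polls => ~1 line/min).
unknown_count=0
while [ "$unknown_count" -lt 30 ]; do
  unknown_count=$((unknown_count + 1))
  if [ $((unknown_count % 12)) -eq 1 ]; then
    echo "poll $unknown_count: state still unknown"
  fi
  sleep 5
done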

echo "=== Streaming output for job $job_id ==="
@@ -57,14 +111,13 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
tail_pid=$!

# Monitor job status and stream output simultaneously
squeue_failures=0
last_heartbeat=$(date +%s)

while true; do
# Try to read from tail output (non-blocking via timeout)
# Read multiple lines if available to avoid falling behind
lines_read=0
while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
while IFS= read -r -t 1 line <&3 2>/dev/null; do
echo "$line"
lines_read=$((lines_read + 1))
last_heartbeat=$(date +%s)
@@ -73,49 +126,30 @@ while true; do
break
fi
done

# Check job status
current_time=$(date +%s)
if ! squeue -j "$job_id" &>/dev/null; then
squeue_failures=$((squeue_failures + 1))
# Check if job actually completed using sacct (if available)
if [ $squeue_failures -ge 3 ]; then
if command -v sacct >/dev/null 2>&1; then
state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
# Consider job done only if it reached a terminal state
case "$state" in
COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
break
;;
*)
# treat as transient failure, reset failures and continue polling
squeue_failures=0
;;
esac
else
# No sacct: assume job completed after 3 failures
echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
break
fi
fi
state=$(get_job_state "$job_id")

if is_terminal_state "$state"; then
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
break
else
squeue_failures=0
# Print heartbeat if no output for 60 seconds
if [ $((current_time - last_heartbeat)) -ge 60 ]; then
echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
last_heartbeat=$current_time
fi
fi

# Sleep briefly between status checks
sleep 1
done
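Both read timeouts in this script moved from fractional values (0.1 and 0.5) to a whole second; commit 9cf00d3 traces a bash segfault to the fractional-timeout path. A minimal sketch of the integer-timeout drain loop on its own (file path illustrative):

# Sketch: stream a growing file through an fd with an integer read
# timeout; fractional -t values crashed some bash builds (9cf00d3).
out=/tmp/job.out
touch "$out"
exec 3< <(tail -f "$out")
tail_pid=$!
while IFS= read -r -t 1 line <&3; do
  printf '%s\n' "$line"
done   # read returns >128 on timeout once output goes quiet for 1 s
exec 3<&-
kill "$tail_pid" 2>/dev/null || true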

# Drain any remaining output from tail after job completes
echo "Draining remaining output..."
drain_count=0
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
while IFS= read -r -t 1 line <&3 2>/dev/null; do
echo "$line"
drain_count=$((drain_count + 1))
# Safety limit to avoid infinite loop
@@ -128,8 +162,9 @@ done
# Close the file descriptor and kill tail
exec 3<&-
kill "${tail_pid}" 2>/dev/null || true
tail_pid=""

# Wait for output file to finish growing (stabilize) before stopping tail
# Wait for output file to stabilize (NFS flush) before final read
if [ -f "$output_file" ]; then
last_size=-1
same_count=0
@@ -149,9 +184,6 @@
done
fi
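The body of the stabilization loop is collapsed in this view. The idea, sketched generically (thresholds illustrative, not necessarily the author's exact values): poll the file size until it stops changing for a few consecutive checks, so the final cat sees the fully flushed file.

# Sketch: wait for a file to stop growing before the final read
# (thresholds illustrative; capped so it cannot spin forever).
last_size=-1
same_count=0
checks=0
while [ "$same_count" -lt 3 ] && [ "$checks" -lt 60 ]; do
  checks=$((checks + 1))
  size=$(stat -c %s "$output_file" 2>/dev/null || echo -1)
  if [ "$size" = "$last_size" ]; then
    same_count=$((same_count + 1))
  else
    same_count=0
    last_size=$size
  fi
  sleep 1
done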

# Stop tailing (trap will also handle this on exit)
kill "${tail_pid}" 2>/dev/null || true

echo ""
echo "=== Final output ==="
cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
exit 1
fi

monitor_success=1
echo "Job $job_id completed successfully"
exit 0

79 changes: 15 additions & 64 deletions .github/workflows/bench.yml
@@ -1,85 +1,35 @@
name: 'Benchmark'

on:
# Trigger when Test Suite completes (no polling needed)
workflow_run:
workflows: ["Test Suite"]
types: [completed]
pull_request:
pull_request_review:
types: [submitted]
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
cancel-in-progress: true

jobs:
file-changes:
name: Detect File Changes
# Only run if Test Suite passed (or manual dispatch)
if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
runs-on: 'ubuntu-latest'
outputs:
checkall: ${{ steps.changes.outputs.checkall }}
pr_number: ${{ steps.pr-info.outputs.pr_number }}
pr_approved: ${{ steps.pr-info.outputs.approved }}
pr_author: ${{ steps.pr-info.outputs.author }}
steps:
- name: Clone
uses: actions/checkout@v4
with:
ref: ${{ github.event.workflow_run.head_sha || github.sha }}

- name: Detect Changes
uses: dorny/paths-filter@v3
id: changes
with:
filters: ".github/file-filter.yml"

- name: Get PR Info
id: pr-info
env:
GH_TOKEN: ${{ github.token }}
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "pr_number=" >> $GITHUB_OUTPUT
echo "approved=true" >> $GITHUB_OUTPUT
echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
else
# Get PR number from workflow_run
PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
if [ -n "$PR_NUMBER" ]; then
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT

# Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT

# Check if PR is approved
APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
--jq '[.[] | select(.state == "APPROVED")] | length')
if [ "$APPROVED" -gt 0 ]; then
echo "approved=true" >> $GITHUB_OUTPUT
else
echo "approved=false" >> $GITHUB_OUTPUT
fi
else
echo "pr_number=" >> $GITHUB_OUTPUT
echo "approved=false" >> $GITHUB_OUTPUT
echo "author=" >> $GITHUB_OUTPUT
fi
fi

self:
name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
if: >
github.repository == 'MFlowCode/MFC' &&
needs.file-changes.outputs.checkall == 'true' &&
(
github.event_name == 'workflow_dispatch' ||
needs.file-changes.outputs.pr_approved == 'true' ||
needs.file-changes.outputs.pr_author == 'sbryngelson' ||
needs.file-changes.outputs.pr_author == 'wilfonba'
)
needs: [file-changes]
if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }}
needs: file-changes
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -136,14 +86,10 @@ jobs:
group: ${{ matrix.group }}
labels: ${{ matrix.labels }}
timeout-minutes: 480
env:
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
steps:
- name: Clone - PR
uses: actions/checkout@v4
with:
ref: ${{ github.event.workflow_run.head_sha || github.sha }}
path: pr

- name: Clone - Master
@@ -155,10 +101,15 @@

- name: Setup & Build
if: matrix.build_script != ''
run: |
(cd pr && ${{ matrix.build_script }}) &
(cd master && ${{ matrix.build_script }}) &
wait %1 && wait %2
uses: nick-fields/retry@v3
with:
max_attempts: 3
retry_wait_seconds: 60
timeout_minutes: 480
command: |
(cd pr && ${{ matrix.build_script }}) &
(cd master && ${{ matrix.build_script }}) &
wait %1 && wait %2

- name: Bench (Master v. PR)
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
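The nick-fields/retry step re-runs the whole two-tree build when either half fails. Roughly the same semantics in plain bash, for reference (build.sh stands in for matrix.build_script; PID-based wait instead of the %1/%2 jobspecs used above):

# Sketch: retry both builds together, 3 attempts, 60 s apart.
build_both() {
  (cd pr && ./build.sh) & p1=$!
  (cd master && ./build.sh) & p2=$!
  wait "$p1" && wait "$p2"
}
for attempt in 1 2 3; do
  if build_both; then
    echo "builds succeeded on attempt $attempt"
    break
  fi
  if [ "$attempt" -eq 3 ]; then
    echo "builds failed after 3 attempts" >&2
    exit 1
  fi
  wait   # reap any straggler build before retrying
  echo "attempt $attempt failed; retrying in 60 s"
  sleep 60
done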