From e779ed41323e5a399e77f29ad786144014f33bf4 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Feb 2026 20:54:32 -0500 Subject: [PATCH 01/26] Switch Phoenix GPU jobs to H200 nodes for faster scheduling Co-Authored-By: Claude Opus 4.6 --- .github/workflows/phoenix/submit-bench.sh | 5 ++--- .github/workflows/phoenix/submit.sh | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 7ae85e66fe..fc28b3046b 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -20,9 +20,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -CL40S -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 06a03e465a..5747c839f0 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -23,9 +23,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then From 9cf00d3ee1f479a8902a90012fde4488602308b8 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 11 Feb 2026 21:34:12 -0500 Subject: [PATCH 02/26] Fix bash segfault in monitor_slurm_job.sh from fractional read timeout read -t 0.1 (sub-second timeout) in a loop with process substitution file descriptors triggers a bash internal error (unwind_frame_run: read_builtin: frame not found) leading to a segfault. Use integer timeout (read -t 1) instead. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 27472e01ef..232a894f8a 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -64,7 +64,7 @@ while true; do # Try to read from tail output (non-blocking via timeout) # Read multiple lines if available to avoid falling behind lines_read=0 - while IFS= read -r -t 0.1 line <&3 2>/dev/null; do + while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" lines_read=$((lines_read + 1)) last_heartbeat=$(date +%s) @@ -115,7 +115,7 @@ done # Drain any remaining output from tail after job completes echo "Draining remaining output..." drain_count=0 -while IFS= read -r -t 0.5 line <&3 2>/dev/null; do +while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" drain_count=$((drain_count + 1)) # Safety limit to avoid infinite loop From a59db02c83acbab62874c75c978b8ddb033d7323 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 13:56:50 -0500 Subject: [PATCH 03/26] Restore pull_request_review trigger for benchmark workflow PR #1124 changed bench.yml to use workflow_run (triggered after Test Suite completes), which broke the approve-to-run flow for fork PRs. Revert to the original pull_request + pull_request_review triggers while keeping improvements (frontier_amd matrix, concurrency group, timeout, run_parallel_benchmarks.sh). Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 65 ++++--------------------------------- 1 file changed, 7 insertions(+), 58 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 6279f5f578..fd240b7a11 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -1,32 +1,24 @@ name: 'Benchmark' on: - # Trigger when Test Suite completes (no polling needed) - workflow_run: - workflows: ["Test Suite"] - types: [completed] + pull_request: + pull_request_review: + types: [submitted] workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }} + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: file-changes: name: Detect File Changes - # Only run if Test Suite passed (or manual dispatch) - if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' runs-on: 'ubuntu-latest' outputs: checkall: ${{ steps.changes.outputs.checkall }} - pr_number: ${{ steps.pr-info.outputs.pr_number }} - pr_approved: ${{ steps.pr-info.outputs.approved }} - pr_author: ${{ steps.pr-info.outputs.author }} steps: - name: Clone uses: actions/checkout@v4 - with: - ref: ${{ github.event.workflow_run.head_sha || github.sha }} - name: Detect Changes uses: dorny/paths-filter@v3 @@ -34,52 +26,10 @@ jobs: with: filters: ".github/file-filter.yml" - - name: Get PR Info - id: pr-info - env: - GH_TOKEN: ${{ github.token }} - run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - echo "pr_number=" >> $GITHUB_OUTPUT - echo "approved=true" >> $GITHUB_OUTPUT - echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT - else - # Get PR number from workflow_run - PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}" - if [ -n "$PR_NUMBER" ]; then - echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT - - # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author) - PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login') - echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT - - # Check if PR is approved - APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \ - --jq '[.[] | select(.state == "APPROVED")] | length') - if [ "$APPROVED" -gt 0 ]; then - echo "approved=true" >> $GITHUB_OUTPUT - else - echo "approved=false" >> $GITHUB_OUTPUT - fi - else - echo "pr_number=" >> $GITHUB_OUTPUT - echo "approved=false" >> $GITHUB_OUTPUT - echo "author=" >> $GITHUB_OUTPUT - fi - fi - self: name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" - if: > - github.repository == 'MFlowCode/MFC' && - needs.file-changes.outputs.checkall == 'true' && - ( - github.event_name == 'workflow_dispatch' || - needs.file-changes.outputs.pr_approved == 'true' || - needs.file-changes.outputs.pr_author == 'sbryngelson' || - needs.file-changes.outputs.pr_author == 'wilfonba' - ) - needs: [file-changes] + if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }} + needs: file-changes strategy: fail-fast: false matrix: @@ -143,7 +93,6 @@ jobs: - name: Clone - PR uses: actions/checkout@v4 with: - ref: ${{ github.event.workflow_run.head_sha || github.sha }} path: pr - name: Clone - Master @@ -155,7 +104,7 @@ jobs: - name: Setup & Build if: matrix.build_script != '' - run: | + run: | (cd pr && ${{ matrix.build_script }}) & (cd master && ${{ matrix.build_script }}) & wait %1 && wait %2 From 2efc61e1eb98a8ea1287275c407cff5539a153f1 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 17:28:56 -0500 Subject: [PATCH 04/26] Auto-retry sporadic test failures in CI Write failed test UUIDs to tests/failed_uuids.txt after a test run. In CI, if 1-5 tests fail, automatically re-run just those tests. If 6+ fail, treat it as a real issue and fail immediately. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 19 +++++++++++++++++-- toolchain/mfc/test/test.py | 9 +++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0be51076ec..3a5a0e33d7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -134,8 +134,23 @@ jobs: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} - name: Test - run: | - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT + run: | + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || true + + # Retry only if a small number of tests failed (sporadic failures) + if [ -f tests/failed_uuids.txt ]; then + NUM_FAILED=$(wc -l < tests/failed_uuids.txt) + if [ "$NUM_FAILED" -le 5 ]; then + FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ') + echo "" + echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" + echo "" + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL + else + echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." + exit 1 + fi + fi env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }} diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 31a3771cb9..d6dce92436 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -206,6 +206,15 @@ def test(): # Build the summary report _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases) + # Write failed UUIDs to file for CI retry logic + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") + if failed_tests: + with open(failed_uuids_path, "w") as f: + for test_info in failed_tests: + f.write(test_info['uuid'] + "\n") + elif os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + exit(nFAIL) From 0658bd348512de9f051b8ca4c7adbbb9a19f576b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 17:40:49 -0500 Subject: [PATCH 05/26] Preserve exit code for catastrophic test failures Don't mask non-zero exit codes when tests crash before writing failed_uuids.txt. Only suppress the exit code when the file exists (meaning the test framework ran to completion and we can retry). Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3a5a0e33d7..eec9d19fd0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -135,7 +135,8 @@ jobs: - name: Test run: | - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || true + TEST_EXIT=0 + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$? # Retry only if a small number of tests failed (sporadic failures) if [ -f tests/failed_uuids.txt ]; then @@ -150,6 +151,8 @@ jobs: echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." exit 1 fi + elif [ "$TEST_EXIT" -ne 0 ]; then + exit $TEST_EXIT fi env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} From c6b6f8134409a0f99a375327752e4a5eee0c834d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 21:48:39 -0500 Subject: [PATCH 06/26] Harden SLURM monitor: robust state checks, orphan cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace squeue exit-code polling with get_job_state() that parses the actual state string (squeue + sacct fallback). Never give up on UNKNOWN state — CI timeout is the backstop. Cancel orphaned SLURM jobs on abnormal monitor exit. Include job state in heartbeats. Incorporates changes from PR #1140. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 138 +++++++++++++++++---------- 1 file changed, 85 insertions(+), 53 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 232a894f8a..408d205aab 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -4,11 +4,17 @@ set -euo pipefail -# Cleanup handler to prevent orphaned tail processes +# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs cleanup() { if [ -n "${tail_pid:-}" ]; then kill "${tail_pid}" 2>/dev/null || true fi + # Cancel the SLURM job if the monitor is exiting due to an error + # (e.g., the CI runner is being killed). Don't cancel on success. + if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then + echo "Monitor exiting abnormally — cancelling SLURM job $job_id" + scancel "$job_id" 2>/dev/null || true + fi } trap cleanup EXIT @@ -23,30 +29,78 @@ output_file="$2" echo "Submitted batch job $job_id" echo "Monitoring output file: $output_file" -# Wait for file to appear with retry logic for transient squeue failures +# Robustly check SLURM job state using squeue with sacct fallback. +# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.) +# or "UNKNOWN" if both commands fail. +get_job_state() { + local jid="$1" + local state + + # Try squeue first (fast, works for active jobs) + state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ') + if [ -n "$state" ]; then + echo "$state" + return + fi + + # Fallback to sacct (works for completed/historical jobs) + if command -v sacct >/dev/null 2>&1; then + state=$(sacct -j "$jid" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') + if [ -n "$state" ]; then + echo "$state" + return + fi + fi + + echo "UNKNOWN" +} + +# Check if a state is terminal (job is done, for better or worse) +is_terminal_state() { + case "$1" in + COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE) + return 0 ;; + *) + return 1 ;; + esac +} + +# Wait for file to appear, using robust state checking. +# Never give up due to transient squeue/sacct failures — the CI job timeout +# is the ultimate backstop. echo "Waiting for job to start..." -squeue_retries=0 -max_squeue_retries=5 +unknown_count=0 while [ ! -f "$output_file" ]; do - # Check if job is still queued/running - if squeue -j "$job_id" &>/dev/null; then - squeue_retries=0 # Reset on success - sleep 5 - else - squeue_retries=$((squeue_retries + 1)) - if [ $squeue_retries -ge $max_squeue_retries ]; then - # Job not in queue and output file doesn't exist - if [ ! -f "$output_file" ]; then - echo "ERROR: Job $job_id not in queue and output file not created" + state=$(get_job_state "$job_id") + + case "$state" in + PENDING|CONFIGURING) + unknown_count=0 + sleep 5 + ;; + RUNNING|COMPLETING) + unknown_count=0 + # Job is running but output file not yet visible (NFS delay) + sleep 2 + ;; + UNKNOWN) + unknown_count=$((unknown_count + 1)) + # Only print warning periodically to avoid log spam + if [ $((unknown_count % 12)) -eq 1 ]; then + echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..." + fi + sleep 5 + ;; + *) + # Terminal state — job finished without creating output + if is_terminal_state "$state"; then + echo "ERROR: Job $job_id reached terminal state ($state) without creating output file" exit 1 fi - break - fi - # Exponential backoff - sleep_time=$((2 ** squeue_retries)) - echo "Warning: squeue check failed, retrying in ${sleep_time}s..." - sleep $sleep_time - fi + # Unrecognized state, keep waiting + sleep 5 + ;; + esac done echo "=== Streaming output for job $job_id ===" @@ -57,7 +111,6 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1) tail_pid=$! # Monitor job status and stream output simultaneously -squeue_failures=0 last_heartbeat=$(date +%s) while true; do @@ -73,41 +126,22 @@ while true; do break fi done - + # Check job status current_time=$(date +%s) - if ! squeue -j "$job_id" &>/dev/null; then - squeue_failures=$((squeue_failures + 1)) - # Check if job actually completed using sacct (if available) - if [ $squeue_failures -ge 3 ]; then - if command -v sacct >/dev/null 2>&1; then - state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') - # Consider job done only if it reached a terminal state - case "$state" in - COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY) - echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state" - break - ;; - *) - # treat as transient failure, reset failures and continue polling - squeue_failures=0 - ;; - esac - else - # No sacct: assume job completed after 3 failures - echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue" - break - fi - fi + state=$(get_job_state "$job_id") + + if is_terminal_state "$state"; then + echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state" + break else - squeue_failures=0 # Print heartbeat if no output for 60 seconds if [ $((current_time - last_heartbeat)) -ge 60 ]; then - echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..." + echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..." last_heartbeat=$current_time fi fi - + # Sleep briefly between status checks sleep 1 done @@ -128,6 +162,7 @@ done # Close the file descriptor and kill tail exec 3<&- kill "${tail_pid}" 2>/dev/null || true +tail_pid="" # Wait for output file to finish growing (stabilize) before stopping tail if [ -f "$output_file" ]; then @@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then done fi -# Stop tailing (trap will also handle this on exit) -kill "${tail_pid}" 2>/dev/null || true - echo "" echo "=== Final output ===" cat "$output_file" @@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then exit 1 fi +monitor_success=1 echo "Job $job_id completed successfully" exit 0 - From a82959e1e793bfba5983cad3f0c4c84c85da795c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 22:26:38 -0500 Subject: [PATCH 07/26] Use parsable sacct flags for robust state parsing Use -n -X -P flags with sacct: -X restricts to job allocation (not steps), -P gives pipe-delimited output for reliable parsing. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 408d205aab..d9f2237032 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -45,7 +45,7 @@ get_job_state() { # Fallback to sacct (works for completed/historical jobs) if command -v sacct >/dev/null 2>&1; then - state=$(sacct -j "$jid" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') + state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1) if [ -n "$state" ]; then echo "$state" return From 80229694a2283a5a5b9eac5ad5c5ef123934c669 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 22:49:45 -0500 Subject: [PATCH 08/26] Guard squeue/sacct pipelines against set -euo pipefail With pipefail, a transient squeue failure would exit the script instead of falling through to return UNKNOWN. Add || true to both pipelines. Also fix stale comment about tail stopping. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index d9f2237032..4981e5e607 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -37,7 +37,7 @@ get_job_state() { local state # Try squeue first (fast, works for active jobs) - state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ') + state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true) if [ -n "$state" ]; then echo "$state" return @@ -45,7 +45,7 @@ get_job_state() { # Fallback to sacct (works for completed/historical jobs) if command -v sacct >/dev/null 2>&1; then - state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1) + state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true) if [ -n "$state" ]; then echo "$state" return @@ -164,7 +164,7 @@ exec 3<&- kill "${tail_pid}" 2>/dev/null || true tail_pid="" -# Wait for output file to finish growing (stabilize) before stopping tail +# Wait for output file to stabilize (NFS flush) before final read if [ -f "$output_file" ]; then last_size=-1 same_count=0 From 88d19ce4b562f14c9abf832c6cae19e4fc0851ea Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 13 Feb 2026 09:29:42 -0500 Subject: [PATCH 09/26] Retry delete_directory on Lustre ENOTEMPTY race shutil.rmtree can fail with "Directory not empty" on networked filesystems (Lustre) due to metadata propagation delays. Retry up to 5 times with 1s backoff before raising. Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/common.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/toolchain/mfc/common.py b/toolchain/mfc/common.py index ce02e8251c..e56c6a9eb4 100644 --- a/toolchain/mfc/common.py +++ b/toolchain/mfc/common.py @@ -1,4 +1,4 @@ -import os, yaml, typing, shutil, subprocess, logging +import os, yaml, typing, shutil, subprocess, logging, time from os.path import join, abspath, normpath, dirname, realpath @@ -122,8 +122,16 @@ def create_directory(dirpath: str) -> None: def delete_directory(dirpath: str) -> None: - if os.path.isdir(dirpath): - shutil.rmtree(dirpath) + for attempt in range(5): + if not os.path.isdir(dirpath): + return + try: + shutil.rmtree(dirpath) + return + except OSError: + if attempt == 4: + raise + time.sleep(1) def get_program_output(arguments: typing.List[str] = None, cwd=None): From 05d28f37bb01c099d40b365ab2857c169e6954ab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 13 Feb 2026 09:44:16 -0500 Subject: [PATCH 10/26] Remove stale failed_uuids.txt before test run On self-hosted runners the workspace persists between runs, so a leftover file could trigger spurious retries. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index eec9d19fd0..21e52d5a5e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -135,6 +135,7 @@ jobs: - name: Test run: | + rm -f tests/failed_uuids.txt TEST_EXIT=0 /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$? From 9eed0c65d2d114cc7aa8134fc3d3cf809b18c0b5 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 13 Feb 2026 10:08:33 -0500 Subject: [PATCH 11/26] Split benchmark concurrency group by event type Bot review events (pull_request_review) were racing against and cancelling legitimate push-triggered (pull_request) benchmark runs via the shared concurrency group. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index fd240b7a11..53efac21ed 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -7,7 +7,7 @@ on: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: From edefc015be0f5b707b596a0718467100e6640ae5 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 14 Feb 2026 20:26:51 -0500 Subject: [PATCH 12/26] Revert Phoenix test jobs to multi-partition GPU scheduling Keep H200 targeting only for benchmarks; tests should run on any available GPU partition for faster scheduling. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/phoenix/submit.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 5747c839f0..06a03e465a 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -23,8 +23,9 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH --gres=gpu:H200:2 -#SBATCH --ntasks-per-node=8 # Number of cores per node required\ +#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s +#SBATCH --ntasks-per-node=4 # Number of cores per node required +#SBATCH -G2\ " if [ "$2" = "cpu" ]; then From 2e15ab646d00a521e8835734b1cf74f3dabb5fb9 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 14 Feb 2026 20:35:46 -0500 Subject: [PATCH 13/26] Fix doc lint for generated pages and hyphenated page IDs Add build-time generated page IDs (parameters, cli-reference, examples, case_constraints) to the known set, and allow hyphens in @page/@ref ID patterns. Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/lint_docs.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/toolchain/mfc/lint_docs.py b/toolchain/mfc/lint_docs.py index 783ac22c66..9c4be131c1 100644 --- a/toolchain/mfc/lint_docs.py +++ b/toolchain/mfc/lint_docs.py @@ -53,8 +53,8 @@ "docs/documentation/case.md": CASE_MD_SKIP, } -# Match @ref page_id patterns -REF_RE = re.compile(r"@ref\s+(\w+)") +# Match @ref page_id patterns (allow hyphens in page IDs like cli-reference) +REF_RE = re.compile(r"@ref\s+([\w-]+)") def check_docs(repo_root: Path) -> list[str]: @@ -322,10 +322,13 @@ def check_page_refs(repo_root: Path) -> list[str]: return [] # Collect all @page identifiers - page_ids = {"citelist"} # Doxygen built-in + # Include Doxygen built-ins and pages generated at build time by + # gen_parameters.sh, gen_cli_reference.sh, examples.sh, and + # gen_case_constraints_docs.py. + page_ids = {"citelist", "parameters", "cli-reference", "examples", "case_constraints"} for md_file in doc_dir.glob("*.md"): text = md_file.read_text(encoding="utf-8") - m = re.search(r"^\s*@page\s+(\w+)", text, flags=re.MULTILINE) + m = re.search(r"^\s*@page\s+([\w-]+)", text, flags=re.MULTILINE) if m: page_ids.add(m.group(1)) From dfc524ced81caf1b312a47361c14e062e63cd98c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 14 Feb 2026 23:05:49 -0500 Subject: [PATCH 14/26] Add Lustre-safe workspace cleanup for self-hosted runners Disable actions/checkout's built-in git clean (which fails on Lustre with ESTALE/ENOTEMPTY errors) and add a retry-based rm -rf step before checkout instead. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 11 +++++++++++ .github/workflows/test.yml | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 53efac21ed..9a45201376 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -90,14 +90,25 @@ jobs: ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: + - name: Clean workspace (Lustre-safe) + run: | + for i in 1 2 3 4 5; do + rm -rf "$GITHUB_WORKSPACE"/pr "$GITHUB_WORKSPACE"/master 2>/dev/null && break + echo "Clean attempt $i failed, retrying in 2s..." + sleep 2 + done + true + - name: Clone - PR uses: actions/checkout@v4 with: + clean: false path: pr - name: Clone - Master uses: actions/checkout@v4 with: + clean: false repository: MFlowCode/MFC ref: master path: master diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 21e52d5a5e..4709ceb84c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -219,8 +219,19 @@ jobs: ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: + - name: Clean workspace (Lustre-safe) + run: | + for i in 1 2 3 4 5; do + rm -rf "$GITHUB_WORKSPACE"/{.,}* 2>/dev/null && break + echo "Clean attempt $i failed, retrying in 2s..." + sleep 2 + done + true + - name: Clone uses: actions/checkout@v4 + with: + clean: false - name: Build if: matrix.cluster != 'phoenix' From ece195155a142e1a3560e9d973434d35df47efb3 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Feb 2026 11:03:25 -0500 Subject: [PATCH 15/26] Revert Phoenix benchmark jobs to L40S GPU scheduling The H200 switch needs to land on master first so both PR and master benchmark builds use the same node type. Split into a separate PR. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/phoenix/submit-bench.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index fc28b3046b..7ae85e66fe 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -20,8 +20,9 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH --gres=gpu:H200:2 -#SBATCH --ntasks-per-node=8 # Number of cores per node required\ +#SBATCH -CL40S +#SBATCH --ntasks-per-node=4 # Number of cores per node required +#SBATCH -G2\ " if [ "$2" = "cpu" ]; then From a553a7573ad153793ac1a31bab13cc16cc990d46 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Feb 2026 16:02:08 -0500 Subject: [PATCH 16/26] Improve Lustre-safe workspace cleanup with dotglob and nullglob Use shopt dotglob/nullglob for cleaner glob expansion instead of manual dotfile patterns. Keep retry loop for Lustre ESTALE resilience. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 5 +++-- .github/workflows/test.yml | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 9a45201376..a3e19ca495 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -91,13 +91,14 @@ jobs: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: - name: Clean workspace (Lustre-safe) + shell: bash run: | + shopt -s dotglob nullglob for i in 1 2 3 4 5; do - rm -rf "$GITHUB_WORKSPACE"/pr "$GITHUB_WORKSPACE"/master 2>/dev/null && break + rm -rf -- "${GITHUB_WORKSPACE:?}/"* 2>/dev/null && break echo "Clean attempt $i failed, retrying in 2s..." sleep 2 done - true - name: Clone - PR uses: actions/checkout@v4 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4709ceb84c..9d0e1a5ec7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -220,13 +220,14 @@ jobs: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: - name: Clean workspace (Lustre-safe) + shell: bash run: | + shopt -s dotglob nullglob for i in 1 2 3 4 5; do - rm -rf "$GITHUB_WORKSPACE"/{.,}* 2>/dev/null && break + rm -rf -- "${GITHUB_WORKSPACE:?}/"* 2>/dev/null && break echo "Clean attempt $i failed, retrying in 2s..." sleep 2 done - true - name: Clone uses: actions/checkout@v4 From a1498864ad6e77f96ea2fe9e9250ec552fd359d7 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Feb 2026 16:09:17 -0500 Subject: [PATCH 17/26] Auto-requeue SLURM jobs on preemption Add --requeue to Phoenix sbatch scripts so preempted embers-QOS jobs are automatically rescheduled. Remove PREEMPTED from the monitor's terminal state list so it keeps waiting through the requeue cycle. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 2 +- .github/workflows/phoenix/submit-bench.sh | 1 + .github/workflows/phoenix/submit.sh | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 4981e5e607..16717551cd 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -58,7 +58,7 @@ get_job_state() { # Check if a state is terminal (job is done, for better or worse) is_terminal_state() { case "$1" in - COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE) + COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE) return 0 ;; *) return 1 ;; diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 7ae85e66fe..a3830f5050 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -44,6 +44,7 @@ sbatch < Date: Sun, 15 Feb 2026 19:19:43 -0500 Subject: [PATCH 18/26] Remove aggressive workspace cleanup from test jobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rm -rf * was destroying the build cache, causing CMake to rebuild from scratch and hit Lustre ioctl errors. With clean: false on checkout, git clean is already disabled — no pre-cleanup needed. Keep full cleanup only in bench.yml where pr/master are fresh clones. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c1e383f0e0..33d104fa73 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -222,16 +222,6 @@ jobs: ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: - - name: Clean workspace (Lustre-safe) - shell: bash - run: | - shopt -s dotglob nullglob - for i in 1 2 3 4 5; do - rm -rf -- "${GITHUB_WORKSPACE:?}/"* 2>/dev/null && break - echo "Clean attempt $i failed, retrying in 2s..." - sleep 2 - done - - name: Clone uses: actions/checkout@v4 with: From c2e6543f460b397efff538dacc6739f51d16aa7e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Feb 2026 19:37:36 -0500 Subject: [PATCH 19/26] Propagate exit code from test retry command Without || exit $?, a failed retry would silently exit 0. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 33d104fa73..ac10cab0d5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -150,7 +150,7 @@ jobs: echo "" echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" echo "" - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL || exit $? else echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." exit 1 From c0b1cd15285a3acb1e1aeaf715a0f4bed62770a2 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 15 Feb 2026 19:51:56 -0500 Subject: [PATCH 20/26] Restore default checkout clean for test jobs; tune PR reviewer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert test.yml to clean: true (default) — the corrupted build cache from the ioctl failure was causing 100% test failures. The Lustre-safe cleanup is only needed for bench.yml where pr/master are separate trees. Also tune qodo PR reviewer: reduce max findings to 5, lower suggestion depth to medium, and add instructions to focus on correctness over style for CI scripts. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 2 -- .pr_agent.toml | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ac10cab0d5..0864fe345c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -224,8 +224,6 @@ jobs: steps: - name: Clone uses: actions/checkout@v4 - with: - clean: false - name: Build if: matrix.cluster != 'phoenix' diff --git a/.pr_agent.toml b/.pr_agent.toml index 9411d1cfe7..f87cd95910 100644 --- a/.pr_agent.toml +++ b/.pr_agent.toml @@ -9,7 +9,7 @@ handle_push_trigger = true push_commands = ["/improve"] [pr_reviewer] # (all fields optional) -num_max_findings = 10 # how many items to surface +num_max_findings = 5 # how many items to surface require_tests_review = true extra_instructions = """ Project context and review priorities: .github/copilot-instructions.md @@ -26,8 +26,12 @@ constraints for new parameters, and compiler portability across all four supported compilers. Python toolchain requires Python 3.10+; do not suggest __future__ imports or other backwards-compatibility shims. +For CI/shell scripts, focus on correctness bugs only — not style, atomic +writes, or FIFO alternatives. +Do not suggest changes to code that was not modified in the PR. """ [pr_code_suggestions] +suggestions_depth = "medium" commitable_code_suggestions = true apply_suggestions_checkbox = true From 7a764d5769dfa6f70908267c39a09760eaa8ccda Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 16 Feb 2026 09:48:33 -0500 Subject: [PATCH 21/26] Remove aggressive workspace cleanup from bench jobs The Lustre-safe cleanup step was wiping the build cache (pr/build/, master/build/), forcing full rebuilds every run. This added ~32 min of build time and pushed NVHPC gpu-omp benchmarks past the 4h SLURM limit. Restore default checkout behavior to preserve build cache across runs. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index a3e19ca495..53efac21ed 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -90,26 +90,14 @@ jobs: ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: - - name: Clean workspace (Lustre-safe) - shell: bash - run: | - shopt -s dotglob nullglob - for i in 1 2 3 4 5; do - rm -rf -- "${GITHUB_WORKSPACE:?}/"* 2>/dev/null && break - echo "Clean attempt $i failed, retrying in 2s..." - sleep 2 - done - - name: Clone - PR uses: actions/checkout@v4 with: - clean: false path: pr - name: Clone - Master uses: actions/checkout@v4 with: - clean: false repository: MFlowCode/MFC ref: master path: master From b5dfa1fbeaee3dda3b65f32193f90a3f6ab7d5a8 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 16 Feb 2026 10:59:42 -0500 Subject: [PATCH 22/26] Add test sharding for Frontier CI; switch to batch/hackathon partition Split Frontier GPU test configs into 2 shards (~75 min each) so they fit within the batch partition's 2h wall time limit. This allows all Frontier SLURM jobs to run concurrently instead of serially on the extended partition (which has a 1-job-per-user limit), reducing total CI wall clock from ~4.5h to ~2h. Changes: - Add --shard CLI argument (e.g., --shard 1/2) with modulo-based round-robin distribution across shards - Switch Frontier submit scripts from extended to batch/hackathon (CFD154 account, 1h59m wall time) - Shard the 3 Frontier GPU matrix entries into 6 (2 shards each) - CPU entries remain unsharded Co-Authored-By: Claude Opus 4.6 --- .github/workflows/frontier/submit.sh | 8 +++--- .github/workflows/frontier/test.sh | 7 ++++- .github/workflows/frontier_amd/submit.sh | 8 +++--- .github/workflows/frontier_amd/test.sh | 7 ++++- .github/workflows/test.yml | 34 +++++++++++++++++++++--- toolchain/mfc/cli/commands.py | 6 +++++ toolchain/mfc/test/test.py | 5 ++++ 7 files changed, 63 insertions(+), 12 deletions(-) diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index d5b416c65a..4c3e0e3e27 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -34,12 +34,13 @@ output_file="$job_slug.out" submit_output=$(sbatch < typing.List[TestCase]: skipped_cases += example_cases cases = [case for case in cases if case not in example_cases] + if ARG("shard") is not None: + shard_idx, shard_count = (int(x) for x in ARG("shard").split("/")) + skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1] + cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1] + if ARG("percent") == 100: return cases, skipped_cases From 475caa32563291798b776eef8da4434e86d3a98f Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 16 Feb 2026 11:19:58 -0500 Subject: [PATCH 23/26] Validate --shard argument format and bounds Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/test/test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 52c143038a..54e00186dd 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -100,7 +100,10 @@ def __filter(cases_) -> typing.List[TestCase]: cases = [case for case in cases if case not in example_cases] if ARG("shard") is not None: - shard_idx, shard_count = (int(x) for x in ARG("shard").split("/")) + parts = ARG("shard").split("/") + if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]): + raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').") + shard_idx, shard_count = int(parts[0]), int(parts[1]) skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1] cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1] From ddd95ac58e0bd9b650310058549aadb94516672d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 16 Feb 2026 14:48:35 -0500 Subject: [PATCH 24/26] Use nick-fields/retry for Frontier builds; reduce -j to 4 Move build retry logic from shell scripts to GHA using nick-fields/retry with 60s backoff between attempts. This gives better visibility into retries and lets login node memory pressure subside between attempts. Also reduce build parallelism from -j 8 to -j 4 to lower peak memory on shared Frontier login nodes, and remove the outdated Node 16 version overrides from self-hosted runner env. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 15 +++++---- .github/workflows/frontier/build.sh | 43 ++++--------------------- .github/workflows/frontier_amd/build.sh | 43 ++++--------------------- .github/workflows/test.yml | 8 +++-- 4 files changed, 27 insertions(+), 82 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 53efac21ed..bc91c2635c 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -86,9 +86,6 @@ jobs: group: ${{ matrix.group }} labels: ${{ matrix.labels }} timeout-minutes: 480 - env: - ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: - name: Clone - PR uses: actions/checkout@v4 @@ -104,10 +101,14 @@ jobs: - name: Setup & Build if: matrix.build_script != '' - run: | - (cd pr && ${{ matrix.build_script }}) & - (cd master && ${{ matrix.build_script }}) & - wait %1 && wait %2 + uses: nick-fields/retry@v3 + with: + max_attempts: 3 + retry_wait_seconds: 60 + command: | + (cd pr && ${{ matrix.build_script }}) & + (cd master && ${{ matrix.build_script }}) & + wait %1 && wait %2 - name: Bench (Master v. PR) run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 18cddc96ca..cbfaf3eada 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -18,39 +18,10 @@ fi . ./mfc.sh load -c f -m g -max_attempts=3 -attempt=1 -while [ $attempt -le $max_attempts ]; do - echo "Build attempt $attempt of $max_attempts..." - if [ "$run_bench" == "bench" ]; then - build_cmd_ok=true - for dir in benchmarks/*/; do - dirname=$(basename "$dir") - if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then - build_cmd_ok=false - break - fi - done - else - if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then - build_cmd_ok=true - else - build_cmd_ok=false - fi - fi - - if [ "$build_cmd_ok" = true ]; then - echo "Build succeeded on attempt $attempt." - exit 0 - fi - - if [ $attempt -lt $max_attempts ]; then - echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..." - ./mfc.sh clean - sleep 30 - fi - attempt=$((attempt + 1)) -done - -echo "Build failed after $max_attempts attempts." -exit 1 +if [ "$run_bench" == "bench" ]; then + for dir in benchmarks/*/; do + ./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts + done +else + ./mfc.sh test -v -a --dry-run --rdma-mpi -j 4 $build_opts +fi diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh index 56c47d8ff4..9442a10d57 100644 --- a/.github/workflows/frontier_amd/build.sh +++ b/.github/workflows/frontier_amd/build.sh @@ -18,39 +18,10 @@ fi . ./mfc.sh load -c famd -m g -max_attempts=3 -attempt=1 -while [ $attempt -le $max_attempts ]; do - echo "Build attempt $attempt of $max_attempts..." - if [ "$run_bench" == "bench" ]; then - build_cmd_ok=true - for dir in benchmarks/*/; do - dirname=$(basename "$dir") - if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then - build_cmd_ok=false - break - fi - done - else - if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then - build_cmd_ok=true - else - build_cmd_ok=false - fi - fi - - if [ "$build_cmd_ok" = true ]; then - echo "Build succeeded on attempt $attempt." - exit 0 - fi - - if [ $attempt -lt $max_attempts ]; then - echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..." - ./mfc.sh clean - sleep 30 - fi - attempt=$((attempt + 1)) -done - -echo "Build failed after $max_attempts attempts." -exit 1 +if [ "$run_bench" == "bench" ]; then + for dir in benchmarks/*/; do + ./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts + done +else + ./mfc.sh test -v -a --dry-run -j 4 $build_opts +fi diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4a4e70fd45..d36bc1686a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -245,15 +245,17 @@ jobs: labels: ${{ matrix.runner }} env: NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }} - ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: - name: Clone uses: actions/checkout@v4 - name: Build if: matrix.cluster != 'phoenix' - run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} + uses: nick-fields/retry@v3 + with: + max_attempts: 3 + retry_wait_seconds: 60 + command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - name: Test run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }} From 197813a97a7b1a68831de4bfb7b638e94bfb005b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 16 Feb 2026 14:56:11 -0500 Subject: [PATCH 25/26] Add set -e to Frontier build scripts for fail-fast behavior Without set -e, the benchmark build loop could silently ignore failures of earlier benchmarks if a later one succeeded, since only the last command's exit code would propagate. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/frontier/build.sh | 2 ++ .github/workflows/frontier_amd/build.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index cbfaf3eada..199eda213c 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -e + # Ignore SIGHUP to survive login node session drops trap '' HUP diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh index 9442a10d57..60e396b54d 100644 --- a/.github/workflows/frontier_amd/build.sh +++ b/.github/workflows/frontier_amd/build.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -e + # Ignore SIGHUP to survive login node session drops trap '' HUP From 73072bf22b6a56664a7012d59572e3ce5b437b8e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 16 Feb 2026 15:45:36 -0500 Subject: [PATCH 26/26] Add required timeout_minutes to nick-fields/retry steps nick-fields/retry@v3 requires either timeout_minutes or timeout_seconds. Set to 480 minutes to match the GHA job timeout. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 1 + .github/workflows/test.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index bc91c2635c..eed7e002c7 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -105,6 +105,7 @@ jobs: with: max_attempts: 3 retry_wait_seconds: 60 + timeout_minutes: 480 command: | (cd pr && ${{ matrix.build_script }}) & (cd master && ${{ matrix.build_script }}) & diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d36bc1686a..eabb97b8fb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -255,6 +255,7 @@ jobs: with: max_attempts: 3 retry_wait_seconds: 60 + timeout_minutes: 480 command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - name: Test