From e779ed41323e5a399e77f29ad786144014f33bf4 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 9 Feb 2026 20:54:32 -0500
Subject: [PATCH 01/26] Switch Phoenix GPU jobs to H200 nodes for faster
 scheduling

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/submit-bench.sh | 5 ++---
 .github/workflows/phoenix/submit.sh       | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh
index 7ae85e66fe..fc28b3046b 100644
--- a/.github/workflows/phoenix/submit-bench.sh
+++ b/.github/workflows/phoenix/submit-bench.sh
@@ -20,9 +20,8 @@ sbatch_cpu_opts="\
 "
 
 sbatch_gpu_opts="\
-#SBATCH -CL40S
-#SBATCH --ntasks-per-node=4       # Number of cores per node required
-#SBATCH -G2\
+#SBATCH --gres=gpu:H200:2
+#SBATCH --ntasks-per-node=8       # Number of cores per node required\
 "
 
 if [ "$2" = "cpu" ]; then
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 06a03e465a..5747c839f0 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -23,9 +23,8 @@ sbatch_cpu_opts="\
 "
 
 sbatch_gpu_opts="\
-#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
-#SBATCH --ntasks-per-node=4       # Number of cores per node required
-#SBATCH -G2\
+#SBATCH --gres=gpu:H200:2
+#SBATCH --ntasks-per-node=8       # Number of cores per node required\
 "
 
 if [ "$2" = "cpu" ]; then

From 9cf00d3ee1f479a8902a90012fde4488602308b8 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Wed, 11 Feb 2026 21:34:12 -0500
Subject: [PATCH 02/26] Fix bash segfault in monitor_slurm_job.sh from
 fractional read timeout

read -t 0.1 (sub-second timeout) in a loop with process substitution
file descriptors triggers a bash internal error (unwind_frame_run:
read_builtin: frame not found) leading to a segfault. Use integer
timeout (read -t 1) instead.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/scripts/monitor_slurm_job.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index 27472e01ef..232a894f8a 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -64,7 +64,7 @@ while true; do
   # Try to read from tail output (non-blocking via timeout)
   # Read multiple lines if available to avoid falling behind
   lines_read=0
-  while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
+  while IFS= read -r -t 1 line <&3 2>/dev/null; do
     echo "$line"
     lines_read=$((lines_read + 1))
     last_heartbeat=$(date +%s)
@@ -115,7 +115,7 @@ done
 # Drain any remaining output from tail after job completes
 echo "Draining remaining output..."
 drain_count=0
-while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
+while IFS= read -r -t 1 line <&3 2>/dev/null; do
   echo "$line"
   drain_count=$((drain_count + 1))
   # Safety limit to avoid infinite loop

From a59db02c83acbab62874c75c978b8ddb033d7323 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Feb 2026 13:56:50 -0500
Subject: [PATCH 03/26] Restore pull_request_review trigger for benchmark
 workflow

PR #1124 changed bench.yml to use workflow_run (triggered after Test
Suite completes), which broke the approve-to-run flow for fork PRs.
Revert to the original pull_request + pull_request_review triggers
while keeping improvements (frontier_amd matrix, concurrency group,
timeout, run_parallel_benchmarks.sh).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 65 ++++---------------------------------
 1 file changed, 7 insertions(+), 58 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 6279f5f578..fd240b7a11 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -1,32 +1,24 @@
 name: 'Benchmark'
 
 on:
-  # Trigger when Test Suite completes (no polling needed)
-  workflow_run:
-    workflows: ["Test Suite"]
-    types: [completed]
+  pull_request:
+  pull_request_review:
+    types: [submitted]
   workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
   file-changes:
     name: Detect File Changes
-    # Only run if Test Suite passed (or manual dispatch)
-    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
     runs-on: 'ubuntu-latest'
     outputs:
       checkall: ${{ steps.changes.outputs.checkall }}
-      pr_number: ${{ steps.pr-info.outputs.pr_number }}
-      pr_approved: ${{ steps.pr-info.outputs.approved }}
-      pr_author: ${{ steps.pr-info.outputs.author }}
     steps:
       - name: Clone
         uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
 
       - name: Detect Changes
         uses: dorny/paths-filter@v3
@@ -34,52 +26,10 @@ jobs:
         with:
           filters: ".github/file-filter.yml"
 
-      - name: Get PR Info
-        id: pr-info
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-            echo "pr_number=" >> $GITHUB_OUTPUT
-            echo "approved=true" >> $GITHUB_OUTPUT
-            echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
-          else
-            # Get PR number from workflow_run
-            PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
-            if [ -n "$PR_NUMBER" ]; then
-              echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
-
-              # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
-              PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
-              echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
-
-              # Check if PR is approved
-              APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
-                --jq '[.[] | select(.state == "APPROVED")] | length')
-              if [ "$APPROVED" -gt 0 ]; then
-                echo "approved=true" >> $GITHUB_OUTPUT
-              else
-                echo "approved=false" >> $GITHUB_OUTPUT
-              fi
-            else
-              echo "pr_number=" >> $GITHUB_OUTPUT
-              echo "approved=false" >> $GITHUB_OUTPUT
-              echo "author=" >> $GITHUB_OUTPUT
-            fi
-          fi
-
   self:
     name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
-    if: >
-      github.repository == 'MFlowCode/MFC' &&
-      needs.file-changes.outputs.checkall == 'true' &&
-      (
-        github.event_name == 'workflow_dispatch' ||
-        needs.file-changes.outputs.pr_approved == 'true' ||
-        needs.file-changes.outputs.pr_author == 'sbryngelson' ||
-        needs.file-changes.outputs.pr_author == 'wilfonba'
-      )
-    needs: [file-changes]
+    if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }}
+    needs: file-changes
     strategy:
       fail-fast: false
       matrix:
@@ -143,7 +93,6 @@ jobs:
       - name: Clone - PR
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
           path: pr
 
       - name: Clone - Master
@@ -155,7 +104,7 @@ jobs:
 
       - name: Setup & Build
         if: matrix.build_script != ''
-        run: | 
+        run: |
           (cd pr     && ${{ matrix.build_script }}) &
           (cd master && ${{ matrix.build_script }}) &
           wait %1 && wait %2

From 2efc61e1eb98a8ea1287275c407cff5539a153f1 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Feb 2026 17:28:56 -0500
Subject: [PATCH 04/26] Auto-retry sporadic test failures in CI

Write failed test UUIDs to tests/failed_uuids.txt after a test run.
In CI, if 1-5 tests fail, automatically re-run just those tests.
If 6+ fail, treat it as a real issue and fail immediately.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 19 +++++++++++++++++--
 toolchain/mfc/test/test.py |  9 +++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0be51076ec..3a5a0e33d7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -134,8 +134,23 @@ jobs:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
 
       - name: Test
-        run:  |
-          /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
+        run: |
+          /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || true
+
+          # Retry only if a small number of tests failed (sporadic failures)
+          if [ -f tests/failed_uuids.txt ]; then
+            NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
+            if [ "$NUM_FAILED" -le 5 ]; then
+              FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
+              echo ""
+              echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
+              echo ""
+              /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL
+            else
+              echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
+              exit 1
+            fi
+          fi
         env:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
           TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 31a3771cb9..d6dce92436 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -206,6 +206,15 @@ def test():
     # Build the summary report
     _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases)
 
+    # Write failed UUIDs to file for CI retry logic
+    failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
+    if failed_tests:
+        with open(failed_uuids_path, "w") as f:
+            for test_info in failed_tests:
+                f.write(test_info['uuid'] + "\n")
+    elif os.path.exists(failed_uuids_path):
+        os.remove(failed_uuids_path)
+
     exit(nFAIL)
 
 

From 0658bd348512de9f051b8ca4c7adbbb9a19f576b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Feb 2026 17:40:49 -0500
Subject: [PATCH 05/26] Preserve exit code for catastrophic test failures

Don't mask non-zero exit codes when tests crash before writing
failed_uuids.txt. Only suppress the exit code when the file exists
(meaning the test framework ran to completion and we can retry).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3a5a0e33d7..eec9d19fd0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -135,7 +135,8 @@ jobs:
 
       - name: Test
         run: |
-          /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || true
+          TEST_EXIT=0
+          /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?
 
           # Retry only if a small number of tests failed (sporadic failures)
           if [ -f tests/failed_uuids.txt ]; then
@@ -150,6 +151,8 @@ jobs:
               echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
               exit 1
             fi
+          elif [ "$TEST_EXIT" -ne 0 ]; then
+            exit $TEST_EXIT
           fi
         env:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}

From c6b6f8134409a0f99a375327752e4a5eee0c834d Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Feb 2026 21:48:39 -0500
Subject: [PATCH 06/26] Harden SLURM monitor: robust state checks, orphan
 cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace squeue exit-code polling with get_job_state() that parses
the actual state string (squeue + sacct fallback). Never give up on
UNKNOWN state — CI timeout is the backstop. Cancel orphaned SLURM
jobs on abnormal monitor exit. Include job state in heartbeats.

Incorporates changes from PR #1140.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/scripts/monitor_slurm_job.sh | 138 +++++++++++++++++----------
 1 file changed, 85 insertions(+), 53 deletions(-)

diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index 232a894f8a..408d205aab 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -4,11 +4,17 @@
 
 set -euo pipefail
 
-# Cleanup handler to prevent orphaned tail processes
+# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
 cleanup() {
   if [ -n "${tail_pid:-}" ]; then
     kill "${tail_pid}" 2>/dev/null || true
   fi
+  # Cancel the SLURM job if the monitor is exiting due to an error
+  # (e.g., the CI runner is being killed). Don't cancel on success.
+  if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
+    echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
+    scancel "$job_id" 2>/dev/null || true
+  fi
 }
 trap cleanup EXIT
 
@@ -23,30 +29,78 @@ output_file="$2"
 echo "Submitted batch job $job_id"
 echo "Monitoring output file: $output_file"
 
-# Wait for file to appear with retry logic for transient squeue failures
+# Robustly check SLURM job state using squeue with sacct fallback.
+# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
+# or "UNKNOWN" if both commands fail.
+get_job_state() {
+  local jid="$1"
+  local state
+
+  # Try squeue first (fast, works for active jobs)
+  state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ')
+  if [ -n "$state" ]; then
+    echo "$state"
+    return
+  fi
+
+  # Fallback to sacct (works for completed/historical jobs)
+  if command -v sacct >/dev/null 2>&1; then
+    state=$(sacct -j "$jid" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
+    if [ -n "$state" ]; then
+      echo "$state"
+      return
+    fi
+  fi
+
+  echo "UNKNOWN"
+}
+
+# Terminal-state check (job is done, for better or worse). CANCELLED* also matches sacct's "CANCELLED by <uid>" and truncated "CANCELLED+".
+is_terminal_state() {
+  case "$1" in
+    COMPLETED|FAILED|CANCELLED*|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE)
+      return 0 ;;
+    *)
+      return 1 ;;
+  esac
+}
+
+# Wait for file to appear, using robust state checking.
+# Never give up due to transient squeue/sacct failures — the CI job timeout
+# is the ultimate backstop.
 echo "Waiting for job to start..."
-squeue_retries=0
-max_squeue_retries=5
+unknown_count=0
 while [ ! -f "$output_file" ]; do
-  # Check if job is still queued/running
-  if squeue -j "$job_id" &>/dev/null; then
-    squeue_retries=0  # Reset on success
-    sleep 5
-  else
-    squeue_retries=$((squeue_retries + 1))
-    if [ $squeue_retries -ge $max_squeue_retries ]; then
-      # Job not in queue and output file doesn't exist
-      if [ ! -f "$output_file" ]; then
-        echo "ERROR: Job $job_id not in queue and output file not created"
+  state=$(get_job_state "$job_id")
+
+  case "$state" in
+    PENDING|CONFIGURING)
+      unknown_count=0
+      sleep 5
+      ;;
+    RUNNING|COMPLETING)
+      unknown_count=0
+      # Job is running but output file not yet visible (NFS delay)
+      sleep 2
+      ;;
+    UNKNOWN)
+      unknown_count=$((unknown_count + 1))
+      # Only print warning periodically to avoid log spam
+      if [ $((unknown_count % 12)) -eq 1 ]; then
+        echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
+      fi
+      sleep 5
+      ;;
+    *)
+      # Terminal state — job finished without creating output
+      if is_terminal_state "$state"; then
+        echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
         exit 1
       fi
-      break
-    fi
-    # Exponential backoff
-    sleep_time=$((2 ** squeue_retries))
-    echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
-    sleep $sleep_time
-  fi
+      # Unrecognized state, keep waiting
+      sleep 5
+      ;;
+  esac
 done
 
 echo "=== Streaming output for job $job_id ==="
@@ -57,7 +111,6 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
 tail_pid=$!
 
 # Monitor job status and stream output simultaneously
-squeue_failures=0
 last_heartbeat=$(date +%s)
 
 while true; do
@@ -73,41 +126,22 @@ while true; do
       break
     fi
   done
-  
+
   # Check job status
   current_time=$(date +%s)
-  if ! squeue -j "$job_id" &>/dev/null; then
-    squeue_failures=$((squeue_failures + 1))
-    # Check if job actually completed using sacct (if available)
-    if [ $squeue_failures -ge 3 ]; then
-      if command -v sacct >/dev/null 2>&1; then
-        state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
-        # Consider job done only if it reached a terminal state
-        case "$state" in
-          COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
-            echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
-            break
-            ;;
-          *)
-            # treat as transient failure, reset failures and continue polling
-            squeue_failures=0
-            ;;
-        esac
-      else
-        # No sacct: assume job completed after 3 failures
-        echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
-        break
-      fi
-    fi
+  state=$(get_job_state "$job_id")
+
+  if is_terminal_state "$state"; then
+    echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
+    break
   else
-    squeue_failures=0
     # Print heartbeat if no output for 60 seconds
     if [ $((current_time - last_heartbeat)) -ge 60 ]; then
-      echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
+      echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
       last_heartbeat=$current_time
     fi
   fi
-  
+
   # Sleep briefly between status checks
   sleep 1
 done
@@ -128,6 +162,7 @@ done
 # Close the file descriptor and kill tail
 exec 3<&-
 kill "${tail_pid}" 2>/dev/null || true
+tail_pid=""
 
 # Wait for output file to finish growing (stabilize) before stopping tail
 if [ -f "$output_file" ]; then
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
   done
 fi
 
-# Stop tailing (trap will also handle this on exit)
-kill "${tail_pid}" 2>/dev/null || true
-
 echo ""
 echo "=== Final output ==="
 cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
   exit 1
 fi
 
+monitor_success=1
 echo "Job $job_id completed successfully"
 exit 0
-

From a82959e1e793bfba5983cad3f0c4c84c85da795c Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Feb 2026 22:26:38 -0500
Subject: [PATCH 07/26] Use parsable sacct flags for robust state parsing

Use -n -X -P flags with sacct: -X restricts to job allocation (not
steps), -P gives pipe-delimited output for reliable parsing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/scripts/monitor_slurm_job.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index 408d205aab..d9f2237032 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -45,7 +45,7 @@ get_job_state() {
 
   # Fallback to sacct (works for completed/historical jobs)
   if command -v sacct >/dev/null 2>&1; then
-    state=$(sacct -j "$jid" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
+    state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1)
     if [ -n "$state" ]; then
       echo "$state"
       return

From 80229694a2283a5a5b9eac5ad5c5ef123934c669 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Feb 2026 22:49:45 -0500
Subject: [PATCH 08/26] Guard squeue/sacct pipelines against set -euo pipefail

With pipefail, a transient squeue failure would exit the script
instead of falling through to return UNKNOWN. Add || true to both
pipelines. Also fix stale comment about tail stopping.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/scripts/monitor_slurm_job.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index d9f2237032..4981e5e607 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -37,7 +37,7 @@ get_job_state() {
   local state
 
   # Try squeue first (fast, works for active jobs)
-  state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ')
+  state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
   if [ -n "$state" ]; then
     echo "$state"
     return
@@ -45,7 +45,7 @@ get_job_state() {
 
   # Fallback to sacct (works for completed/historical jobs)
   if command -v sacct >/dev/null 2>&1; then
-    state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1)
+    state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
     if [ -n "$state" ]; then
       echo "$state"
       return
@@ -164,7 +164,7 @@ exec 3<&-
 kill "${tail_pid}" 2>/dev/null || true
 tail_pid=""
 
-# Wait for output file to finish growing (stabilize) before stopping tail
+# Wait for output file to stabilize (NFS flush) before final read
 if [ -f "$output_file" ]; then
   last_size=-1
   same_count=0

From 88d19ce4b562f14c9abf832c6cae19e4fc0851ea Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 13 Feb 2026 09:29:42 -0500
Subject: [PATCH 09/26] Retry delete_directory on Lustre ENOTEMPTY race

shutil.rmtree can fail with "Directory not empty" on networked
filesystems (Lustre) due to metadata propagation delays. Attempt the
removal up to 5 times, sleeping 1s between attempts, before raising.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolchain/mfc/common.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/toolchain/mfc/common.py b/toolchain/mfc/common.py
index ce02e8251c..e56c6a9eb4 100644
--- a/toolchain/mfc/common.py
+++ b/toolchain/mfc/common.py
@@ -1,4 +1,4 @@
-import os, yaml, typing, shutil, subprocess, logging
+import os, yaml, typing, shutil, subprocess, logging, time
 
 from os.path import join, abspath, normpath, dirname, realpath
 
@@ -122,8 +122,16 @@ def create_directory(dirpath: str) -> None:
 
 
 def delete_directory(dirpath: str) -> None:
-    if os.path.isdir(dirpath):
-        shutil.rmtree(dirpath)
+    for attempt in range(5):
+        if not os.path.isdir(dirpath):
+            return
+        try:
+            shutil.rmtree(dirpath)
+            return
+        except OSError:
+            if attempt == 4:
+                raise
+            time.sleep(1)
 
 
 def get_program_output(arguments: typing.List[str] = None, cwd=None):

From 05d28f37bb01c099d40b365ab2857c169e6954ab Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 13 Feb 2026 09:44:16 -0500
Subject: [PATCH 10/26] Remove stale failed_uuids.txt before test run

On self-hosted runners the workspace persists between runs, so a
leftover file could trigger spurious retries.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index eec9d19fd0..21e52d5a5e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -135,6 +135,7 @@ jobs:
 
       - name: Test
         run: |
+          rm -f tests/failed_uuids.txt
           TEST_EXIT=0
           /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?
 

From 9eed0c65d2d114cc7aa8134fc3d3cf809b18c0b5 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 13 Feb 2026 10:08:33 -0500
Subject: [PATCH 11/26] Split benchmark concurrency group by event type

Bot review events (pull_request_review) were racing against and
cancelling legitimate push-triggered (pull_request) benchmark runs
via the shared concurrency group.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index fd240b7a11..53efac21ed 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -7,7 +7,7 @@ on:
   workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: true
 
 jobs:

From edefc015be0f5b707b596a0718467100e6640ae5 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 14 Feb 2026 20:26:51 -0500
Subject: [PATCH 12/26] Revert Phoenix test jobs to multi-partition GPU
 scheduling

Keep H200 targeting only for benchmarks; tests should run on any
available GPU partition for faster scheduling.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/submit.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 5747c839f0..06a03e465a 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -23,8 +23,9 @@ sbatch_cpu_opts="\
 "
 
 sbatch_gpu_opts="\
-#SBATCH --gres=gpu:H200:2
-#SBATCH --ntasks-per-node=8       # Number of cores per node required\
+#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
+#SBATCH --ntasks-per-node=4       # Number of cores per node required
+#SBATCH -G2\
 "
 
 if [ "$2" = "cpu" ]; then

From 2e15ab646d00a521e8835734b1cf74f3dabb5fb9 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 14 Feb 2026 20:35:46 -0500
Subject: [PATCH 13/26] Fix doc lint for generated pages and hyphenated page
 IDs

Add build-time generated page IDs (parameters, cli-reference, examples,
case_constraints) to the known set, and allow hyphens in @page/@ref
ID patterns.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolchain/mfc/lint_docs.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/toolchain/mfc/lint_docs.py b/toolchain/mfc/lint_docs.py
index 783ac22c66..9c4be131c1 100644
--- a/toolchain/mfc/lint_docs.py
+++ b/toolchain/mfc/lint_docs.py
@@ -53,8 +53,8 @@
     "docs/documentation/case.md": CASE_MD_SKIP,
 }
 
-# Match @ref page_id patterns
-REF_RE = re.compile(r"@ref\s+(\w+)")
+# Match @ref page_id patterns (allow hyphens in page IDs like cli-reference)
+REF_RE = re.compile(r"@ref\s+([\w-]+)")
 
 
 def check_docs(repo_root: Path) -> list[str]:
@@ -322,10 +322,13 @@ def check_page_refs(repo_root: Path) -> list[str]:
         return []
 
     # Collect all @page identifiers
-    page_ids = {"citelist"}  # Doxygen built-in
+    # Include Doxygen built-ins and pages generated at build time by
+    # gen_parameters.sh, gen_cli_reference.sh, examples.sh, and
+    # gen_case_constraints_docs.py.
+    page_ids = {"citelist", "parameters", "cli-reference", "examples", "case_constraints"}
     for md_file in doc_dir.glob("*.md"):
         text = md_file.read_text(encoding="utf-8")
-        m = re.search(r"^\s*@page\s+(\w+)", text, flags=re.MULTILINE)
+        m = re.search(r"^\s*@page\s+([\w-]+)", text, flags=re.MULTILINE)
         if m:
             page_ids.add(m.group(1))
 

From dfc524ced81caf1b312a47361c14e062e63cd98c Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 14 Feb 2026 23:05:49 -0500
Subject: [PATCH 14/26] Add Lustre-safe workspace cleanup for self-hosted
 runners

Disable actions/checkout's built-in git clean (which fails on Lustre
with ESTALE/ENOTEMPTY errors) and add a retry-based rm -rf step before
checkout instead.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 11 +++++++++++
 .github/workflows/test.yml  | 11 +++++++++++
 2 files changed, 22 insertions(+)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 53efac21ed..9a45201376 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -90,14 +90,25 @@ jobs:
       ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
       ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
+      - name: Clean workspace (Lustre-safe)
+        run: |
+          for i in 1 2 3 4 5; do
+            rm -rf "$GITHUB_WORKSPACE"/pr "$GITHUB_WORKSPACE"/master 2>/dev/null && break
+            echo "Clean attempt $i failed, retrying in 2s..."
+            sleep 2
+          done
+          true
+
       - name: Clone - PR
         uses: actions/checkout@v4
         with:
+          clean: false
           path: pr
 
       - name: Clone - Master
         uses: actions/checkout@v4
         with:
+          clean: false
           repository: MFlowCode/MFC
           ref: master
           path: master
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 21e52d5a5e..4709ceb84c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -219,8 +219,19 @@ jobs:
       ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
       ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
+      - name: Clean workspace (Lustre-safe)
+        run: |
+          for i in 1 2 3 4 5; do
+            rm -rf "$GITHUB_WORKSPACE"/{.,}* 2>/dev/null && break
+            echo "Clean attempt $i failed, retrying in 2s..."
+            sleep 2
+          done
+          true
+
       - name: Clone
         uses: actions/checkout@v4
+        with:
+          clean: false
 
       - name: Build
         if:   matrix.cluster != 'phoenix'

From ece195155a142e1a3560e9d973434d35df47efb3 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 15 Feb 2026 11:03:25 -0500
Subject: [PATCH 15/26] Revert Phoenix benchmark jobs to L40S GPU scheduling

The H200 switch needs to land on master first so both PR and master
benchmark builds use the same node type. Split into a separate PR.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/submit-bench.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh
index fc28b3046b..7ae85e66fe 100644
--- a/.github/workflows/phoenix/submit-bench.sh
+++ b/.github/workflows/phoenix/submit-bench.sh
@@ -20,8 +20,9 @@ sbatch_cpu_opts="\
 "
 
 sbatch_gpu_opts="\
-#SBATCH --gres=gpu:H200:2
-#SBATCH --ntasks-per-node=8       # Number of cores per node required\
+#SBATCH -CL40S
+#SBATCH --ntasks-per-node=4       # Number of cores per node required
+#SBATCH -G2\
 "
 
 if [ "$2" = "cpu" ]; then

From a553a7573ad153793ac1a31bab13cc16cc990d46 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 15 Feb 2026 16:02:08 -0500
Subject: [PATCH 16/26] Improve Lustre-safe workspace cleanup with dotglob and
 nullglob

Use shopt dotglob/nullglob for cleaner glob expansion instead of
manual dotfile patterns. Keep retry loop for Lustre ESTALE resilience.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 5 +++--
 .github/workflows/test.yml  | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 9a45201376..a3e19ca495 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -91,13 +91,14 @@ jobs:
       ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
       - name: Clean workspace (Lustre-safe)
+        shell: bash
         run: |
+          shopt -s dotglob nullglob
           for i in 1 2 3 4 5; do
-            rm -rf "$GITHUB_WORKSPACE"/pr "$GITHUB_WORKSPACE"/master 2>/dev/null && break
+            rm -rf -- "${GITHUB_WORKSPACE:?}/"* 2>/dev/null && break
             echo "Clean attempt $i failed, retrying in 2s..."
             sleep 2
           done
-          true
 
       - name: Clone - PR
         uses: actions/checkout@v4
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4709ceb84c..9d0e1a5ec7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -220,13 +220,14 @@ jobs:
       ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
       - name: Clean workspace (Lustre-safe)
+        shell: bash
         run: |
+          shopt -s dotglob nullglob
           for i in 1 2 3 4 5; do
-            rm -rf "$GITHUB_WORKSPACE"/{.,}* 2>/dev/null && break
+            rm -rf -- "${GITHUB_WORKSPACE:?}/"* 2>/dev/null && break
             echo "Clean attempt $i failed, retrying in 2s..."
             sleep 2
           done
-          true
 
       - name: Clone
         uses: actions/checkout@v4

From a1498864ad6e77f96ea2fe9e9250ec552fd359d7 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 15 Feb 2026 16:09:17 -0500
Subject: [PATCH 17/26] Auto-requeue SLURM jobs on preemption

Add --requeue to Phoenix sbatch scripts so preempted embers-QOS jobs
are automatically rescheduled. Remove PREEMPTED from the monitor's
terminal state list so it keeps waiting through the requeue cycle.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/scripts/monitor_slurm_job.sh      | 2 +-
 .github/workflows/phoenix/submit-bench.sh | 1 +
 .github/workflows/phoenix/submit.sh       | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index 4981e5e607..16717551cd 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -58,7 +58,7 @@ get_job_state() {
 # Check if a state is terminal (job is done, for better or worse)
 is_terminal_state() {
   case "$1" in
-    COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE)
+    COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE)
       return 0 ;;
     *)
       return 1 ;;
diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh
index 7ae85e66fe..a3830f5050 100644
--- a/.github/workflows/phoenix/submit-bench.sh
+++ b/.github/workflows/phoenix/submit-bench.sh
@@ -44,6 +44,7 @@ sbatch <<EOT
 $sbatch_device_opts
 #SBATCH -t 04:00:00                # Duration of the job (Ex: 15 mins)
 #SBATCH -q embers                  # QOS Name
+#SBATCH --requeue                  # Auto-requeue on preemption
 #SBATCH -o$job_slug.out            # Combined output and error messages file
 
 set -e
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 06a03e465a..874f5afa44 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -48,6 +48,7 @@ submit_output=$(sbatch <<EOT
 $sbatch_device_opts
 #SBATCH -t 03:00:00                # Duration of the job (Ex: 15 mins)
 #SBATCH -q embers                  # QOS Name
+#SBATCH --requeue                  # Auto-requeue on preemption
 #SBATCH -o$output_file             # Combined output and error messages file
 
 set -e

From 273cced93676b5fb6e251be5b63ec97e2dc568eb Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 15 Feb 2026 19:19:43 -0500
Subject: [PATCH 18/26] Remove aggressive workspace cleanup from test jobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The rm -rf * was destroying the build cache, causing CMake to rebuild
from scratch and hit Lustre ioctl errors. With clean: false on
checkout, git clean is already disabled — no pre-cleanup needed.
Keep full cleanup only in bench.yml where pr/master are fresh clones.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index c1e383f0e0..33d104fa73 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -222,16 +222,6 @@ jobs:
       ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
       ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
-      - name: Clean workspace (Lustre-safe)
-        shell: bash
-        run: |
-          shopt -s dotglob nullglob
-          for i in 1 2 3 4 5; do
-            rm -rf -- "${GITHUB_WORKSPACE:?}/"* 2>/dev/null && break
-            echo "Clean attempt $i failed, retrying in 2s..."
-            sleep 2
-          done
-
       - name: Clone
         uses: actions/checkout@v4
         with:

From c2e6543f460b397efff538dacc6739f51d16aa7e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 15 Feb 2026 19:37:36 -0500
Subject: [PATCH 19/26] Propagate exit code from test retry command

Without || exit $?, a failed retry would silently exit 0.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 33d104fa73..ac10cab0d5 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -150,7 +150,7 @@ jobs:
               echo ""
               echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
               echo ""
-              /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL
+              /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL || exit $?
             else
               echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
               exit 1

From c0b1cd15285a3acb1e1aeaf715a0f4bed62770a2 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 15 Feb 2026 19:51:56 -0500
Subject: [PATCH 20/26] Restore default checkout clean for test jobs; tune PR
 reviewer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Revert test.yml to clean: true (default) — the corrupted build cache
from the ioctl failure was causing 100% test failures. The Lustre-safe
cleanup is only needed for bench.yml where pr/master are separate trees.

Also tune qodo PR reviewer: reduce max findings to 5, lower suggestion
depth to medium, and add instructions to focus on correctness over style
for CI scripts.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 2 --
 .pr_agent.toml             | 6 +++++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ac10cab0d5..0864fe345c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -224,8 +224,6 @@ jobs:
     steps:
       - name: Clone
         uses: actions/checkout@v4
-        with:
-          clean: false
 
       - name: Build
         if:   matrix.cluster != 'phoenix'
diff --git a/.pr_agent.toml b/.pr_agent.toml
index 9411d1cfe7..f87cd95910 100644
--- a/.pr_agent.toml
+++ b/.pr_agent.toml
@@ -9,7 +9,7 @@ handle_push_trigger = true
 push_commands = ["/improve"]
 
 [pr_reviewer]                # (all fields optional)
-num_max_findings        = 10  # how many items to surface
+num_max_findings        = 5   # how many items to surface
 require_tests_review    = true
 extra_instructions = """
 Project context and review priorities: .github/copilot-instructions.md
@@ -26,8 +26,12 @@ constraints for new parameters, and compiler portability across all four
 supported compilers.
 Python toolchain requires Python 3.10+; do not suggest __future__ imports
 or other backwards-compatibility shims.
+For CI/shell scripts, focus on correctness bugs only — not style, atomic
+writes, or FIFO alternatives.
+Do not suggest changes to code that was not modified in the PR.
 """
 
 [pr_code_suggestions]
+suggestions_depth           = "medium"
 commitable_code_suggestions = true
 apply_suggestions_checkbox  = true

From 7a764d5769dfa6f70908267c39a09760eaa8ccda Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 16 Feb 2026 09:48:33 -0500
Subject: [PATCH 21/26] Remove aggressive workspace cleanup from bench jobs

The Lustre-safe cleanup step was wiping the build cache (pr/build/,
master/build/), forcing full rebuilds every run. This added ~32 min of
build time and pushed NVHPC gpu-omp benchmarks past the 4h SLURM limit.
Restore default checkout behavior to preserve build cache across runs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index a3e19ca495..53efac21ed 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -90,26 +90,14 @@ jobs:
       ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
       ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
-      - name: Clean workspace (Lustre-safe)
-        shell: bash
-        run: |
-          shopt -s dotglob nullglob
-          for i in 1 2 3 4 5; do
-            rm -rf -- "${GITHUB_WORKSPACE:?}/"* 2>/dev/null && break
-            echo "Clean attempt $i failed, retrying in 2s..."
-            sleep 2
-          done
-
       - name: Clone - PR
         uses: actions/checkout@v4
         with:
-          clean: false
           path: pr
 
       - name: Clone - Master
         uses: actions/checkout@v4
         with:
-          clean: false
           repository: MFlowCode/MFC
           ref: master
           path: master

From b5dfa1fbeaee3dda3b65f32193f90a3f6ab7d5a8 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 16 Feb 2026 10:59:42 -0500
Subject: [PATCH 22/26] Add test sharding for Frontier CI; switch to
 batch/hackathon partition

Split Frontier GPU test configs into 2 shards (~75 min each) so they
fit within the batch partition's 2h wall time limit. This allows all
Frontier SLURM jobs to run concurrently instead of serially on the
extended partition (which has a 1-job-per-user limit), reducing total
CI wall clock from ~4.5h to ~2h.

Changes:
- Add --shard CLI argument (e.g., --shard 1/2) with modulo-based
  round-robin distribution across shards
- Switch Frontier submit scripts from extended to batch/hackathon
  (CFD154 account, 1h59m wall time)
- Shard the 3 Frontier GPU matrix entries into 6 (2 shards each)
- CPU entries remain unsharded

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/frontier/submit.sh     |  8 +++---
 .github/workflows/frontier/test.sh       |  7 ++++-
 .github/workflows/frontier_amd/submit.sh |  8 +++---
 .github/workflows/frontier_amd/test.sh   |  7 ++++-
 .github/workflows/test.yml               | 34 +++++++++++++++++++++---
 toolchain/mfc/cli/commands.py            |  6 +++++
 toolchain/mfc/test/test.py               |  5 ++++
 7 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh
index d5b416c65a..4c3e0e3e27 100644
--- a/.github/workflows/frontier/submit.sh
+++ b/.github/workflows/frontier/submit.sh
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
 submit_output=$(sbatch <<EOT
 #!/bin/bash
 #SBATCH -J MFC-$job_slug            # Job name
-#SBATCH -A ENG160                  # charge account
+#SBATCH -A CFD154                  # charge account
 #SBATCH -N 1                       # Number of nodes required
 $sbatch_device_opts
-#SBATCH -t 05:59:00                # Duration of the job (Ex: 15 mins)
+#SBATCH -t 01:59:00                # Duration of the job
 #SBATCH -o$output_file             # Combined output and error messages file
-#SBATCH -p extended                # Extended partition for shorter queues
+#SBATCH -p batch                   # Batch partition (concurrent jobs)
+#SBATCH --qos=hackathon            # Hackathon QOS for batch access
 
 set -e
 set -x
@@ -50,6 +51,7 @@ echo "Running in $(pwd):"
 job_slug="$job_slug"
 job_device="$2"
 job_interface="$3"
+job_shard="$4"
 
 . ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
 
diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh
index 17fbbaf8e5..ad109c6478 100644
--- a/.github/workflows/frontier/test.sh
+++ b/.github/workflows/frontier/test.sh
@@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+shard_opts=""
+if [ -n "$job_shard" ]; then
+    shard_opts="--shard $job_shard"
+fi
+
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
+    ./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier
 else
     ./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
 fi
diff --git a/.github/workflows/frontier_amd/submit.sh b/.github/workflows/frontier_amd/submit.sh
index 551e0056b8..df73db5807 100644
--- a/.github/workflows/frontier_amd/submit.sh
+++ b/.github/workflows/frontier_amd/submit.sh
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
 submit_output=$(sbatch <<EOT
 #!/bin/bash
 #SBATCH -J MFC-$job_slug            # Job name
-#SBATCH -A ENG160                  # charge account
+#SBATCH -A CFD154                  # charge account
 #SBATCH -N 1                       # Number of nodes required
 $sbatch_device_opts
-#SBATCH -t 05:59:00                # Duration of the job (Ex: 15 mins)
+#SBATCH -t 01:59:00                # Duration of the job
 #SBATCH -o$output_file             # Combined output and error messages file
-#SBATCH -p extended                # Extended partition for shorter queues
+#SBATCH -p batch                   # Batch partition (concurrent jobs)
+#SBATCH --qos=hackathon            # Hackathon QOS for batch access
 
 set -e
 set -x
@@ -50,6 +51,7 @@ echo "Running in $(pwd):"
 job_slug="$job_slug"
 job_device="$2"
 job_interface="$3"
+job_shard="$4"
 
 . ./mfc.sh load -c famd -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
 
diff --git a/.github/workflows/frontier_amd/test.sh b/.github/workflows/frontier_amd/test.sh
index ff65aa2b0e..c051144b2d 100644
--- a/.github/workflows/frontier_amd/test.sh
+++ b/.github/workflows/frontier_amd/test.sh
@@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+shard_opts=""
+if [ -n "$job_shard" ]; then
+    shard_opts="--shard $job_shard"
+fi
+
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts -- -c frontier_amd
+    ./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier_amd
 else
     ./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier_amd
 fi
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0864fe345c..4a4e70fd45 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -163,7 +163,7 @@ jobs:
           TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
 
   self:
-    name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
+    name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
     if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
     needs: [lint-gate, file-changes]
     continue-on-error: false
@@ -177,43 +177,69 @@ jobs:
             cluster_name: 'Georgia Tech | Phoenix'
             device: 'gpu'
             interface: 'acc'
+            shard: ''
           - runner:       'gt'
             cluster:      'phoenix'
             cluster_name: 'Georgia Tech | Phoenix'
             device: 'gpu'
             interface: 'omp'
+            shard: ''
           - runner:       'gt'
             cluster:      'phoenix'
             cluster_name: 'Georgia Tech | Phoenix'
             device: 'cpu'
             interface: 'none'
-          # Frontier (ORNL) — build on login node, test via SLURM
+            shard: ''
+          # Frontier (ORNL) — build on login node, GPU tests sharded for batch partition
           - runner:       'frontier'
             cluster:      'frontier'
             cluster_name: 'Oak Ridge | Frontier'
             device: 'gpu'
             interface: 'acc'
+            shard: '1/2'
+          - runner:       'frontier'
+            cluster:      'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'gpu'
+            interface: 'acc'
+            shard: '2/2'
           - runner:       'frontier'
             cluster:      'frontier'
             cluster_name: 'Oak Ridge | Frontier'
             device: 'gpu'
             interface: 'omp'
+            shard: '1/2'
+          - runner:       'frontier'
+            cluster:      'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'gpu'
+            interface: 'omp'
+            shard: '2/2'
           - runner:       'frontier'
             cluster:      'frontier'
             cluster_name: 'Oak Ridge | Frontier'
             device: 'cpu'
             interface: 'none'
-          # Frontier AMD — build on login node, test via SLURM
+            shard: ''
+          # Frontier AMD — build on login node, GPU tests sharded for batch partition
+          - runner:       'frontier'
+            cluster:      'frontier_amd'
+            cluster_name: 'Oak Ridge | Frontier (AMD)'
+            device: 'gpu'
+            interface: 'omp'
+            shard: '1/2'
           - runner:       'frontier'
             cluster:      'frontier_amd'
             cluster_name: 'Oak Ridge | Frontier (AMD)'
             device: 'gpu'
             interface: 'omp'
+            shard: '2/2'
           - runner:       'frontier'
             cluster:      'frontier_amd'
             cluster_name: 'Oak Ridge | Frontier (AMD)'
             device: 'cpu'
             interface: 'none'
+            shard: ''
     runs-on:
       group:  phoenix
       labels: ${{ matrix.runner }}
@@ -230,7 +256,7 @@ jobs:
         run:  bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
 
       - name: Test
-        run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}
+        run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}
 
       - name: Print Logs
         if:   always()
diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py
index 8ad8c4bd07..018e3cef83 100644
--- a/toolchain/mfc/cli/commands.py
+++ b/toolchain/mfc/cli/commands.py
@@ -452,6 +452,12 @@
             default=False,
             dest="dry_run",
         ),
+        Argument(
+            name="shard",
+            help="Run only shard i of n, given as 'i/n' (e.g., '1/2'); tests are assigned to shards round-robin.",
+            type=str,
+            default=None,
+        ),
     ],
     mutually_exclusive=[
         MutuallyExclusiveGroup(arguments=[
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index d6dce92436..52c143038a 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -99,6 +99,11 @@ def __filter(cases_) -> typing.List[TestCase]:
         skipped_cases += example_cases
         cases = [case for case in cases if case not in example_cases]
 
+    if ARG("shard") is not None:
+        shard_idx, shard_count = (int(x) for x in ARG("shard").split("/"))
+        skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1]
+        cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1]
+
     if ARG("percent") == 100:
         return cases, skipped_cases
 

From 475caa32563291798b776eef8da4434e86d3a98f Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 16 Feb 2026 11:19:58 -0500
Subject: [PATCH 23/26] Validate --shard argument format and bounds

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolchain/mfc/test/test.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 52c143038a..54e00186dd 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -100,7 +100,10 @@ def __filter(cases_) -> typing.List[TestCase]:
         cases = [case for case in cases if case not in example_cases]
 
     if ARG("shard") is not None:
-        shard_idx, shard_count = (int(x) for x in ARG("shard").split("/"))
+        parts = ARG("shard").split("/")  # isdigit() alone admits e.g. '²', which int() rejects
+        if len(parts) != 2 or not all(p.isascii() and p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]):
+            raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').")
+        shard_idx, shard_count = int(parts[0]), int(parts[1])
         skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1]
         cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1]
 

From ddd95ac58e0bd9b650310058549aadb94516672d Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 16 Feb 2026 14:48:35 -0500
Subject: [PATCH 24/26] Use nick-fields/retry for Frontier builds; reduce -j to
 4

Move build retry logic from shell scripts to GHA using nick-fields/retry
with 60s backoff between attempts. This gives better visibility into
retries and lets login node memory pressure subside between attempts.

Also reduce build parallelism from -j 8 to -j 4 to lower peak memory
on shared Frontier login nodes, and remove the outdated Node 16 version
overrides from self-hosted runner env.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml             | 15 +++++----
 .github/workflows/frontier/build.sh     | 43 ++++---------------------
 .github/workflows/frontier_amd/build.sh | 43 ++++---------------------
 .github/workflows/test.yml              |  8 +++--
 4 files changed, 27 insertions(+), 82 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 53efac21ed..bc91c2635c 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -86,9 +86,6 @@ jobs:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
     timeout-minutes: 480
-    env:
-      ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
-      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
       - name: Clone - PR
         uses: actions/checkout@v4
@@ -104,10 +101,14 @@ jobs:
 
       - name: Setup & Build
         if: matrix.build_script != ''
-        run: |
-          (cd pr     && ${{ matrix.build_script }}) &
-          (cd master && ${{ matrix.build_script }}) &
-          wait %1 && wait %2
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 3
+          retry_wait_seconds: 60
+          command: |
+            (cd pr     && ${{ matrix.build_script }}) &
+            (cd master && ${{ matrix.build_script }}) &
+            pr_rc=0; wait %1 || pr_rc=$?; wait %2 && [ "$pr_rc" -eq 0 ]  # reap both builds before a retry can start
 
       - name: Bench (Master v. PR)
         run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 18cddc96ca..cbfaf3eada 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -18,39 +18,10 @@ fi
 
 . ./mfc.sh load -c f -m g
 
-max_attempts=3
-attempt=1
-while [ $attempt -le $max_attempts ]; do
-    echo "Build attempt $attempt of $max_attempts..."
-    if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
-    else
-        if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
-            build_cmd_ok=true
-        else
-            build_cmd_ok=false
-        fi
-    fi
-
-    if [ "$build_cmd_ok" = true ]; then
-        echo "Build succeeded on attempt $attempt."
-        exit 0
-    fi
-
-    if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
-        sleep 30
-    fi
-    attempt=$((attempt + 1))
-done
-
-echo "Build failed after $max_attempts attempts."
-exit 1
+if [ "$run_bench" == "bench" ]; then
+    for dir in benchmarks/*/; do
+        ./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts
+    done
+else
+    ./mfc.sh test -v -a --dry-run --rdma-mpi -j 4 $build_opts
+fi
diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh
index 56c47d8ff4..9442a10d57 100644
--- a/.github/workflows/frontier_amd/build.sh
+++ b/.github/workflows/frontier_amd/build.sh
@@ -18,39 +18,10 @@ fi
 
 . ./mfc.sh load -c famd -m g
 
-max_attempts=3
-attempt=1
-while [ $attempt -le $max_attempts ]; do
-    echo "Build attempt $attempt of $max_attempts..."
-    if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
-    else
-        if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
-            build_cmd_ok=true
-        else
-            build_cmd_ok=false
-        fi
-    fi
-
-    if [ "$build_cmd_ok" = true ]; then
-        echo "Build succeeded on attempt $attempt."
-        exit 0
-    fi
-
-    if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
-        sleep 30
-    fi
-    attempt=$((attempt + 1))
-done
-
-echo "Build failed after $max_attempts attempts."
-exit 1
+if [ "$run_bench" == "bench" ]; then
+    for dir in benchmarks/*/; do
+        ./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts
+    done
+else
+    ./mfc.sh test -v -a --dry-run -j 4 $build_opts
+fi
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4a4e70fd45..d36bc1686a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -245,15 +245,17 @@ jobs:
       labels: ${{ matrix.runner }}
     env:
       NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
-      ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
-      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
       - name: Clone
         uses: actions/checkout@v4
 
       - name: Build
         if:   matrix.cluster != 'phoenix'
-        run:  bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 3
+          retry_wait_seconds: 60
+          command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
 
       - name: Test
         run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}

From 197813a97a7b1a68831de4bfb7b638e94bfb005b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 16 Feb 2026 14:56:11 -0500
Subject: [PATCH 25/26] Add set -e to Frontier build scripts for fail-fast
 behavior

Without set -e, the benchmark build loop could silently ignore failures
of earlier benchmarks if a later one succeeded, since only the last
command's exit code would propagate.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/frontier/build.sh     | 2 ++
 .github/workflows/frontier_amd/build.sh | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index cbfaf3eada..199eda213c 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
 
diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh
index 9442a10d57..60e396b54d 100644
--- a/.github/workflows/frontier_amd/build.sh
+++ b/.github/workflows/frontier_amd/build.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
 

From 73072bf22b6a56664a7012d59572e3ce5b437b8e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 16 Feb 2026 15:45:36 -0500
Subject: [PATCH 26/26] Add required timeout_minutes to nick-fields/retry steps

nick-fields/retry@v3 requires either timeout_minutes or
timeout_seconds. Set to 480 minutes to match the GHA job timeout.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 1 +
 .github/workflows/test.yml  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index bc91c2635c..eed7e002c7 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -105,6 +105,7 @@ jobs:
         with:
           max_attempts: 3
           retry_wait_seconds: 60
+          timeout_minutes: 480
           command: |
             (cd pr     && ${{ matrix.build_script }}) &
             (cd master && ${{ matrix.build_script }}) &
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d36bc1686a..eabb97b8fb 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -255,6 +255,7 @@ jobs:
         with:
           max_attempts: 3
           retry_wait_seconds: 60
+          timeout_minutes: 480
           command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
 
       - name: Test