diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index 27472e01ef..16717551cd 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -4,11 +4,17 @@
 set -euo pipefail
 
-# Cleanup handler to prevent orphaned tail processes
+# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
 cleanup() {
     if [ -n "${tail_pid:-}" ]; then
         kill "${tail_pid}" 2>/dev/null || true
     fi
+    # Cancel the SLURM job if the monitor is exiting due to an error
+    # (e.g., the CI runner is being killed). Don't cancel on success.
+    if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
+        echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
+        scancel "$job_id" 2>/dev/null || true
+    fi
 }
 trap cleanup EXIT
 
@@ -23,30 +29,78 @@ output_file="$2"
 echo "Submitted batch job $job_id"
 echo "Monitoring output file: $output_file"
 
-# Wait for file to appear with retry logic for transient squeue failures
+# Robustly check SLURM job state using squeue with sacct fallback.
+# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
+# or "UNKNOWN" if both commands fail.
+get_job_state() {
+    local jid="$1"
+    local state
+
+    # Try squeue first (fast, works for active jobs)
+    state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
+    if [ -n "$state" ]; then
+        echo "$state"
+        return
+    fi
+
+    # Fallback to sacct (works for completed/historical jobs)
+    if command -v sacct >/dev/null 2>&1; then
+        state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
+        if [ -n "$state" ]; then
+            echo "$state"
+            return
+        fi
+    fi
+
+    echo "UNKNOWN"
+}
+
+# Check if a state is terminal (job is done, for better or worse).
+# CANCELLED* also matches sacct variants such as "CANCELLED+" and "CANCELLED by <uid>".
+is_terminal_state() {
+    case "$1" in
+        COMPLETED|FAILED|CANCELLED*|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE)
+            return 0 ;;
+        *)
+            return 1 ;;
+    esac
+}
+
+# Wait for file to appear, using robust state checking.
+# Never give up due to transient squeue/sacct failures — the CI job timeout
+# is the ultimate backstop.
 echo "Waiting for job to start..."
-squeue_retries=0
-max_squeue_retries=5
+unknown_count=0
 while [ ! -f "$output_file" ]; do
-    # Check if job is still queued/running
-    if squeue -j "$job_id" &>/dev/null; then
-        squeue_retries=0  # Reset on success
-        sleep 5
-    else
-        squeue_retries=$((squeue_retries + 1))
-        if [ $squeue_retries -ge $max_squeue_retries ]; then
-            # Job not in queue and output file doesn't exist
-            if [ ! -f "$output_file" ]; then
-                echo "ERROR: Job $job_id not in queue and output file not created"
+    state=$(get_job_state "$job_id")
+
+    case "$state" in
+        PENDING|CONFIGURING)
+            unknown_count=0
+            sleep 5
+            ;;
+        RUNNING|COMPLETING)
+            unknown_count=0
+            # Job is running but output file not yet visible (NFS delay)
+            sleep 2
+            ;;
+        UNKNOWN)
+            unknown_count=$((unknown_count + 1))
+            # Only print warning periodically to avoid log spam
+            if [ $((unknown_count % 12)) -eq 1 ]; then
+                echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
+            fi
+            sleep 5
+            ;;
+        *)
+            # Terminal state — job finished without creating output
+            if is_terminal_state "$state"; then
+                echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
                 exit 1
             fi
-            break
-        fi
-        # Exponential backoff
-        sleep_time=$((2 ** squeue_retries))
-        echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
-        sleep $sleep_time
-    fi
+            # Unrecognized state, keep waiting
+            sleep 5
+            ;;
+    esac
 done
 
 echo "=== Streaming output for job $job_id ==="
@@ -57,14 +111,13 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
 tail_pid=$!
 
 # Monitor job status and stream output simultaneously
-squeue_failures=0
 last_heartbeat=$(date +%s)
 
 while true; do
     # Try to read from tail output (non-blocking via timeout)
    # Read multiple lines if available to avoid falling behind
     lines_read=0
-    while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
+    while IFS= read -r -t 1 line <&3 2>/dev/null; do
         echo "$line"
         lines_read=$((lines_read + 1))
         last_heartbeat=$(date +%s)
@@ -73,41 +126,22 @@ while true; do
             break
         fi
     done
-    
+
     # Check job status
     current_time=$(date +%s)
-    if ! squeue -j "$job_id" &>/dev/null; then
-        squeue_failures=$((squeue_failures + 1))
-        # Check if job actually completed using sacct (if available)
-        if [ $squeue_failures -ge 3 ]; then
-            if command -v sacct >/dev/null 2>&1; then
-                state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
-                # Consider job done only if it reached a terminal state
-                case "$state" in
-                    COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
-                        echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
-                        break
-                        ;;
-                    *)
-                        # treat as transient failure, reset failures and continue polling
-                        squeue_failures=0
-                        ;;
-                esac
-            else
-                # No sacct: assume job completed after 3 failures
-                echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
-                break
-            fi
-        fi
+    state=$(get_job_state "$job_id")
+
+    if is_terminal_state "$state"; then
+        echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
+        break
     else
-        squeue_failures=0
         # Print heartbeat if no output for 60 seconds
         if [ $((current_time - last_heartbeat)) -ge 60 ]; then
-            echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
+            echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
             last_heartbeat=$current_time
         fi
     fi
-    
+
    # Sleep briefly between status checks
     sleep 1
 done
@@ -115,7 +149,7 @@ done
 # Drain any remaining output from tail after job completes
 echo "Draining remaining output..."
 drain_count=0
-while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
+while IFS= read -r -t 1 line <&3 2>/dev/null; do
     echo "$line"
     drain_count=$((drain_count + 1))
     # Safety limit to avoid infinite loop
@@ -128,8 +162,9 @@ done
 # Close the file descriptor and kill tail
 exec 3<&-
 kill "${tail_pid}" 2>/dev/null || true
+tail_pid=""
 
-# Wait for output file to finish growing (stabilize) before stopping tail
+# Wait for output file to stabilize (NFS flush) before final read
 if [ -f "$output_file" ]; then
     last_size=-1
     same_count=0
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
     done
 fi
 
-# Stop tailing (trap will also handle this on exit)
-kill "${tail_pid}" 2>/dev/null || true
-
 echo ""
 echo "=== Final output ==="
 cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
     exit 1
 fi
 
+monitor_success=1
 echo "Job $job_id completed successfully"
 exit 0
-
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 6279f5f578..eed7e002c7 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -1,32 +1,24 @@
 name: 'Benchmark'
 
 on:
-  # Trigger when Test Suite completes (no polling needed)
-  workflow_run:
-    workflows: ["Test Suite"]
-    types: [completed]
+  pull_request:
+  pull_request_review:
+    types: [submitted]
   workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: true
 
 jobs:
   file-changes:
     name: Detect File Changes
-    # Only run if Test Suite passed (or manual dispatch)
-    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
    runs-on: 'ubuntu-latest'
     outputs:
       checkall: ${{ steps.changes.outputs.checkall }}
-      pr_number: ${{ steps.pr-info.outputs.pr_number }}
-      pr_approved: ${{ steps.pr-info.outputs.approved }}
-      pr_author: ${{ steps.pr-info.outputs.author }}
     steps:
       - name: Clone
         uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
 
       - name: Detect Changes
         uses: dorny/paths-filter@v3
@@ -34,52 +26,10 @@
         with:
           filters: ".github/file-filter.yml"
 
-      - name: Get PR Info
-        id: pr-info
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-            echo "pr_number=" >> $GITHUB_OUTPUT
-            echo "approved=true" >> $GITHUB_OUTPUT
-            echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
-          else
-            # Get PR number from workflow_run
-            PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
-            if [ -n "$PR_NUMBER" ]; then
-              echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
-
-              # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
-              PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
-              echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
-
-              # Check if PR is approved
-              APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
-                --jq '[.[] | select(.state == "APPROVED")] | length')
-              if [ "$APPROVED" -gt 0 ]; then
-                echo "approved=true" >> $GITHUB_OUTPUT
-              else
-                echo "approved=false" >> $GITHUB_OUTPUT
-              fi
-            else
-              echo "pr_number=" >> $GITHUB_OUTPUT
-              echo "approved=false" >> $GITHUB_OUTPUT
-              echo "author=" >> $GITHUB_OUTPUT
-            fi
-          fi
-
   self:
     name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
-    if: >
-      github.repository == 'MFlowCode/MFC' &&
-      needs.file-changes.outputs.checkall == 'true' &&
-      (
-        github.event_name == 'workflow_dispatch' ||
-        needs.file-changes.outputs.pr_approved == 'true' ||
-        needs.file-changes.outputs.pr_author == 'sbryngelson' ||
-        needs.file-changes.outputs.pr_author == 'wilfonba'
-      )
-    needs: [file-changes]
+    if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }}
+    needs: file-changes
     strategy:
       fail-fast: false
       matrix:
@@ -136,14 +86,10 @@
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
     timeout-minutes: 480
-    env:
-      ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
-      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
       - name: Clone - PR
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
           path: pr
 
       - name: Clone - Master
@@ -155,10 +101,15 @@
       - name: Setup & Build
         if: matrix.build_script != ''
-        run: |
-          (cd pr && ${{ matrix.build_script }}) &
-          (cd master && ${{ matrix.build_script }}) &
-          wait %1 && wait %2
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 3
+          retry_wait_seconds: 60
+          timeout_minutes: 480
+          command: |
+            (cd pr && ${{ matrix.build_script }}) &
+            (cd master && ${{ matrix.build_script }}) &
+            wait %1 && wait %2
 
       - name: Bench (Master v. PR)
         run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 18cddc96ca..199eda213c 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
 
@@ -18,39 +20,10 @@ fi
 
 . ./mfc.sh load -c f -m g
 
-max_attempts=3
-attempt=1
-while [ $attempt -le $max_attempts ]; do
-    echo "Build attempt $attempt of $max_attempts..."
-    if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
-    else
-        if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
-            build_cmd_ok=true
-        else
-            build_cmd_ok=false
-        fi
-    fi
-
-    if [ "$build_cmd_ok" = true ]; then
-        echo "Build succeeded on attempt $attempt."
-        exit 0
-    fi
-
-    if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
-        sleep 30
-    fi
-    attempt=$((attempt + 1))
-done
-
-echo "Build failed after $max_attempts attempts."
-exit 1
+if [ "$run_bench" == "bench" ]; then
+    for dir in benchmarks/*/; do
+        ./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts
+    done
+else
+    ./mfc.sh test -v -a --dry-run --rdma-mpi -j 4 $build_opts
+fi
diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh
index d5b416c65a..4c3e0e3e27 100644
--- a/.github/workflows/frontier/submit.sh
+++ b/.github/workflows/frontier/submit.sh
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
 submit_output=$(sbatch < None:
 def delete_directory(dirpath: str) -> None:
-    if os.path.isdir(dirpath):
-        shutil.rmtree(dirpath)
+    for attempt in range(5):
+        if not os.path.isdir(dirpath):
+            return
+        try:
+            shutil.rmtree(dirpath)
+            return
+        except OSError:
+            if attempt == 4:
+                raise
+            time.sleep(1)
 
 
 def get_program_output(arguments: typing.List[str] = None, cwd=None):
diff --git a/toolchain/mfc/lint_docs.py b/toolchain/mfc/lint_docs.py
index ddd63a1503..e6b8c87338 100644
--- a/toolchain/mfc/lint_docs.py
+++ b/toolchain/mfc/lint_docs.py
@@ -59,10 +59,10 @@
     "docs/documentation/case.md": CASE_MD_SKIP,
 }
 
+# Match @ref page_id patterns (page IDs may contain hyphens)
 REF_RE = re.compile(r"@ref\s+([\w-]+)")
 
-
 def check_docs(repo_root: Path) -> list[str]:
     """Check that file paths referenced in documentation still exist."""
     errors = []
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 31a3771cb9..54e00186dd 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -99,6 +99,14 @@ def __filter(cases_) -> typing.List[TestCase]:
         skipped_cases += example_cases
         cases = [case for case in cases if case not in example_cases]
 
+    if ARG("shard") is not None:
+        parts = ARG("shard").split("/")
+        if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]):
+            raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').")
+        shard_idx, shard_count = int(parts[0]), int(parts[1])
+        skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1]
+        cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1]
+
     if ARG("percent") == 100:
         return cases, skipped_cases
 
@@ -206,6 +214,15 @@ def test():
     # Build the summary report
     _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases)
 
+    # Write failed UUIDs to file for CI retry logic
+    failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
+    if failed_tests:
+        with open(failed_uuids_path, "w") as f:
+            for test_info in failed_tests:
+                f.write(test_info['uuid'] + "\n")
+    elif os.path.exists(failed_uuids_path):
+        os.remove(failed_uuids_path)
+
     exit(nFAIL)