MFlowCode · sbryngelson · Feb 14, 2026 · Feb 14, 2026 · Feb 14, 2026 · Feb 14, 2026
@@ -52,16 +52,16 @@ else
   echo "Master job completed successfully"
 fi
 
-# Check if either job failed
+# Warn if either job failed (partial results may still be usable)
 if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
-  echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
-  exit 1
+  echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
+  echo "Checking for partial results..."
+else
+  echo "=========================================="
+  echo "Both benchmark jobs completed successfully!"
+  echo "=========================================="
 fi
-# Warn if either job failed (partial results may still be usable)
-if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
-  echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
-  exit 1
-  echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
-  echo "Checking for partial results..."
-else
-  echo "=========================================="
-  echo "Both benchmark jobs completed successfully!"
-  echo "=========================================="
-fi
+bench_failed=0
+
+# Warn if either job failed (partial results may still be usable)
+if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
+  echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
+  echo "Checking for partial results..."
+  bench_failed=1
+else
+  echo "=========================================="
+  echo "Both benchmark jobs completed successfully!"
+  echo "=========================================="
+fi
-# Warn if either job failed (partial results may still be usable)
-if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
-  echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
-  exit 1
-  echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
-  echo "Checking for partial results..."
-else
-  echo "=========================================="
-  echo "Both benchmark jobs completed successfully!"
-  echo "=========================================="
-fi
+bench_failed=0
+
+# Warn if either job failed (partial results may still be usable)
+if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
+  echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
+  echo "Checking for partial results..."
+  bench_failed=1
+else
+  echo "=========================================="
+  echo "Both benchmark jobs completed successfully!"
+  echo "=========================================="
+fi
 
-echo "=========================================="
-echo "Both benchmark jobs completed successfully!"
-echo "=========================================="
-
 # Final verification that output files exist before proceeding
 pr_yaml="pr/bench-${device}-${interface}.yaml"
 master_yaml="master/bench-${device}-${interface}.yaml"

@@ -0,0 +1,39 @@
+#!/bin/bash
+# Sets up a persistent build cache for self-hosted CI runners.
+# Creates a symlink: ./build -> <scratch>/.mfc-ci-cache/<key>/build
+#
+# Each runner gets its own cache keyed by (cluster, device, interface, runner).
+# This avoids cross-runner path issues entirely — CMake's absolute paths are
+# always correct because the same runner always uses the same workspace path.
+#
+# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> <interface>
+
+_cache_cluster="${1:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
+_cache_device="${2:?}"
+_cache_interface="${3:-none}"
+_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"
+
+_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
+_cache_base="$HOME/scratch/.mfc-ci-cache/${_cache_key}/build"
+
+mkdir -p "$_cache_base"
+_cache_dir="$(cd "$_cache_base" && pwd -P)"
+
+echo "=== Build Cache Setup ==="
+echo "  Cache key: $_cache_key"
+echo "  Cache dir: $_cache_dir"
+
+# Replace any existing build/ (real dir or stale symlink) with a symlink
+# to our runner-specific cache directory.
+# Use unlink for symlinks to avoid rm -rf following the link and deleting
+# the shared cache contents (which another runner may be using).
+if [ -L "build" ]; then
+    unlink "build"
+elif [ -e "build" ]; then
+    rm -rf "build"
+fi
+
+ln -s "$_cache_dir" "build"
+
+echo "  Symlink: build -> $_cache_dir"
+echo "========================="
@@ -37,9 +37,13 @@ fi
 echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
 
 # Use the monitoring script from PR (where this script lives)
-bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"
-
-echo "[$dir] Monitoring complete for job $job_id"
+if bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"; then
+  monitor_exit=0
+  echo "[$dir] Monitoring complete for job $job_id"
+else
+  monitor_exit=$?  # non-zero status returned by monitor_slurm_job.sh
+  echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
+fi
 
 # Verify the YAML output file was created
 yaml_file="${job_slug}.yaml"

@@ -46,21 +46,34 @@ jobs:
           else
             # Get PR number from workflow_run
             PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
+            if [ -z "$PR_NUMBER" ]; then
+              # Cross-repo PRs don't populate pull_requests[]. Search by head SHA.
+              HEAD_SHA="${{ github.event.workflow_run.head_sha }}"
+              PR_NUMBER=$(gh api "repos/${{ github.repository }}/pulls?state=open&sort=updated&direction=desc&per_page=30" \
+                  --jq ".[] | select(.head.sha == \"$HEAD_SHA\") | .number" | head -1)
+            fi
+
             if [ -n "$PR_NUMBER" ]; then
               echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
 
               # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
               PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
               echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
 
-              # Check if PR is approved
-              APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
-                --jq '[.[] | select(.state == "APPROVED")] | length')
-              if [ "$APPROVED" -gt 0 ]; then
-                echo "approved=true" >> $GITHUB_OUTPUT
-              else
-                echo "approved=false" >> $GITHUB_OUTPUT
-              fi
+              # Check if PR is approved by a collaborator with write/maintain/admin permission (ignore AI bot approvals).
+              # Only each reviewer's latest APPROVED/CHANGES_REQUESTED/DISMISSED review counts, so a withdrawn approval does not pass.
+              APPROVERS=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
+                  --jq '[.[] | select(.state == "APPROVED" or .state == "CHANGES_REQUESTED" or .state == "DISMISSED")] | group_by(.user.login) | map(max_by(.submitted_at)) | .[] | select(.state == "APPROVED") | .user.login')
+              APPROVED="false"
+              for approver in $APPROVERS; do
+                  PERM=$(gh api "repos/${{ github.repository }}/collaborators/$approver/permission" --jq '.permission' 2>/dev/null || echo "none")
+                  if [ "$PERM" = "admin" ] || [ "$PERM" = "maintain" ] || [ "$PERM" = "write" ]; then
+                      echo "  Approved by $approver (permission: $PERM)"
+                      APPROVED="true"
+                      break
+                  fi
+              done
+              echo "approved=$APPROVED" >> $GITHUB_OUTPUT
             else
               echo "pr_number=" >> $GITHUB_OUTPUT
               echo "approved=false" >> $GITHUB_OUTPUT
@@ -76,8 +89,7 @@ jobs:
       (
         github.event_name == 'workflow_dispatch' ||
         needs.file-changes.outputs.pr_approved == 'true' ||
-        needs.file-changes.outputs.pr_author == 'sbryngelson' ||
-        needs.file-changes.outputs.pr_author == 'wilfonba'
+        needs.file-changes.outputs.pr_author == 'sbryngelson'
       )
     needs: [file-changes]
     strategy:
@@ -164,6 +176,7 @@ jobs:
         run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
 
       - name: Generate & Post Comment
+        if: ${{ !cancelled() }}  # still runs after a failed benchmark step (partial results), but not when the run is cancelled
         run: |
           (cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)
           (cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml)

@@ -35,6 +35,12 @@ jobs:
       - name: Checkouts
         uses: actions/checkout@v4
 
+      - name: Restore Build Cache
+        uses: actions/cache@v4
+        with:
+          path: build
+          key: mfc-coverage-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
+
       - name: Setup Ubuntu
         run: |
             sudo apt update -y

@@ -18,6 +18,11 @@ fi
 
 . ./mfc.sh load -c f -m g
 
+# Only set up build cache for test suite, not benchmarks
+if [ "$run_bench" != "bench" ]; then
+    source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
+fi
+
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
@@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
+        rm -rf build/staging build/install build/lock.yaml
         sleep 30
     fi
     attempt=$((attempt + 1))

@@ -18,6 +18,11 @@ fi
 
 . ./mfc.sh load -c famd -m g
 
+# Only set up build cache for test suite, not benchmarks
+if [ "$run_bench" != "bench" ]; then
+    source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
+fi
+
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
@@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
+        rm -rf build/staging build/install build/lock.yaml
         sleep 30
     fi
     attempt=$((attempt + 1))

@@ -10,18 +10,39 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+# Set up persistent build cache
+source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
+
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
     echo "Build attempt $attempt of $max_attempts..."
     if ./mfc.sh test -v --dry-run -j 8 $build_opts; then
         echo "Build succeeded on attempt $attempt."
+
+        # Run the first syscheck binary found under build/install as a smoke test.
+        # Any non-zero exit (e.g. SIGILL from binaries compiled on a different
+        # compute node) is treated as a stale cached install.
+        syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1)
+        if [ -n "$syscheck_bin" ] && ! "$syscheck_bin" > /dev/null 2>&1; then
+            echo "WARNING: syscheck binary crashed — cached install is stale."
+            if [ $attempt -ge $max_attempts ]; then
+                echo "ERROR: syscheck still failing after $max_attempts attempts."
+                exit 1
+            fi
+            echo "Clearing cache and rebuilding..."
+            rm -rf build/staging build/install build/lock.yaml
+            sleep 5
+            attempt=$((attempt + 1))
+            continue
+        fi
+
         break
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
+        rm -rf build/staging build/install build/lock.yaml
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
@@ -40,4 +61,3 @@ if [ "$job_device" = "gpu" ]; then
 fi
 
 ./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
-
@@ -94,6 +94,12 @@ jobs:
       - name: Clone
         uses: actions/checkout@v4
 
+      - name: Restore Build Cache
+        uses: actions/cache@v4
+        with:
+          path: build
+          key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
+
       - name: Setup MacOS
         if:   matrix.os == 'macos'
         run:  |
@@ -202,6 +208,8 @@ jobs:
     steps:
       - name: Clone
         uses: actions/checkout@v4
+        with:
+          clean: false
 
       - name: Build
         if:   matrix.cluster != 'phoenix'