Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/scripts/run_parallel_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,16 @@ else
echo "Master job completed successfully"
fi

# Check if either job failed
# Warn if either job failed (partial results may still be usable)
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
exit 1
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
echo "Checking for partial results..."
else
echo "=========================================="
echo "Both benchmark jobs completed successfully!"
echo "=========================================="
fi
Comment on lines +55 to 63
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Use a flag to track benchmark job failures and exit with a non-zero status at the end of the script if any job failed, ensuring the workflow status is accurate. [possible issue, importance: 9]

Suggested change
# Warn if either job failed (partial results may still be usable)
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
exit 1
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
echo "Checking for partial results..."
else
echo "=========================================="
echo "Both benchmark jobs completed successfully!"
echo "=========================================="
fi
bench_failed=0
# Warn if either job failed (partial results may still be usable)
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
echo "Checking for partial results..."
bench_failed=1
else
echo "=========================================="
echo "Both benchmark jobs completed successfully!"
echo "=========================================="
fi


echo "=========================================="
echo "Both benchmark jobs completed successfully!"
echo "=========================================="

# Final verification that output files exist before proceeding
pr_yaml="pr/bench-${device}-${interface}.yaml"
master_yaml="master/bench-${device}-${interface}.yaml"
Expand Down
39 changes: 39 additions & 0 deletions .github/scripts/setup-build-cache.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Sets up a persistent build cache for self-hosted CI runners.
# Creates a symlink: ./build -> $HOME/scratch/.mfc-ci-cache/<key>/build
#
# Each runner gets its own cache keyed by (cluster, device, interface, runner).
# This avoids cross-runner path issues entirely — CMake's absolute paths are
# always correct because the same runner always uses the same workspace path.
#
# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> [interface]
#
# Arguments:
#   $1  cluster    required
#   $2  device     required
#   $3  interface  optional, defaults to "none"
# Environment:
#   RUNNER_NAME    required; becomes part of the cache key
#   HOME           the cache lives under $HOME/scratch
# Status:
#   Non-zero if the cache directory or the symlink could not be set up. In that
#   case an existing ./build is left untouched whenever the failure is detected
#   before the replacement step.
#
# This file is meant to be sourced, so every variable carries a _cache_ prefix
# to avoid clobbering the caller's names. A missing required argument aborts
# the calling (non-interactive) shell via the ${var:?} expansions below.

_cache_cluster="${1:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
_cache_device="${2:?}"
_cache_interface="${3:-none}"
_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"

_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
_cache_base="$HOME/scratch/.mfc-ci-cache/${_cache_key}/build"

# Create the cache directory first and bail out before touching ./build if
# that fails; otherwise we would delete the current build tree and then be
# unable to replace it with anything.
# `return` works when sourced; the `|| exit` covers direct execution.
if ! mkdir -p -- "$_cache_base"; then
  echo "ERROR: could not create build cache directory: $_cache_base" >&2
  return 1 2>/dev/null || exit 1
fi

# Resolve to a physical path so the symlink target contains no symlinks itself.
_cache_dir="$(cd -- "$_cache_base" && pwd -P)" || _cache_dir=""
if [ -z "$_cache_dir" ]; then
  echo "ERROR: could not resolve build cache directory: $_cache_base" >&2
  return 1 2>/dev/null || exit 1
fi

echo "=== Build Cache Setup ==="
echo " Cache key: $_cache_key"
echo " Cache dir: $_cache_dir"

# Replace any existing build/ (real dir or stale symlink) with a symlink
# to our runner-specific cache directory.
# A symlink is removed with unlink so that only the link itself goes away and
# the cache contents it points at are preserved for reuse.
if [ -L "build" ]; then
  unlink "build"
elif [ -e "build" ]; then
  rm -rf -- "build"
fi

# If removal failed, `ln -s` would either error out or, for a leftover real
# directory, silently create the link *inside* it. Detect that explicitly.
if [ -e "build" ] || [ -L "build" ]; then
  echo "ERROR: could not remove existing ./build" >&2
  return 1 2>/dev/null || exit 1
fi

if ! ln -s -- "$_cache_dir" "build"; then
  echo "ERROR: could not create symlink build -> $_cache_dir" >&2
  return 1 2>/dev/null || exit 1
fi

echo " Symlink: build -> $_cache_dir"
echo "========================="
10 changes: 7 additions & 3 deletions .github/scripts/submit_and_monitor_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,13 @@ fi
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"

# Use the monitoring script from PR (where this script lives)
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"

echo "[$dir] Monitoring complete for job $job_id"
monitor_exit=0
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
if [ "$monitor_exit" -ne 0 ]; then
echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
else
echo "[$dir] Monitoring complete for job $job_id"
fi

# Verify the YAML output file was created
yaml_file="${job_slug}.yaml"
Expand Down
33 changes: 23 additions & 10 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,34 @@ jobs:
else
# Get PR number from workflow_run
PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
if [ -z "$PR_NUMBER" ]; then
# Cross-repo PRs don't populate pull_requests[]. Search by head SHA.
HEAD_SHA="${{ github.event.workflow_run.head_sha }}"
PR_NUMBER=$(gh api "repos/${{ github.repository }}/pulls?state=open&sort=updated&direction=desc&per_page=30" \
--jq ".[] | select(.head.sha == \"$HEAD_SHA\") | .number" | head -1)
fi

if [ -n "$PR_NUMBER" ]; then
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT

# Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT

# Check if PR is approved
APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
--jq '[.[] | select(.state == "APPROVED")] | length')
if [ "$APPROVED" -gt 0 ]; then
echo "approved=true" >> $GITHUB_OUTPUT
else
echo "approved=false" >> $GITHUB_OUTPUT
fi
# Check if PR is approved by a maintainer/admin (ignore AI bot approvals)
APPROVERS=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
--jq '[.[] | select(.state == "APPROVED") | .user.login] | unique | .[]')
APPROVED="false"
for approver in $APPROVERS; do
PERM=$(gh api "repos/${{ github.repository }}/collaborators/$approver/permission" \
--jq '.permission' 2>/dev/null || echo "none")
if [ "$PERM" = "admin" ] || [ "$PERM" = "maintain" ] || [ "$PERM" = "write" ]; then
echo " Approved by $approver (permission: $PERM)"
APPROVED="true"
break
fi
done
echo "approved=$APPROVED" >> $GITHUB_OUTPUT
else
echo "pr_number=" >> $GITHUB_OUTPUT
echo "approved=false" >> $GITHUB_OUTPUT
Expand All @@ -76,8 +89,7 @@ jobs:
(
github.event_name == 'workflow_dispatch' ||
needs.file-changes.outputs.pr_approved == 'true' ||
needs.file-changes.outputs.pr_author == 'sbryngelson' ||
needs.file-changes.outputs.pr_author == 'wilfonba'
needs.file-changes.outputs.pr_author == 'sbryngelson'
)
needs: [file-changes]
strategy:
Expand Down Expand Up @@ -164,6 +176,7 @@ jobs:
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Generate & Post Comment
if: always()
run: |
(cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)
(cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml)
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ jobs:
- name: Checkouts
uses: actions/checkout@v4

- name: Restore Build Cache
uses: actions/cache@v4
with:
path: build
key: mfc-coverage-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}

- name: Setup Ubuntu
run: |
sudo apt update -y
Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ fi

. ./mfc.sh load -c f -m g

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
fi

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
Expand Down Expand Up @@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
fi

if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
./mfc.sh clean
echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
rm -rf build/staging build/install build/lock.yaml
sleep 30
fi
attempt=$((attempt + 1))
Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/frontier_amd/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ fi

. ./mfc.sh load -c famd -m g

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
fi

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
Expand Down Expand Up @@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
fi

if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
./mfc.sh clean
echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
rm -rf build/staging build/install build/lock.yaml
sleep 30
fi
attempt=$((attempt + 1))
Expand Down
26 changes: 23 additions & 3 deletions .github/workflows/phoenix/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,39 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

# Set up persistent build cache
source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Build attempt $attempt of $max_attempts..."
if ./mfc.sh test -v --dry-run -j 8 $build_opts; then
echo "Build succeeded on attempt $attempt."

# Smoke-test the cached binaries to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1)
if [ -n "$syscheck_bin" ] && ! "$syscheck_bin" > /dev/null 2>&1; then
echo "WARNING: syscheck binary crashed — cached install is stale."
if [ $attempt -lt $max_attempts ]; then
echo "Clearing cache and rebuilding..."
rm -rf build/staging build/install build/lock.yaml
sleep 5
attempt=$((attempt + 1))
continue
else
echo "ERROR: syscheck still failing after $max_attempts attempts."
exit 1
fi
fi

break
fi

if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
./mfc.sh clean
echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
rm -rf build/staging build/install build/lock.yaml
sleep 30
else
echo "Build failed after $max_attempts attempts."
Expand All @@ -40,4 +61,3 @@ if [ "$job_device" = "gpu" ]; then
fi

./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix

8 changes: 8 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ jobs:
- name: Clone
uses: actions/checkout@v4

- name: Restore Build Cache
uses: actions/cache@v4
with:
path: build
key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}

- name: Setup MacOS
if: matrix.os == 'macos'
run: |
Expand Down Expand Up @@ -202,6 +208,8 @@ jobs:
steps:
- name: Clone
uses: actions/checkout@v4
with:
clean: false

- name: Build
if: matrix.cluster != 'phoenix'
Expand Down
Loading
Loading