diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh
index 06b6eaa8d3..be9b5c5a94 100755
--- a/.github/scripts/run_parallel_benchmarks.sh
+++ b/.github/scripts/run_parallel_benchmarks.sh
@@ -52,16 +52,16 @@ else
     echo "Master job completed successfully"
 fi
 
-# Check if either job failed
+# Warn if either job failed (partial results may still be usable)
 if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
-    echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
-    exit 1
+    echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
+    echo "Checking for partial results..."
+else
+    echo "=========================================="
+    echo "Both benchmark jobs completed successfully!"
+    echo "=========================================="
 fi
 
-echo "=========================================="
-echo "Both benchmark jobs completed successfully!"
-echo "=========================================="
-
 # Final verification that output files exist before proceeding
 pr_yaml="pr/bench-${device}-${interface}.yaml"
 master_yaml="master/bench-${device}-${interface}.yaml"
diff --git a/.github/scripts/setup-build-cache.sh b/.github/scripts/setup-build-cache.sh
new file mode 100755
index 0000000000..9a1c7266e4
--- /dev/null
+++ b/.github/scripts/setup-build-cache.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Sets up a persistent build cache for self-hosted CI runners.
+# Creates a symlink: ./build -> <scratch>/.mfc-ci-cache/<cache-key>/build
+#
+# Each runner gets its own cache keyed by (cluster, device, interface, runner).
+# This avoids cross-runner path issues entirely — CMake's absolute paths are
+# always correct because the same runner always uses the same workspace path.
+#
+# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> [interface]
+
+_cache_cluster="${1:?Usage: setup-build-cache.sh <cluster> <device> [interface]}"
+_cache_device="${2:?}"
+_cache_interface="${3:-none}"
+_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"
+
+_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
+_cache_base="$HOME/scratch/.mfc-ci-cache/${_cache_key}/build"
+
+mkdir -p "$_cache_base"
+_cache_dir="$(cd "$_cache_base" && pwd -P)"
+
+echo "=== Build Cache Setup ==="
+echo "  Cache key: $_cache_key"
+echo "  Cache dir: $_cache_dir"
+
+# Replace any existing build/ (real dir or stale symlink) with a symlink
+# to our runner-specific cache directory.
+# Use unlink for symlinks to avoid rm -rf following the link and deleting
+# the shared cache contents (which another runner may be using).
+if [ -L "build" ]; then
+    unlink "build"
+elif [ -e "build" ]; then
+    rm -rf "build"
+fi
+
+ln -s "$_cache_dir" "build"
+
+echo "  Symlink: build -> $_cache_dir"
+echo "========================="
diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh
index 8e4b124107..80790752d7 100755
--- a/.github/scripts/submit_and_monitor_bench.sh
+++ b/.github/scripts/submit_and_monitor_bench.sh
@@ -37,9 +37,13 @@ fi
 echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
 
 # Use the monitoring script from PR (where this script lives)
-bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"
-
-echo "[$dir] Monitoring complete for job $job_id"
+monitor_exit=0
+bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
+if [ "$monitor_exit" -ne 0 ]; then
+    echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
+else
+    echo "[$dir] Monitoring complete for job $job_id"
+fi
 
 # Verify the YAML output file was created
 yaml_file="${job_slug}.yaml"
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 6279f5f578..4f777a165e 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -46,6 +46,13 @@ jobs:
         else
           # Get PR number from workflow_run
           PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
+          if [ -z "$PR_NUMBER" ]; then
+            # Cross-repo PRs don't populate pull_requests[]. Search by head SHA.
+            HEAD_SHA="${{ github.event.workflow_run.head_sha }}"
+            PR_NUMBER=$(gh api "repos/${{ github.repository }}/pulls?state=open&sort=updated&direction=desc&per_page=30" \
+              --jq ".[] | select(.head.sha == \"$HEAD_SHA\") | .number" | head -1)
+          fi
+
          if [ -n "$PR_NUMBER" ]; then
            echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
@@ -53,14 +60,20 @@ jobs:
            PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
            echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
 
-            # Check if PR is approved
-            APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
-              --jq '[.[] | select(.state == "APPROVED")] | length')
-            if [ "$APPROVED" -gt 0 ]; then
-              echo "approved=true" >> $GITHUB_OUTPUT
-            else
-              echo "approved=false" >> $GITHUB_OUTPUT
-            fi
+            # Check if PR is approved by a maintainer/admin (ignore AI bot approvals)
+            APPROVERS=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
+              --jq '[.[] | select(.state == "APPROVED") | .user.login] | unique | .[]')
+            APPROVED="false"
+            for approver in $APPROVERS; do
+              PERM=$(gh api "repos/${{ github.repository }}/collaborators/$approver/permission" \
+                --jq '.permission' 2>/dev/null || echo "none")
+              if [ "$PERM" = "admin" ] || [ "$PERM" = "maintain" ] || [ "$PERM" = "write" ]; then
+                echo "  Approved by $approver (permission: $PERM)"
+                APPROVED="true"
+                break
+              fi
+            done
+            echo "approved=$APPROVED" >> $GITHUB_OUTPUT
          else
            echo "pr_number=" >> $GITHUB_OUTPUT
            echo "approved=false" >> $GITHUB_OUTPUT
@@ -76,8 +89,7 @@ jobs:
       ( github.event_name == 'workflow_dispatch' ||
         needs.file-changes.outputs.pr_approved == 'true' ||
-        needs.file-changes.outputs.pr_author == 'sbryngelson' ||
-        needs.file-changes.outputs.pr_author == 'wilfonba'
+        needs.file-changes.outputs.pr_author == 'sbryngelson'
       )
     needs: [file-changes]
     strategy:
@@ -164,6 +176,7 @@ jobs:
        run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
 
      - name: Generate & Post Comment
+        if: always()
        run: |
          (cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)
          (cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml)
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 6cd6db11b8..9589c79e49 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -35,6 +35,12 @@ jobs:
      - name: Checkouts
        uses: actions/checkout@v4
 
+      - name: Restore Build Cache
+        uses: actions/cache@v4
+        with:
+          path: build
+          key: mfc-coverage-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
+
      - name: Setup Ubuntu
        run: |
          sudo apt update -y
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 18cddc96ca..ca09c2a116 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -18,6 +18,11 @@ fi
 
 . ./mfc.sh load -c f -m g
 
+# Only set up build cache for test suite, not benchmarks
+if [ "$run_bench" != "bench" ]; then
+    source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
+fi
+
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
@@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
+        rm -rf build/staging build/install build/lock.yaml
         sleep 30
     fi
     attempt=$((attempt + 1))
diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh
index 56c47d8ff4..6036f73fc2 100644
--- a/.github/workflows/frontier_amd/build.sh
+++ b/.github/workflows/frontier_amd/build.sh
@@ -18,6 +18,11 @@ fi
 
 . ./mfc.sh load -c famd -m g
 
+# Only set up build cache for test suite, not benchmarks
+if [ "$run_bench" != "bench" ]; then
+    source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
+fi
+
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
@@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
+        rm -rf build/staging build/install build/lock.yaml
         sleep 30
     fi
     attempt=$((attempt + 1))
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 3920d96ed7..74c31c9fba 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -10,18 +10,39 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+# Set up persistent build cache
+source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
+
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
     echo "Build attempt $attempt of $max_attempts..."
     if ./mfc.sh test -v --dry-run -j 8 $build_opts; then
         echo "Build succeeded on attempt $attempt."
+
+        # Smoke-test the cached binaries to catch architecture mismatches
+        # (SIGILL from binaries compiled on a different compute node).
+        syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1)
+        if [ -n "$syscheck_bin" ] && ! "$syscheck_bin" > /dev/null 2>&1; then
+            echo "WARNING: syscheck binary crashed — cached install is stale."
+            if [ $attempt -lt $max_attempts ]; then
+                echo "Clearing cache and rebuilding..."
+                rm -rf build/staging build/install build/lock.yaml
+                sleep 5
+                attempt=$((attempt + 1))
+                continue
+            else
+                echo "ERROR: syscheck still failing after $max_attempts attempts."
+                exit 1
+            fi
+        fi
+
         break
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
+        rm -rf build/staging build/install build/lock.yaml
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
@@ -40,4 +61,3 @@ if [ "$job_device" = "gpu" ]; then
 fi
 
 ./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
-
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0be51076ec..9deb1920a2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -94,6 +94,12 @@ jobs:
      - name: Clone
        uses: actions/checkout@v4
 
+      - name: Restore Build Cache
+        uses: actions/cache@v4
+        with:
+          path: build
+          key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
+
      - name: Setup MacOS
        if: matrix.os == 'macos'
        run: |
@@ -202,6 +208,8 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v4
+        with:
+          clean: false
 
      - name: Build
        if: matrix.cluster != 'phoenix'
diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py
index 78b9b0fc65..80da40a7d4 100644
--- a/toolchain/mfc/bench.py
+++ b/toolchain/mfc/bench.py
@@ -1,4 +1,4 @@
-import os, sys, uuid, subprocess, dataclasses, typing, math, traceback
+import os, sys, uuid, subprocess, dataclasses, typing, math, traceback, time
 
 import rich.table
 
@@ -16,7 +16,7 @@ class BenchCase:
     path: str
     args: typing.List[str]
 
-# pylint: disable=too-many-locals, too-many-branches, too-many-statements
+# pylint: disable=too-many-locals, too-many-branches, too-many-statements, too-many-nested-blocks
 def bench(targets = None):
     if targets is None: targets = ARG("targets")
 
@@ -53,6 +53,8 @@ def bench(targets = None):
 
     failed_cases = []
 
+    max_attempts = 2
+
     for i, case in enumerate(CASES):
         summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
         log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out")
@@ -64,71 +66,94 @@ def bench(targets = None):
         cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")
 
         try:
-            with open(log_filepath, "w") as log_file:
-                result = system(
-                    ["./mfc.sh", "run", case.path, "--case-optimization"] +
-                    ["--targets"] + [t.name for t in targets] +
-                    ["--output-summary", summary_filepath] +
-                    case.args +
-                    ["--", "--gbpp", str(ARG('mem'))],
-                    stdout=log_file,
-                    stderr=subprocess.STDOUT)
-
-            # Check return code (handle CompletedProcess or int defensively)
-            rc = result.returncode if hasattr(result, "returncode") else result
-            if rc != 0:
-                cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
-                cons.print(f"[bold red]  Check log at: {log_filepath}[/bold red]")
-                failed_cases.append(case.slug)
-                continue
-
-            # Validate summary file exists
-            if not os.path.exists(summary_filepath):
-                cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
-                cons.print(f"[bold red]  Expected: {summary_filepath}[/bold red]")
-                failed_cases.append(case.slug)
-                continue
-
-            # Load summary
-            summary = file_load_yaml(summary_filepath)
-
-            # Validate all targets have required data
-            validation_failed = False
-            for target in targets:
-                if target.name not in summary:
-                    cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
-                    validation_failed = True
-                    break
-
-                if "exec" not in summary[target.name]:
-                    cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
-                    validation_failed = True
+            for attempt in range(1, max_attempts + 1):
+                try:
+                    with open(log_filepath, "w") as log_file:
+                        result = system(
+                            ["./mfc.sh", "run", case.path, "--case-optimization"] +
+                            ["--targets"] + [t.name for t in targets] +
+                            ["--output-summary", summary_filepath] +
+                            case.args +
+                            ["--", "--gbpp", str(ARG('mem'))],
+                            stdout=log_file,
+                            stderr=subprocess.STDOUT)
+
+                    # Check return code (handle CompletedProcess or int defensively)
+                    rc = result.returncode if hasattr(result, "returncode") else result
+                    if rc != 0:
+                        if attempt < max_attempts:
+                            cons.print(f"[bold yellow]WARNING[/bold yellow]: Case {case.slug} failed with exit code {rc} (attempt {attempt}/{max_attempts})")
+                            cons.print(f"Retrying in 5s...")
+                            time.sleep(5)
+                            continue
+                        cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
+                        cons.print(f"[bold red]  Check log at: {log_filepath}[/bold red]")
+                        failed_cases.append(case.slug)
+                        break
+
+                    # Validate summary file exists
+                    if not os.path.exists(summary_filepath):
+                        if attempt < max_attempts:
+                            cons.print(f"[bold yellow]WARNING[/bold yellow]: Summary file not created for {case.slug} (attempt {attempt}/{max_attempts})")
+                            cons.print(f"Retrying in 5s...")
+                            time.sleep(5)
+                            continue
+                        cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
+                        cons.print(f"[bold red]  Expected: {summary_filepath}[/bold red]")
+                        failed_cases.append(case.slug)
+                        break
+
+                    # Load summary
+                    summary = file_load_yaml(summary_filepath)
+
+                    # Validate all targets have required data
+                    validation_failed = False
+                    for target in targets:
+                        if target.name not in summary:
+                            cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
+                            validation_failed = True
+                            break
+
+                        if "exec" not in summary[target.name]:
+                            cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
+                            validation_failed = True
+                            break
+
+                        if target.name == "simulation" and "grind" not in summary[target.name]:
+                            cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
+                            validation_failed = True
+                            break
+
+                    if validation_failed:
+                        failed_cases.append(case.slug)
+                        break
+
+                    # Add to results
+                    results["cases"][case.slug] = {
+                        "description": dataclasses.asdict(case),
+                        "output_summary": summary,
+                    }
+                    cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
                     break
 
-                if target.name == "simulation" and "grind" not in summary[target.name]:
-                    cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
-                    validation_failed = True
-                    break
+                except Exception as e:
+                    if attempt < max_attempts:
+                        cons.print(f"[bold yellow]WARNING[/bold yellow]: Unexpected error running {case.slug} (attempt {attempt}/{max_attempts}): {e}")
+                        cons.print(f"Retrying in 5s...")
+                        time.sleep(5)
+                        continue
+                    cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
+                    cons.print(f"[dim]{traceback.format_exc()}[/dim]")
+                    failed_cases.append(case.slug)
 
-            if validation_failed:
-                failed_cases.append(case.slug)
-                continue
-
-            # Add to results
-            results["cases"][case.slug] = {
-                "description": dataclasses.asdict(case),
-                "output_summary": summary,
-            }
-            cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
-
-        except Exception as e:
-            cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
-            cons.print(f"[dim]{traceback.format_exc()}[/dim]")
-            failed_cases.append(case.slug)
         finally:
             cons.unindent()
 
-    # Report results
+    # Always write results (even partial) so diff() can compare the intersection
+    file_dump_yaml(ARG("output"), results)
+    cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
+
+    # Report failures after writing results
     if failed_cases:
         cons.print()
         cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
@@ -137,11 +162,6 @@ def bench(targets = None):
 
         cons.print()
         raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")
 
-    # Write output
-    file_dump_yaml(ARG("output"), results)
-
-    cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
-
     finally:
         cons.unindent()
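
Note (not part of the patch): a minimal sketch of how the new cache helper is meant to be consumed by a cluster script, mirroring the calls this patch adds to .github/workflows/phoenix/test.sh and the frontier build scripts. The exported RUNNER_NAME fallback and the literal "gpu"/"none" arguments are hypothetical placeholders for illustration only; on a real self-hosted runner, RUNNER_NAME is already set by GitHub Actions and the scripts pass "$job_device" and "$job_interface" instead.

    #!/bin/bash
    # Hypothetical standalone excerpt; assumes it runs from the MFC repo root.
    export RUNNER_NAME="${RUNNER_NAME:-runner-1}"   # normally provided by the Actions runner

    # Link ./build to this runner's persistent cache, e.g.
    #   $HOME/scratch/.mfc-ci-cache/phoenix-gpu-none-runner-1/build
    source .github/scripts/setup-build-cache.sh phoenix gpu none

    # On a failed or stale build, clear only the derived artifacts inside the
    # cache instead of running ./mfc.sh clean (as the retry paths in this
    # patch do), then rebuild:
    rm -rf build/staging build/install build/lock.yaml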