diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index 27472e01ef..16717551cd 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -4,11 +4,17 @@
 set -euo pipefail
 
-# Cleanup handler to prevent orphaned tail processes
+# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
 cleanup() {
     if [ -n "${tail_pid:-}" ]; then
         kill "${tail_pid}" 2>/dev/null || true
     fi
+    # Cancel the SLURM job if the monitor is exiting due to an error
+    # (e.g., the CI runner is being killed). Don't cancel on success.
+    if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
+        echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
+        scancel "$job_id" 2>/dev/null || true
+    fi
 }
 trap cleanup EXIT
 
@@ -23,30 +29,78 @@ output_file="$2"
 echo "Submitted batch job $job_id"
 echo "Monitoring output file: $output_file"
 
-# Wait for file to appear with retry logic for transient squeue failures
+# Robustly check SLURM job state using squeue with sacct fallback.
+# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
+# or "UNKNOWN" if both commands fail.
+get_job_state() {
+    local jid="$1"
+    local state
+
+    # Try squeue first (fast, works for active jobs)
+    state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
+    if [ -n "$state" ]; then
+        echo "$state"
+        return
+    fi
+
+    # Fallback to sacct (works for completed/historical jobs)
+    if command -v sacct >/dev/null 2>&1; then
+        state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
+        if [ -n "$state" ]; then
+            echo "$state"
+            return
+        fi
+    fi
+
+    echo "UNKNOWN"
+}
+
+# Check if a state is terminal (job is done, for better or worse).
+# CANCELLED* also matches sacct variants such as "CANCELLED+" and "CANCELLED by <uid>".
+is_terminal_state() {
+    case "$1" in
+        COMPLETED|FAILED|CANCELLED*|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE)
+            return 0 ;;
+        *)
+            return 1 ;;
+    esac
+}
+
+# Wait for file to appear, using robust state checking.
+# Never give up due to transient squeue/sacct failures — the CI job timeout
+# is the ultimate backstop.
 echo "Waiting for job to start..."
-squeue_retries=0
-max_squeue_retries=5
+unknown_count=0
 while [ ! -f "$output_file" ]; do
-    # Check if job is still queued/running
-    if squeue -j "$job_id" &>/dev/null; then
-        squeue_retries=0  # Reset on success
-        sleep 5
-    else
-        squeue_retries=$((squeue_retries + 1))
-        if [ $squeue_retries -ge $max_squeue_retries ]; then
-            # Job not in queue and output file doesn't exist
-            if [ ! -f "$output_file" ]; then
-                echo "ERROR: Job $job_id not in queue and output file not created"
+    state=$(get_job_state "$job_id")
+
+    case "$state" in
+        PENDING|CONFIGURING)
+            unknown_count=0
+            sleep 5
+            ;;
+        RUNNING|COMPLETING)
+            unknown_count=0
+            # Job is running but output file not yet visible (NFS delay)
+            sleep 2
+            ;;
+        UNKNOWN)
+            unknown_count=$((unknown_count + 1))
+            # Only print warning periodically to avoid log spam
+            if [ $((unknown_count % 12)) -eq 1 ]; then
+                echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
+            fi
+            sleep 5
+            ;;
+        *)
+            # Terminal state — job finished without creating output
+            if is_terminal_state "$state"; then
+                echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
                 exit 1
             fi
-            break
-        fi
-        # Exponential backoff
-        sleep_time=$((2 ** squeue_retries))
-        echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
-        sleep $sleep_time
-    fi
+            # Unrecognized state, keep waiting
+            sleep 5
+            ;;
+    esac
 done
 
 echo "=== Streaming output for job $job_id ==="
@@ -57,14 +111,13 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
 tail_pid=$!
 
 # Monitor job status and stream output simultaneously
-squeue_failures=0
 last_heartbeat=$(date +%s)
 
 while true; do
     # Try to read from tail output (non-blocking via timeout)
    # Read multiple lines if available to avoid falling behind
     lines_read=0
-    while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
+    while IFS= read -r -t 1 line <&3 2>/dev/null; do
         echo "$line"
         lines_read=$((lines_read + 1))
         last_heartbeat=$(date +%s)
@@ -73,41 +126,22 @@ while true; do
             break
         fi
     done
-    
+
     # Check job status
     current_time=$(date +%s)
-    if ! squeue -j "$job_id" &>/dev/null; then
-        squeue_failures=$((squeue_failures + 1))
-        # Check if job actually completed using sacct (if available)
-        if [ $squeue_failures -ge 3 ]; then
-            if command -v sacct >/dev/null 2>&1; then
-                state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
-                # Consider job done only if it reached a terminal state
-                case "$state" in
-                    COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
-                        echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
-                        break
-                        ;;
-                    *)
-                        # treat as transient failure, reset failures and continue polling
-                        squeue_failures=0
-                        ;;
-                esac
-            else
-                # No sacct: assume job completed after 3 failures
-                echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
-                break
-            fi
-        fi
+    state=$(get_job_state "$job_id")
+
+    if is_terminal_state "$state"; then
+        echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
+        break
     else
-        squeue_failures=0
         # Print heartbeat if no output for 60 seconds
         if [ $((current_time - last_heartbeat)) -ge 60 ]; then
-            echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
+            echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
             last_heartbeat=$current_time
         fi
     fi
-    
+
    # Sleep briefly between status checks
     sleep 1
 done
@@ -115,7 +149,7 @@ done
 # Drain any remaining output from tail after job completes
 echo "Draining remaining output..."
 drain_count=0
-while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
+while IFS= read -r -t 1 line <&3 2>/dev/null; do
     echo "$line"
     drain_count=$((drain_count + 1))
     # Safety limit to avoid infinite loop
@@ -128,8 +162,9 @@ done
 # Close the file descriptor and kill tail
 exec 3<&-
 kill "${tail_pid}" 2>/dev/null || true
+tail_pid=""
 
-# Wait for output file to finish growing (stabilize) before stopping tail
+# Wait for output file to stabilize (NFS flush) before final read
 if [ -f "$output_file" ]; then
     last_size=-1
     same_count=0
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
     done
 fi
 
-# Stop tailing (trap will also handle this on exit)
-kill "${tail_pid}" 2>/dev/null || true
-
 echo ""
 echo "=== Final output ==="
 cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
     exit 1
 fi
 
+monitor_success=1
 echo "Job $job_id completed successfully"
 exit 0
-
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 6279f5f578..eed7e002c7 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -1,32 +1,24 @@
 name: 'Benchmark'
 
 on:
-  # Trigger when Test Suite completes (no polling needed)
-  workflow_run:
-    workflows: ["Test Suite"]
-    types: [completed]
+  pull_request:
+  pull_request_review:
+    types: [submitted]
   workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: true
 
 jobs:
   file-changes:
     name: Detect File Changes
-    # Only run if Test Suite passed (or manual dispatch)
-    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
    runs-on: 'ubuntu-latest'
     outputs:
       checkall: ${{ steps.changes.outputs.checkall }}
-      pr_number: ${{ steps.pr-info.outputs.pr_number }}
-      pr_approved: ${{ steps.pr-info.outputs.approved }}
-      pr_author: ${{ steps.pr-info.outputs.author }}
     steps:
       - name: Clone
         uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
 
       - name: Detect Changes
         uses: dorny/paths-filter@v3
@@ -34,52 +26,10 @@
         with:
           filters: ".github/file-filter.yml"
 
-      - name: Get PR Info
-        id: pr-info
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-            echo "pr_number=" >> $GITHUB_OUTPUT
-            echo "approved=true" >> $GITHUB_OUTPUT
-            echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
-          else
-            # Get PR number from workflow_run
-            PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
-            if [ -n "$PR_NUMBER" ]; then
-              echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
-
-              # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
-              PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
-              echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
-
-              # Check if PR is approved
-              APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
-                --jq '[.[] | select(.state == "APPROVED")] | length')
-              if [ "$APPROVED" -gt 0 ]; then
-                echo "approved=true" >> $GITHUB_OUTPUT
-              else
-                echo "approved=false" >> $GITHUB_OUTPUT
-              fi
-            else
-              echo "pr_number=" >> $GITHUB_OUTPUT
-              echo "approved=false" >> $GITHUB_OUTPUT
-              echo "author=" >> $GITHUB_OUTPUT
-            fi
-          fi
-
   self:
     name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
-    if: >
-      github.repository == 'MFlowCode/MFC' &&
-      needs.file-changes.outputs.checkall == 'true' &&
-      (
-        github.event_name == 'workflow_dispatch' ||
-        needs.file-changes.outputs.pr_approved == 'true' ||
-        needs.file-changes.outputs.pr_author == 'sbryngelson' ||
-        needs.file-changes.outputs.pr_author == 'wilfonba'
-      )
-    needs: [file-changes]
+    if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }}
+    needs: file-changes
     strategy:
       fail-fast: false
       matrix:
@@ -136,14 +86,10 @@
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
     timeout-minutes: 480
-    env:
-      ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
-      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
       - name: Clone - PR
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
           path: pr
 
       - name: Clone - Master
@@ -155,10 +101,15 @@
       - name: Setup & Build
         if: matrix.build_script != ''
-        run: |
-          (cd pr && ${{ matrix.build_script }}) &
-          (cd master && ${{ matrix.build_script }}) &
-          wait %1 && wait %2
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 3
+          retry_wait_seconds: 60
+          timeout_minutes: 480
+          command: |
+            (cd pr && ${{ matrix.build_script }}) &
+            (cd master && ${{ matrix.build_script }}) &
+            wait %1 && wait %2
 
       - name: Bench (Master v. PR)
         run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 18cddc96ca..199eda213c 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
 
@@ -18,39 +20,10 @@ fi
 
 . ./mfc.sh load -c f -m g
 
-max_attempts=3
-attempt=1
-while [ $attempt -le $max_attempts ]; do
-    echo "Build attempt $attempt of $max_attempts..."
-    if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
-    else
-        if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
-            build_cmd_ok=true
-        else
-            build_cmd_ok=false
-        fi
-    fi
-
-    if [ "$build_cmd_ok" = true ]; then
-        echo "Build succeeded on attempt $attempt."
-        exit 0
-    fi
-
-    if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
-        sleep 30
-    fi
-    attempt=$((attempt + 1))
-done
-
-echo "Build failed after $max_attempts attempts."
-exit 1
+if [ "$run_bench" == "bench" ]; then
+    for dir in benchmarks/*/; do
+        ./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts
+    done
+else
+    ./mfc.sh test -v -a --dry-run --rdma-mpi -j 4 $build_opts
+fi
diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh
index d5b416c65a..4c3e0e3e27 100644
--- a/.github/workflows/frontier/submit.sh
+++ b/.github/workflows/frontier/submit.sh
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
 submit_output=$(sbatch < None:
 def delete_directory(dirpath: str) -> None:
-    if os.path.isdir(dirpath):
-        shutil.rmtree(dirpath)
+    for attempt in range(5):
+        if not os.path.isdir(dirpath):
+            return
+        try:
+            shutil.rmtree(dirpath)
+            return
+        except OSError:
+            if attempt == 4:
+                raise
+            time.sleep(1)
 
 
 def get_program_output(arguments: typing.List[str] = None, cwd=None):
diff --git a/toolchain/mfc/lint_docs.py b/toolchain/mfc/lint_docs.py
index ddd63a1503..e6b8c87338 100644
--- a/toolchain/mfc/lint_docs.py
+++ b/toolchain/mfc/lint_docs.py
@@ -59,10 +59,10 @@
     "docs/documentation/case.md": CASE_MD_SKIP,
 }
 
+# Match @ref page_id patterns (page IDs may contain hyphens)
 REF_RE = re.compile(r"@ref\s+([\w-]+)")
 
-
 def check_docs(repo_root: Path) -> list[str]:
     """Check that file paths referenced in documentation still exist."""
     errors = []
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 31a3771cb9..54e00186dd 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -99,6 +99,14 @@ def __filter(cases_) -> typing.List[TestCase]:
         skipped_cases += example_cases
         cases = [case for case in cases if case not in example_cases]
 
+    if ARG("shard") is not None:
+        parts = ARG("shard").split("/")
+        if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]):
+            raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').")
+        shard_idx, shard_count = int(parts[0]), int(parts[1])
+        skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1]
+        cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1]
+
     if ARG("percent") == 100:
         return cases, skipped_cases
 
@@ -206,6 +214,15 @@ def test():
     # Build the summary report
     _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases)
 
+    # Write failed UUIDs to file for CI retry logic
+    failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
+    if failed_tests:
+        with open(failed_uuids_path, "w") as f:
+            for test_info in failed_tests:
+                f.write(test_info['uuid'] + "\n")
+    elif os.path.exists(failed_uuids_path):
+        os.remove(failed_uuids_path)
+
     exit(nFAIL)