From a6364c5b3194141eda7ce91c2b2bed86c428eb42 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 11 Feb 2026 21:34:12 -0500 Subject: [PATCH 1/5] Fix bash segfault in monitor_slurm_job.sh from fractional read timeout read -t 0.1 (sub-second timeout) in a loop with process substitution file descriptors triggers a bash internal error (unwind_frame_run: read_builtin: frame not found) leading to a segfault. Use integer timeout (read -t 1) instead. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 27472e01ef..232a894f8a 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -64,7 +64,7 @@ while true; do # Try to read from tail output (non-blocking via timeout) # Read multiple lines if available to avoid falling behind lines_read=0 - while IFS= read -r -t 0.1 line <&3 2>/dev/null; do + while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" lines_read=$((lines_read + 1)) last_heartbeat=$(date +%s) @@ -115,7 +115,7 @@ done # Drain any remaining output from tail after job completes echo "Draining remaining output..." 
drain_count=0 -while IFS= read -r -t 0.5 line <&3 2>/dev/null; do +while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" drain_count=$((drain_count + 1)) # Safety limit to avoid infinite loop From 1ac123c1b02c079879c2cead57834bc64ca1bd59 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Feb 2026 20:54:32 -0500 Subject: [PATCH 2/5] Switch Phoenix GPU jobs to H200 nodes for faster scheduling Co-Authored-By: Claude Opus 4.6 --- .github/workflows/phoenix/submit-bench.sh | 5 ++--- .github/workflows/phoenix/submit.sh | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 7ae85e66fe..fc28b3046b 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -20,9 +20,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -CL40S -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 06a03e465a..5747c839f0 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -23,9 +23,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then From 17bdcc86b8e196b4ae2022c3711e197d694310dd Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 17:12:53 -0500 Subject: [PATCH 3/5] Remove NODE_OPTIONS from CI workflow Phoenix runners work fine without the max-old-space-size override. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0be51076ec..3576876520 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -196,7 +196,6 @@ jobs: group: phoenix labels: ${{ matrix.runner }} env: - NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }} ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: From a8882c50d7875f1c979b3816e98d6bce9775f408 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 17:36:12 -0500 Subject: [PATCH 4/5] Add workspace pre-clean to avoid stale NFS handles on self-hosted runners The actions/checkout clean step fails with ESTALE errors on NFS-backed storage when build artifacts from previous runs have stale file handles. Pre-clean with rm -rf (which tolerates stale handles) and disable checkout's built-in clean. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3576876520..f617242c23 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -199,8 +199,13 @@ jobs: ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: + - name: Clean workspace + run: rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true + - name: Clone uses: actions/checkout@v4 + with: + clean: false - name: Build if: matrix.cluster != 'phoenix' From 3bb6621a026a595e3498f7f4f34cf37923397093 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 14 Feb 2026 09:39:33 -0500 Subject: [PATCH 5/5] Address review feedback: guard GITHUB_WORKSPACE, fix comments - Use ${GITHUB_WORKSPACE:?} to fail fast if variable is unset or empty - Fix ntasks-per-node comment to say "tasks (MPI ranks)" not "cores" - Fix monitor script comment: polling-based, not non-blocking Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 2 +- .github/workflows/phoenix/submit-bench.sh | 2 +- .github/workflows/phoenix/submit.sh | 2 +- .github/workflows/test.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 232a894f8a..fd7abe962d 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -61,7 +61,7 @@ squeue_failures=0 last_heartbeat=$(date +%s) while true; do - # Try to read from tail output (non-blocking via timeout) + # Try to read from tail output with a 1s timeout (polling-based) # Read multiple lines if available to avoid falling behind lines_read=0 while IFS= read -r -t 1 line <&3 2>/dev/null; do diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index fc28b3046b..03e1506d15 100644 --- 
a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -21,7 +21,7 @@ sbatch_cpu_opts="\ sbatch_gpu_opts="\ #SBATCH --gres=gpu:H200:2 -#SBATCH --ntasks-per-node=8 # Number of cores per node required\ +#SBATCH --ntasks-per-node=8 # Number of tasks (MPI ranks) per node\ " if [ "$2" = "cpu" ]; then diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 5747c839f0..9540433624 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -24,7 +24,7 @@ sbatch_cpu_opts="\ sbatch_gpu_opts="\ #SBATCH --gres=gpu:H200:2 -#SBATCH --ntasks-per-node=8 # Number of cores per node required\ +#SBATCH --ntasks-per-node=8 # Number of tasks (MPI ranks) per node\ " if [ "$2" = "cpu" ]; then diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f617242c23..0a95e8a196 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -200,7 +200,7 @@ jobs: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: - name: Clean workspace - run: rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true + run: rm -rf "${GITHUB_WORKSPACE:?}"/* "${GITHUB_WORKSPACE:?}"/.[!.]* 2>/dev/null || true - name: Clone uses: actions/checkout@v4