# Patch summary (preamble text placed before the first `diff --git` header):
#
# .github/scripts/monitor_slurm_job.sh
#   - In the read loop inside `while true`, the `read -t` timeout on fd 3 goes
#     from 0.1 to 1, and the comment above it is reworded to say "1s timeout
#     (polling-based)".
#   - In the "Draining remaining output..." loop, the `read -t` timeout goes
#     from 0.5 to 1.
#
# .github/workflows/phoenix/submit-bench.sh
#   - `sbatch_gpu_opts`: the lines `#SBATCH -CL40S`,
#     `#SBATCH --ntasks-per-node=4` and `#SBATCH -G2` are replaced by
#     `#SBATCH --gres=gpu:H200:2` and `#SBATCH --ntasks-per-node=8`.
#
# .github/workflows/phoenix/submit.sh
#   - `sbatch_gpu_opts`: the lines
#     `#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s`,
#     `#SBATCH --ntasks-per-node=4` and `#SBATCH -G2` are replaced by the same
#     two lines as in submit-bench.sh.
#
# .github/workflows/test.yml
#   - Removes the `NODE_OPTIONS` entry from the job's `env:` map.
#   - Adds a "Clean workspace" step before "Clone" that runs `rm -rf` on
#     "${GITHUB_WORKSPACE:?}"/* and "${GITHUB_WORKSPACE:?}"/.[!.]* with stderr
#     discarded and failures ignored (`|| true`).
#   - Passes `clean: false` to `actions/checkout@v4`.
#
# NOTE(review): in this copy of the patch the line breaks appear to have been
#   collapsed into spaces; the hunk line structure (and the leading whitespace
#   of context lines) would have to be restored from the original commit
#   before it can be applied -- confirm against the source commit.
# NOTE(review): the `.[!.]*` glob requires a non-dot second character, so
#   entries whose names start with two dots (e.g. `..cache`) are not removed
#   by the "Clean workspace" step -- confirm none are expected.
diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 27472e01ef..fd7abe962d 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -61,10 +61,10 @@ squeue_failures=0 last_heartbeat=$(date +%s) while true; do - # Try to read from tail output (non-blocking via timeout) + # Try to read from tail output with a 1s timeout (polling-based) # Read multiple lines if available to avoid falling behind lines_read=0 - while IFS= read -r -t 0.1 line <&3 2>/dev/null; do + while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" lines_read=$((lines_read + 1)) last_heartbeat=$(date +%s) @@ -115,7 +115,7 @@ done # Drain any remaining output from tail after job completes echo "Draining remaining output..." drain_count=0 -while IFS= read -r -t 0.5 line <&3 2>/dev/null; do +while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" drain_count=$((drain_count + 1)) # Safety limit to avoid infinite loop diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 7ae85e66fe..03e1506d15 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -20,9 +20,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -CL40S -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of tasks (MPI ranks) per node\ " if [ "$2" = "cpu" ]; then diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 06a03e465a..9540433624 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -23,9 +23,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of tasks (MPI ranks) per node\ " if [ "$2" = 
"cpu" ]; then diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ec964794ff..82667a4057 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -199,12 +199,16 @@ jobs: group: phoenix labels: ${{ matrix.runner }} env: - NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }} ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: + - name: Clean workspace + run: rm -rf "${GITHUB_WORKSPACE:?}"/* "${GITHUB_WORKSPACE:?}"/.[!.]* 2>/dev/null || true + - name: Clone uses: actions/checkout@v4 + with: + clean: false - name: Build if: matrix.cluster != 'phoenix'