Skip to content
Closed
6 changes: 3 additions & 3 deletions .github/scripts/monitor_slurm_job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,10 @@ squeue_failures=0
last_heartbeat=$(date +%s)

while true; do
# Try to read from tail output (non-blocking via timeout)
# Try to read from tail output with a 1s timeout (polling-based)
# Read multiple lines if available to avoid falling behind
lines_read=0
while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
while IFS= read -r -t 1 line <&3 2>/dev/null; do
echo "$line"
lines_read=$((lines_read + 1))
last_heartbeat=$(date +%s)
Expand Down Expand Up @@ -115,7 +115,7 @@ done
# Drain any remaining output from tail after job completes
echo "Draining remaining output..."
drain_count=0
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
while IFS= read -r -t 1 line <&3 2>/dev/null; do
echo "$line"
drain_count=$((drain_count + 1))
# Safety limit to avoid infinite loop
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/phoenix/submit-bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@ sbatch_cpu_opts="\
"

sbatch_gpu_opts="\
#SBATCH -CL40S
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
#SBATCH --gres=gpu:H200:2
#SBATCH --ntasks-per-node=8 # Number of tasks (MPI ranks) per node\
"

if [ "$2" = "cpu" ]; then
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/phoenix/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@ sbatch_cpu_opts="\
"

sbatch_gpu_opts="\
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
#SBATCH --gres=gpu:H200:2
#SBATCH --ntasks-per-node=8 # Number of tasks (MPI ranks) per node\
"

if [ "$2" = "cpu" ]; then
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -199,12 +199,16 @@ jobs:
group: phoenix
labels: ${{ matrix.runner }}
env:
NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
steps:
- name: Clean workspace
run: rm -rf "${GITHUB_WORKSPACE:?}"/* "${GITHUB_WORKSPACE:?}"/.[!.]* 2>/dev/null || true

- name: Clone
uses: actions/checkout@v4
with:
clean: false

- name: Build
if: matrix.cluster != 'phoenix'
Expand Down
Loading