@@ -213,25 +213,25 @@ jobs:
213213 fail-fast : false
214214 continue-on-error : true
215215 runs-on : ${{ matrix.nvhpc && 'ubuntu-22.04' || format('{0}-latest', matrix.os) }}
216- container :
217- image : ${{ matrix.nvhpc && format('nvcr.io/nvidia/nvhpc:{0}-devel-cuda_multi-ubuntu22.04', matrix.nvhpc) || '' }}
218- options : ${{ matrix.nvhpc && '--security-opt seccomp=unconfined' || '' }}
219216 env :
220- CC : ${{ matrix.nvhpc && 'nvc' || '' }}
221- CXX : ${{ matrix.nvhpc && 'nvc++' || '' }}
222- FC : ${{ matrix.nvhpc && 'nvfortran' || '' }}
223- OMPI_ALLOW_RUN_AS_ROOT : ${{ matrix.nvhpc && '1' || '' }}
224- OMPI_ALLOW_RUN_AS_ROOT_CONFIRM : ${{ matrix.nvhpc && '1' || '' }}
225- PMIX_MCA_gds : ${{ matrix.nvhpc && 'hash' || '' }}
226- OMPI_MCA_hwloc_base_binding_policy : ${{ matrix.nvhpc && 'none' || '' }}
227- FFLAGS : ${{ matrix.nvhpc && '-tp=px -Kieee -noswitcherror' || '' }}
228- CFLAGS : ${{ matrix.nvhpc && '-tp=px' || '' }}
229- CXXFLAGS : ${{ matrix.nvhpc && '-tp=px' || '' }}
217+ # Image tag for NVHPC jobs; empty for non-NVHPC jobs.
218+ NVHPC_IMAGE : ${{ matrix.nvhpc && format('nvcr.io/nvidia/nvhpc:{0}-devel-cuda_multi-ubuntu22.04', matrix.nvhpc) || '' }}
230219
231220 steps :
# NVHPC only: reclaim runner disk before the ~25-30 GB cuda_multi image is pulled.
- name: Free disk space
  if: matrix.nvhpc
  run: |
    echo "=== Disk before cleanup ==="
    df -h /
    # Delete large preinstalled directories from the runner, one path per line.
    sudo rm -rf \
      /usr/share/dotnet \
      /usr/local/lib/android \
      /opt/ghc \
      /usr/local/share/boost \
      /opt/hostedtoolcache \
      /usr/local/graalvm \
      /usr/local/.ghcup \
      /usr/local/share/chromium \
      /usr/local/lib/node_modules
    # Drop every cached Docker image and the apt package cache as well.
    sudo docker image prune -af
    sudo apt-get clean
    echo "=== Disk after cleanup ==="
    df -h /
235235
236236 - name : Clone
237237 uses : actions/checkout@v4
@@ -274,6 +274,67 @@ jobs:
274274 echo "Coverage cache: none available — full test suite will run"
275275 fi
276276
# NVHPC only: pull the compiler image explicitly instead of using the
# job-level container: directive, so that disk space can be freed first.
# Later steps use "docker run -d ... sleep infinity" + "docker exec" to keep
# installed packages and env vars available across steps.
- name: Pull NVHPC container
  if: matrix.nvhpc
  run: |
    # NVHPC_IMAGE comes from the job-level env block.
    docker pull "$NVHPC_IMAGE"
284+
- name: Start NVHPC container
  if: matrix.nvhpc
  run: |
    # Start one detached container named "nvhpc" that idles on "sleep infinity"
    # so later steps can docker exec into it. The runner workspace is
    # bind-mounted at /workspace, which is also the working directory.
    docker run --detach --name nvhpc \
      --security-opt seccomp=unconfined \
      --volume "${{ github.workspace }}:/workspace" \
      --workdir /workspace \
      --env CC=nvc \
      --env CXX=nvc++ \
      --env FC=nvfortran \
      --env OMPI_ALLOW_RUN_AS_ROOT=1 \
      --env OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
      --env PMIX_MCA_gds=hash \
      --env OMPI_MCA_hwloc_base_binding_policy=none \
      --env "FFLAGS=-tp=px -Kieee -noswitcherror" \
      --env CFLAGS=-tp=px \
      --env CXXFLAGS=-tp=px \
      "$NVHPC_IMAGE" sleep infinity
303+
- name: Setup NVHPC
  if: matrix.nvhpc
  run: |
    # Everything between the single quotes runs inside the "nvhpc" container.
    # Do not use an apostrophe inside the script: it would end the quoting.
    docker exec nvhpc bash -c '
      set -e

      # Build and test prerequisites installed into the container.
      apt-get update -y
      apt-get install -y \
        cmake python3 python3-venv python3-pip \
        libfftw3-dev libhdf5-dev hdf5-tools git

      # The repo is bind-mounted from the host, so git sees a different owner.
      # Marking it safe suppresses the "dubious ownership" errors that
      # otherwise spam 80 000+ lines into the CI log.
      git config --global --add safe.directory /workspace

      # Locate the HPC-X root (two levels above the directory holding mpirun)
      # and the first -L library directory reported by the MPI Fortran wrapper.
      mpirun_path=$(find /opt/nvidia/hpc_sdk -path "*/hpcx/hpcx-*/ompi/bin/mpirun" | head -n 1)
      hpcx_root=$(dirname "$mpirun_path")/../..
      mpi_libdir=$(mpifort --showme:link | grep -oP "(?<=-L)\S+" | head -n 1)

      # Persist env vars for subsequent docker exec calls. The escaped
      # \$LD_LIBRARY_PATH is written literally and expands when the file is
      # sourced, not now.
      {
        echo "export LD_LIBRARY_PATH=${mpi_libdir}:${hpcx_root}/ucx/lib:${hpcx_root}/ucc/lib:\$LD_LIBRARY_PATH"
        echo "export OMPI_MCA_rmaps_base_oversubscribe=1"
      } > /etc/nvhpc-env.sh

      # Debug: confirm compiler flags are set.
      echo "=== NVHPC Environment ==="
      echo "FFLAGS=$FFLAGS"
      echo "CFLAGS=$CFLAGS"
      echo "CXXFLAGS=$CXXFLAGS"
      nvfortran --version
      grep "model name" /proc/cpuinfo | head -n 1
    '
336+
337+ # ── Standard (non-NVHPC) setup ─────────────────────────────────────
277338 - name : Setup MacOS
278339 if : matrix.os == 'macos' && !matrix.nvhpc
279340 run : |
@@ -313,30 +374,7 @@ jobs:
313374 echo "MPICC=mpiicx" >> $GITHUB_ENV
314375 echo "MPICXX=mpiicpx" >> $GITHUB_ENV
315376
316- # --- NVHPC container setup ---
317- - name : Setup NVHPC
318- if : matrix.nvhpc
319- run : |
320- apt-get update -y
321- apt-get install -y cmake python3 python3-venv python3-pip \
322- libfftw3-dev libhdf5-dev hdf5-tools git
323- # Set up NVHPC HPC-X MPI runtime paths
324- HPCX_DIR=$(dirname "$(find /opt/nvidia/hpc_sdk -path "*/hpcx/hpcx-*/ompi/bin/mpirun" | head -1)")/../..
325- MPI_LIB=$(mpifort --showme:link | grep -oP '(?<=-L)\S+' | head -1)
326- echo "LD_LIBRARY_PATH=${MPI_LIB}:${HPCX_DIR}/ucx/lib:${HPCX_DIR}/ucc/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
327- # Container MPI fixes: PMIx shared-memory, hwloc binding
328- echo "PMIX_MCA_gds=hash" >> $GITHUB_ENV
329- echo "OMPI_MCA_hwloc_base_binding_policy=none" >> $GITHUB_ENV
330- echo "OMPI_MCA_rmaps_base_oversubscribe=1" >> $GITHUB_ENV
331- # Debug: confirm compiler flags are set
332- echo "=== NVHPC Environment ==="
333- echo "FFLAGS=$FFLAGS"
334- echo "CFLAGS=$CFLAGS"
335- echo "CXXFLAGS=$CXXFLAGS"
336- nvfortran --version
337- cat /proc/cpuinfo | grep "model name" | head -1
338-
339- # --- Standard build + test ---
377+ # ── Standard build + test ───────────────────────────────────────────
340378 - name : Build
341379 if : ' !matrix.nvhpc'
342380 run : |
@@ -354,22 +392,37 @@ jobs:
354392 TEST_PCT : ${{ matrix.debug == 'reldebug' && '-% 20' || '' }}
355393 ONLY_CHANGES : ${{ github.event_name == 'pull_request' && '--only-changes' || '' }}
356394
# NVHPC build + test: these steps run via docker exec inside the long-lived
# "nvhpc" container.
- name: Build (NVHPC)
  if: matrix.nvhpc && matrix.target == 'cpu'
  run: |
    docker exec nvhpc bash -c '
      # The inner bash does not inherit errexit from the runner shell.
      # Enable it here, as the Setup NVHPC step does, so that a failing
      # command (including sourcing the env file) aborts the step.
      set -e
      # Load LD_LIBRARY_PATH and the OMPI settings persisted by Setup NVHPC.
      source /etc/nvhpc-env.sh
      /bin/bash mfc.sh test -v --dry-run -j $(nproc) --test-all
    '
361403
- name: Build (NVHPC GPU)
  if: matrix.nvhpc && matrix.target == 'gpu'
  run: |
    docker exec nvhpc bash -c '
      # Without errexit only the last command decides the exit status, so a
      # failed "--gpu acc" build would be hidden whenever the "--gpu mp" build
      # succeeds. set -e restores fail-on-first-error for this script.
      set -e
      # Load LD_LIBRARY_PATH and the OMPI settings persisted by Setup NVHPC.
      source /etc/nvhpc-env.sh
      /bin/bash mfc.sh test -v --dry-run -j 2 --test-all --gpu acc
      /bin/bash mfc.sh test -v --dry-run -j 2 --test-all --gpu mp
    '
367412
- name: Test (NVHPC)
  if: matrix.nvhpc && matrix.target == 'cpu'
  run: |
    docker exec nvhpc bash -c '
      # Abort on the first failing command. The inner bash does not inherit
      # errexit from the runner shell.
      set -e
      # Load LD_LIBRARY_PATH and the OMPI settings persisted by Setup NVHPC.
      source /etc/nvhpc-env.sh
      # Best-effort stack-size raise: try unlimited, then 65536, and carry on
      # if both are refused. The || chain keeps this safe under set -e.
      ulimit -s unlimited || ulimit -s 65536 || true
      /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --test-all
    '
421+
# Cleanup: always() makes this run for NVHPC jobs even when an earlier step failed.
- name: Stop NVHPC container
  if: always() && matrix.nvhpc
  run: |
    # Best-effort removal: "|| true" keeps the step green if docker rm fails.
    docker rm --force nvhpc || true
373426
374427 self :
375428 name : " ${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
0 commit comments