@@ -26,7 +26,7 @@ program main
2626integer :: batchsize
2727integer :: status
2828integer :: i,j,k,il,jl,kl,ig,jg,kg,t,stage
29- integer :: im,ip,jm,jp,km,kp,last,idx
29+ integer :: im,ip,jm,jp,km,kp,last,idx,kgm
3030! TDMA variables
3131double precision , allocatable :: a(:), b(:), c(:)
3232double complex , allocatable :: d(:), sol(:)
@@ -74,8 +74,8 @@ program main
7474call readinput
7575
7676! domain decomposition (pencils in y and z)
77- pr = 0
78- pc = 0
77+ pr = 1
78+ pc = 2
7979halo_ext= 1
8080
8181! CuDECOMP initialization and settings
@@ -148,6 +148,7 @@ program main
148148! End cuDecomp initialization
149149
150150
151+
151152! CUFFT initialization -- Create Plans (along x anf y only, z not required)
152153! Forward 1D FFT in X: D2Z
153154batchSize = piX_d2z% shape (2 )* piX_d2z% shape (3 ) ! <- number of FFT (from x-pencil dimension)
@@ -239,13 +240,14 @@ program main
239240 do j = 1 + halo_ext, piX% shape (2 )- halo_ext
240241 jg = piX% lo(2 ) + j - 1 - halo_ext
241242 do i = 1 , piX% shape (1 )
242- amp= 1 .d0
243- mx= 2.d0
244- my= 2.d0
245- mz= 4.d0
243+ amp= 3 .d0
244+ mx= 3.03d0
245+ my= 2.02d0
246+ mz= 4.00d0
246247 ! 3D divergence free flow with fluctuations that satisfies the boundary conditions
247- u(i,j,k) = 20.d0 * (1.d0 - ((2 * z(kg) - lz)/ lz)** 2 )
248+ u(i,j,k) = 20.d0 * (1.d0 - ((2 * z(kg) - lz)/ lz)** 2 ) !
248249 u(i,j,k) = u(i,j,k) - amp* cos (twopi* mx* x(i)/ lx)* sin (twopi* my* y(jg)/ ly)* 2.d0 * twopi/ lz* sin (twopi* z(kg)/ lz)* cos (twopi* z(kg)/ lz)
250+ u(i,j,k) = u(i,j,k) + amp* sin (twopi* mx* x(i)/ lx)* (- twopi* my/ ly)* sin (2.d0 * twopi* my* y(jg)/ ly)* sin (twopi* z(kg)/ lz)* sin (twopi* z(kg)/ lz)
249251 v(i,j,k) = - amp* cos (twopi* my* y(jg)/ ly)* (twopi* mx/ lx)* cos (twopi* mx* x(i)/ lx)* sin (twopi* z(kg)/ lz)* sin (twopi* z(kg)/ lz)
250252 w(i,j,k) = amp* cos (twopi* mx* x(i)/ lx)* (twopi* mx/ lx)* sin (twopi* my* y(jg)/ ly)* sin (twopi* z(kg)/ lz)* sin (twopi* z(kg)/ lz)
251253 enddo
@@ -719,7 +721,7 @@ program main
719721 enddo
720722 #endif
721723
722- ! 5 .3 update halos (y and z directions ), required to then compute the RHS of Poisson equation because of staggered grid
724+ ! 8 .3 update halos (y direction ), required to then compute the RHS of Poisson equation because of staggered grid
723725 ! $acc host_data use_device(u,v,w)
724726 CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX% halo_extents, halo_periods, 2 ))
725727 CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX% halo_extents, halo_periods, 3 ))
@@ -730,24 +732,27 @@ program main
730732 ! $acc end host_data
731733
732734 ! impose velocity boundary conditions, w is at the wall, u and v interpolate so that the mean value is zero, no-slip assumted, i.e. u=0, can be extented to any value
733- ! even when stretched the spacing cell face is centered
734- ! $acc parallel loop collapse(3)
735+ ! $acc parallel loop collapse(3)
735736 do k= 1 , piX% shape (3 )
736- do j= 1 , piX% shape (2 )
737+ do j= 1 + halo_ext , piX% shape (2 )- halo_ext
737738 do i= 1 ,nx
738- kg = piX% lo(3 ) + k - 1 - halo_ext
739+ kg = piX% lo(3 ) + k - 1 - halo_ext
739740 ! bottom wall
740- if (kg .eq. 1 ) u(i,j,k-1 )= - u(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
741- if (kg .eq. 1 ) v(i,j,k-1 )= - v(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
742- if (kg .eq. 1 ) w(i,j,k)= 0.d0 ! w point is at the wall
741+ if (kg .eq. 0 ) u(i,j,k)= - u(i,j,k+1 ) ! mean value between kg and kg-1 (wall) equal to zero
742+ if (kg .eq. 0 ) v(i,j,k)= - v(i,j,k+1 ) ! mean value between kg and kg-1 (wall) equal to zero
743+ if (kg .eq. 1 ) w(i,j,k)= 0.d0 ! w point is at the wall
743744 ! top wall
744- if (kg .eq. nz) u(i,j,k+1 )= - u(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
745- if (kg .eq. nz) v(i,j,k+1 )= - v(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
746- if (kg .eq. nz+1 ) w(i,j,k)= 0.d0 ! w point (nz+1) is at the wall
745+ if (kg .eq. nz+1 ) u(i,j,k)= - u(i,j,k-1 ) ! mean value between kg and kg+1 (wall) equal to zero
746+ if (kg .eq. nz+1 ) v(i,j,k)= - v(i,j,k-1 ) ! mean value between kg and kg+1 (wall) equal to zero
747+ if (kg .eq. nz+1 ) w(i,j,k)= 0.d0 ! w point (nz+1) is at the wall
747748 enddo
748749 enddo
749750 enddo
751+
750752 enddo
753+ call nvtxEndRange
754+
755+
751756 ! ########################################################################################################################################
752757 ! END STEP 6: USTAR COMPUTATION
753758 ! ########################################################################################################################################
@@ -780,6 +785,7 @@ program main
780785 ! $acc end kernels
781786 call nvtxEndRange
782787
788+
783789 call nvtxStartRange(" FFT forward w/ transpositions" )
784790 ! $acc host_data use_device(rhsp)
785791 status = cufftExecD2Z(planXf, rhsp, psi_d)
@@ -810,30 +816,29 @@ program main
810816 ! $acc parallel loop collapse(2) gang private(a,b,c,d,factor)
811817 do jl = 1 , npy
812818 do il = 1 , npx
813- ! compute index global wavenumber ig and jg
819+ ! compute global index ig and jg
814820 jg = yoff + jl
815821 ig = xoff + il
816822 ! Set up tridiagonal system for each i and j
817823 ! Fill diagonals and rhs for each
818- ! 0 and ny +1 are the ghost nodes
824+ ! 0 and nz +1 are the ghost nodes
819825 do k = 1 , nz
820- a(k) = 2.0d0 * (dzi(k)** 2 * dzi(k+1 ))/ (dzi(k)+ dzi(k+1 ))
821- c(k) = 2.0d0 * (dzi(k)* dzi(k+1 )** 2 )/ (dzi(k)+ dzi(k+1 ))
822- b(k) = - a(k) - c(k) + ( 2.d0 * (cos (kx_d(ig)* dx)- 1.d0 )* dxi* dxi + 2.d0 * (cos (ky_d(jg)* dy)- 1.d0 )* dyi* dyi)
826+ a(k) = 2.0d0 * (dzi(k)** 2.d0 * dzi(k+1 ))/ (dzi(k)+ dzi(k+1 ))
827+ c(k) = 2.0d0 * (dzi(k)* dzi(k+1 )** 2.d0 )/ (dzi(k)+ dzi(k+1 ))
828+ b(k) = - a(k) - c(k) + 2.d0 * (cos (kx_d(ig)* dx)- 1.d0 )* dxi* dxi + 2.d0 * (cos (ky_d(jg)* dy)- 1.d0 )* dyi* dyi ! opt: precompute cosines?
823829 d(k) = psi3d(k,il,jl)
824830 enddo
825831 ! Neumann BC at bottom
826832 a(0 ) = 0.d0
827- b(0 ) = - 1.d0 ! *dzi(1)*dzi(1)
828- c(0 ) = 1.d0 ! *dzi(1)*dzi(1)
833+ b(0 ) = - 1.d0
834+ c(0 ) = 1.d0
829835 d(0 ) = 0.d0
830836 ! Neumann BC at top
831- a(nz+1 ) = 1.d0 ! *dzi(nz+1)*dzi(nz+1)
832- b(nz+1 ) = - 1.d0 ! *dzi(nz+1)*dzi(nz+1)
837+ a(nz+1 ) = 1.d0
838+ b(nz+1 ) = - 1.d0
833839 c(nz+1 ) = 0.d0
834840 d(nz+1 ) = 0.d0
835841 ! Enforce pressure at one point? one interior point, avodig messing up with BC
836- ! need brackets?
837842 if (ig == 1 .and. jg == 1 ) then
838843 a(nz+1 ) = 0.d0
839844 b(nz+1 ) = 1.d0
@@ -847,10 +852,9 @@ program main
847852 d(k) = d(k) - factor* d(k-1 )
848853 end do
849854 ! Back substitution
850- psi3d(nz+1 ,il,jl) = d(nz+1 )/ b(nz+1 )
851- ! check on pivot like flutas?
855+ psi3d(nz,il,jl) = (d(nz) - c(nz)* d(nz+1 )/ b(nz+1 ))/ b(nz)
852856 ! $acc loop seq
853- do k = nz, 1 , - 1
857+ do k = nz-1 , 1 , - 1
854858 psi3d(k,il,jl) = (d(k) - c(k)* psi3d(k+1 ,il,jl))/ b(k)
855859 end do
856860 end do
@@ -879,18 +883,14 @@ program main
879883 end do
880884 end do
881885 end do
882-
883- call nvtxEndRange
884886 ! update halo nodes with pressure
885- ! Update X-pencil halos
886887 ! $acc host_data use_device(p)
887888 CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX% halo_extents, halo_periods, 2 ))
888889 CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX% halo_extents, halo_periods, 3 ))
889890 ! $acc end host_data
890891 ! ########################################################################################################################################
891892 ! END STEP 7: POISSON SOLVER FOR PRESSURE
892893 ! ########################################################################################################################################
893- call nvtxEndRange
894894
895895
896896 call nvtxStartRange(" Correction" )
@@ -900,7 +900,10 @@ program main
900900 ! 8.1 Correct velocity
901901 ! 8.2 Call halo update
902902 ! Correct velocity, pressure has also the halo
903- ! $acc kernels
903+ umax= 0.d0
904+ vmax= 0.d0
905+ wmax= 0.d0
906+ ! $acc parallel loop collapse(3) reduction(max:umax,vmax,wmax)
904907 do k= 1 + halo_ext, piX% shape (3 )- halo_ext
905908 do j= 1 + halo_ext, piX% shape (2 )- halo_ext
906909 do i = 1 , piX% shape (1 ) ! equal to nx (no halo on x)
@@ -909,13 +912,20 @@ program main
909912 km= k-1
910913 kg= piX% lo(3 ) + k - 1 - halo_ext
911914 if (im < 1 ) im= nx
915+ if (kg .eq. 1 ) then
916+ u(i,j,k)= u(i,j,k) - dt/ rho* (p(i,j,k)- p(im,j,k))* dxi
917+ v(i,j,k)= v(i,j,k) - dt/ rho* (p(i,j,k)- p(i,jm,k))* dyi
918+ else
912919 u(i,j,k)= u(i,j,k) - dt/ rho* (p(i,j,k)- p(im,j,k))* dxi
913920 v(i,j,k)= v(i,j,k) - dt/ rho* (p(i,j,k)- p(i,jm,k))* dyi
914921 w(i,j,k)= w(i,j,k) - dt/ rho* (p(i,j,k)- p(i,j,km))* dzi(kg)
922+ endif
923+ ! umax=max(umax,u(i,j,k))
924+ ! vmax=max(vmax,v(i,j,k))
925+ ! wmax=max(wmax,w(i,j,k))
915926 enddo
916927 enddo
917928 enddo
918- ! $acc end kernels
919929
920930 ! 8.3 update halos (y direction), required to then compute the RHS of Poisson equation because of staggered grid
921931 ! $acc host_data use_device(u,v,w)
@@ -927,34 +937,7 @@ program main
927937 CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX% halo_extents, halo_periods, 3 ))
928938 ! $acc end host_data
929939
930- ! impose velocity boundary conditions, can be optimized, no real gain
931- ! w is at the wall, u and v interpolate so that the mean value is zero
932- ! no-slip assumted, i.e. u=0, can be extented to any value
933- umax= 0.d0
934- vmax= 0.d0
935- wmax= 0.d0
936- ! $acc parallel loop collapse(3) reduction(max:umax,vmax,wmax)
937- do k= 1 , piX% shape (3 )
938- do j= 1 , piX% shape (2 )
939- do i= 1 ,nx
940- kg = piX% lo(3 ) + k - 1 - halo_ext
941- ! bottom wall
942- if (kg .eq. 1 ) u(i,j,k-1 ) = - u(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
943- if (kg .eq. 1 ) v(i,j,k-1 ) = - v(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
944- if (kg .eq. 1 ) w(i,j,k) = 0.d0 ! w point is at the wall
945- ! top wall
946- if (kg .eq. nz) u(i,j,k+1 ) = - u(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
947- if (kg .eq. nz) v(i,j,k+1 ) = - v(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
948- if (kg .eq. nz+1 ) w(i,j,k) = 0.d0 ! w point (nz+1) is at the wall
949- umax= max (umax,u(i,j,k))
950- vmax= max (vmax,v(i,j,k))
951- wmax= max (wmax,w(i,j,k))
952- enddo
953- enddo
954- enddo
955-
956-
957- maxdiv= 0.0d0
940+ maxdiv= 0.d0
958941 ! $acc parallel loop collapse(3) reduction(max:maxdiv)
959942 do k= 1 + halo_ext, piX% shape (3 )- halo_ext
960943 do j= 1 + halo_ext, piX% shape (2 )- halo_ext
@@ -971,7 +954,6 @@ program main
971954 enddo
972955 write (* ,* ) " Max divergence after correction " , maxdiv
973956
974-
975957 call MPI_Allreduce(umax,gumax,1 ,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
976958 call MPI_Allreduce(vmax,gvmax,1 ,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
977959 call MPI_Allreduce(wmax,gwmax,1 ,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
0 commit comments