diff --git a/src/core_atmosphere/Registry.xml b/src/core_atmosphere/Registry.xml
index 4281c40bba..9bfc99f5c8 100644
--- a/src/core_atmosphere/Registry.xml
+++ b/src/core_atmosphere/Registry.xml
@@ -146,6 +146,31 @@
                      description="Formulation of horizontal mixing"
                      possible_values="`2d_fixed' or `2d_smagorinsky'"/>
 
+                <nml_option name="config_les_model" type="character" default_value="none"
+                     units="-"
+                     description="LES dissipation model"
+                     possible_values="`none', `3d_smagorinsky', `prognostic_1.5_order'"/>
+
+                <nml_option name="config_les_surface" type="character" default_value="none"
+                     units="-"
+                     description="LES surface flux option"
+                     possible_values="`specified', `varying'"/>
+
+                <nml_option name="config_surface_heat_flux" type="real" default_value="0.0"
+                     units="K m s-1"
+                     description="specified surface heat flux w'theta'"
+                     possible_values="real number"/>
+
+                <nml_option name="config_surface_moisture_flux" type="real" default_value="0.0"
+                     units="kg m s-1"
+                     description="specified surface moisture flux w'q'"
+                     possible_values="real number"/>
+
+                <nml_option name="config_surface_drag_coefficient" type="real" default_value="0.0"
+                     units="-"
+                     description="Cd 10m drag coefficient"
+                     possible_values="real number"/>
+
                 <nml_option name="config_len_disp" type="real" default_value="0.0" in_defaults="false"
                      units="m"
                      description="Horizontal length scale, used by the Smagorinsky formulation of horizontal diffusion and by 3-d divergence damping"
@@ -156,6 +181,11 @@
                      description="Scaling coefficient of $\delta x^3$ to obtain $\nabla^4$ diffusion coefficient"
                      possible_values="Non-negative real values"/>
 
+                <nml_option name="config_mix_scalars" type="logical" default_value="false"
+                     units="-"
+                     description="Whether to enable horizontal and vertical mixing for scalars"
+                     possible_values=".true. or .false."/>
+
                 <nml_option name="config_del4u_div_factor" type="real" default_value="10.0" in_defaults="false"
                      units="-"
                      description="Scaling factor for the divergent component of $\nabla^4 u$ calculation"
@@ -392,6 +422,10 @@
                      units="-"
                      description="Method to use for exchanging halos"
                      possible_values="`mpas_dmpar', `mpas_halo'"/>
+               <nml_option name="config_gpu_aware_mpi" type="logical" default_value="false"
+                     units="-"
+                     description="Whether to use GPU-aware MPI for halo exchanges"
+                     possible_values=".true. or .false."/>
         </nml_record>
 
 <!-- **************************************************************************************** -->
@@ -494,6 +528,11 @@
 			<var name="deriv_two"/>
 			<var name="defc_a"/>
 			<var name="defc_b"/>
+			<var name="deformation_coef_c2"/>
+			<var name="deformation_coef_s2"/>
+			<var name="deformation_coef_cs"/>
+			<var name="deformation_coef_c"/>
+			<var name="deformation_coef_s"/>
 			<var name="coeffs_reconstruct"/>
 #ifdef MPAS_CAM_DYCORE
 			<var name="cell_gradient_coef_x"/>
@@ -1561,6 +1600,20 @@
                 <var name="cell_gradient_coef_y" type="real" dimensions="maxEdges nCells" units="m^-1"
                      description="Coefficients for computing the y (meridional) derivative of a cell-centered variable"/>
 #endif
+                <var name="deformation_coef_c2" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the cos-squared terms of the 3d deformation"/>
+
+                <var name="deformation_coef_s2" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the sine-squared terms of the 3d deformation"/>
+
+                <var name="deformation_coef_cs" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the cos-sine terms of the 3d deformation"/>
+
+                <var name="deformation_coef_c" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the cos-only terms of the 3d deformation"/>
+
+                <var name="deformation_coef_s" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the sine-only terms of the 3d deformation"/>
 
                 <!-- Arrays required for reconstruction of velocity field -->
                 <var name="coeffs_reconstruct" type="real" dimensions="R3 maxEdges nCells" units="unitless"
@@ -1673,6 +1726,9 @@
                         <var name="nwfa" array_group="number" units="nb kg^{-1}"
                              description="Water-friendly aerosol number concentration"
                              packages="mp_thompson_aers_in"/>
+
+                        <var name="tke" array_group="turbulence" units="m^2 s^{-2}"
+                             description="Turbulent kinetic energy for the prognostic tke LES scheme"/>
                 </var_array>
 #endif
 
@@ -1921,8 +1977,20 @@
                 <var name="rho_p_save" type="real" dimensions="nVertLevels nCells Time" units="kg m^{-3}"
                      description="predicted value rho_p, saved before acoustic steps"/>
 
-                <var name="kdiff" type="real" dimensions="nVertLevels nCells Time" units="m^2 s^{-1}"
-                     description="Smagorinsky horizontal eddy viscosity"/>
+                <var name="eddy_visc_vert" type="real" dimensions="nVertLevels nCells Time" units="m^2 s^{-1}"
+                     description="vertical eddy viscosity for les models"/>
+
+                <var name="eddy_visc_horz" type="real" dimensions="nVertLevels nCells Time" units="m^2 s^{-1}"
+                     description="horizontal eddy viscosity for les models"/>
+
+                <var name="prandtl_3d_inv" type="real" dimensions="nVertLevels nCells Time" units="unitless"
+                     description="Inverse prandtl number used for vertical mixing in prognostic LES formulation"/>
+
+                <var name="bn2" type="real" dimensions="nVertLevels nCells Time" units="s^{-2}"
+                     description="Square of Brunt-Vaisala frequency"/>
+
+                <var name="tend_rtheta_adv" type="real" dimensions="nVertLevels nCells Time" units="kg K m^{-3} s^{-1}"
+                     description="flux divergence for rho*theta_m/zz, used in the Tiedtke convective parameterization"/>
 
                 <var name="surface_pressure" type="real" dimensions="nCells Time" units="Pa"
                      description="Diagnosed surface pressure"/>
@@ -2024,6 +2092,9 @@
                         <var name="tend_nwfa" name_in_code="nwfa" array_group="number" units="nb m^{-3} s^{-1}"
                              description="Tendency of water-friendly aerosol number concentration multiplied by dry air density divided by d(zeta)/dz"
                              packages="mp_thompson_aers_in"/>
+
+                        <var name="tend_tke" name_in_code="tke" array_group="turbulence" units="kg m^{-3} m^2 s^{-2} s^{-1}" 
+                             description="Tendency of tke multiplied by dry air density divided by d(zeta)/dz"/>
                 </var_array>
 #endif
         </var_struct>
@@ -2102,6 +2173,10 @@
                         <var name="lbc_nwfa" name_in_code="nwfa" array_group="number" packages="mp_thompson_aers_in"
                              units="m^{-3} s^{-1}"
                              description="Lateral boundary tendency of water-friendly aerosol number concentration"/>
+
+                        <var name="lbc_tke" name_in_code="tke" array_group="turbulence"
+                             units="m^2 s^{-3}"
+                             description="Lateral boundary tendency of TKE"/>
                 </var_array>
         </var_struct>
 
diff --git a/src/core_atmosphere/dynamics/Makefile b/src/core_atmosphere/dynamics/Makefile
index 6892633c68..1bf21fff9d 100644
--- a/src/core_atmosphere/dynamics/Makefile
+++ b/src/core_atmosphere/dynamics/Makefile
@@ -1,11 +1,12 @@
 .SUFFIXES: .F .o
 
 OBJS = mpas_atm_time_integration.o \
-       mpas_atm_boundaries.o
+       mpas_atm_boundaries.o \
+       mpas_atm_dissipation_models.o
 
 all: $(OBJS)
 
-mpas_atm_time_integration.o: mpas_atm_boundaries.o mpas_atm_iau.o
+mpas_atm_time_integration.o: mpas_atm_boundaries.o mpas_atm_iau.o mpas_atm_dissipation_models.o
 
 mpas_atm_boundaries.o:
 
diff --git a/src/core_atmosphere/dynamics/mpas_atm_boundaries.F b/src/core_atmosphere/dynamics/mpas_atm_boundaries.F
index 787e7719a1..6c19ed7931 100644
--- a/src/core_atmosphere/dynamics/mpas_atm_boundaries.F
+++ b/src/core_atmosphere/dynamics/mpas_atm_boundaries.F
@@ -395,18 +395,14 @@ subroutine mpas_atm_get_bdy_tend(clock, block, vertDim, horizDim, field, delta_t
         nullify(tend)
         call mpas_pool_get_array(lbc, 'lbc_'//trim(field), tend, 1)
 
-        MPAS_ACC_TIMER_START('mpas_atm_get_bdy_tend [ACC_data_xfer]')
         if (associated(tend)) then
-            !$acc enter data copyin(tend)
         else
             call mpas_pool_get_array(lbc, 'lbc_scalars', tend_scalars, 1)
-            !$acc enter data copyin(tend_scalars)
 
             ! Ensure the integer pointed to by idx_ptr is copied to the gpu device
             call mpas_pool_get_dimension(lbc, 'index_'//trim(field), idx_ptr)
             idx = idx_ptr
         end if
-        MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_tend [ACC_data_xfer]')
 
         !$acc parallel default(present)
         if (associated(tend)) then
@@ -426,13 +422,6 @@ subroutine mpas_atm_get_bdy_tend(clock, block, vertDim, horizDim, field, delta_t
         end if
         !$acc end parallel
 
-        MPAS_ACC_TIMER_START('mpas_atm_get_bdy_tend [ACC_data_xfer]')
-        if (associated(tend)) then
-            !$acc exit data delete(tend)
-        else
-            !$acc exit data delete(tend_scalars)
-        end if
-        MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_tend [ACC_data_xfer]')
 
     end subroutine mpas_atm_get_bdy_tend
 
@@ -533,9 +522,6 @@ subroutine mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, del
         ! query the field as a scalar constituent
         !
         if (associated(tend) .and. associated(state)) then
-            MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_2d [ACC_data_xfer]')
-            !$acc enter data copyin(tend, state)
-            MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_2d [ACC_data_xfer]')
 
             !$acc parallel default(present)
             !$acc loop gang vector collapse(2)
@@ -546,9 +532,6 @@ subroutine mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, del
             end do
             !$acc end parallel
 
-            MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_2d [ACC_data_xfer]')
-            !$acc exit data delete(tend, state)
-            MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_2d [ACC_data_xfer]')
         else
             call mpas_pool_get_array(lbc, 'lbc_scalars', tend_scalars, 1)
             call mpas_pool_get_array(lbc, 'lbc_scalars', state_scalars, 2)
@@ -556,10 +539,6 @@ subroutine mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, del
 
             idx=idx_ptr ! Avoid non-array pointer for OpenACC
 
-            MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_2d [ACC_data_xfer]')
-            !$acc enter data copyin(tend_scalars, state_scalars)
-            MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_2d [ACC_data_xfer]')
-
             !$acc parallel default(present)
             !$acc loop gang vector collapse(2)
             do i=1, horizDim+1
@@ -569,9 +548,6 @@ subroutine mpas_atm_get_bdy_state_2d(clock, block, vertDim, horizDim, field, del
             end do
             !$acc end parallel
 
-            MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_2d [ACC_data_xfer]')
-            !$acc exit data delete(tend_scalars, state_scalars)
-            MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_2d [ACC_data_xfer]')
         end if
 
     end subroutine mpas_atm_get_bdy_state_2d
@@ -652,10 +628,6 @@ subroutine mpas_atm_get_bdy_state_3d(clock, block, innerDim, vertDim, horizDim,
         call mpas_pool_get_array(lbc, 'lbc_'//trim(field), tend, 1)
         call mpas_pool_get_array(lbc, 'lbc_'//trim(field), state, 2)
 
-        MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_3d [ACC_data_xfer]')
-        !$acc enter data copyin(tend, state)
-        MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_3d [ACC_data_xfer]')
-
         !$acc parallel default(present)
         !$acc loop gang vector collapse(3)
         do i=1, horizDim+1
@@ -667,10 +639,6 @@ subroutine mpas_atm_get_bdy_state_3d(clock, block, innerDim, vertDim, horizDim,
         end do
         !$acc end parallel
 
-        MPAS_ACC_TIMER_START('mpas_atm_get_bdy_state_3d [ACC_data_xfer]')
-        !$acc exit data delete(tend, state)
-        MPAS_ACC_TIMER_STOP('mpas_atm_get_bdy_state_3d [ACC_data_xfer]')
-
     end subroutine mpas_atm_get_bdy_state_3d
 
 
diff --git a/src/core_atmosphere/dynamics/mpas_atm_dissipation_models.F b/src/core_atmosphere/dynamics/mpas_atm_dissipation_models.F
new file mode 100644
index 0000000000..450ed045c1
--- /dev/null
+++ b/src/core_atmosphere/dynamics/mpas_atm_dissipation_models.F
@@ -0,0 +1,1622 @@
+! Copyright (c) 2026, The University Corporation for Atmospheric Research (UCAR).
+!
+! Unless noted otherwise source code is licensed under the BSD license.
+! Additional copyright and license information can be found in the LICENSE file
+! distributed with this code, or at http://mpas-dev.github.com/license.html
+!
+
+#define COMMA ,
+#define DEBUG_WRITE(M) ! call mpas_log_write(M)
+
+module mpas_atm_dissipation_models
+
+   use mpas_kind_types, only : RKIND
+   use mpas_atmphys_constants
+   use mpas_constants
+   use mpas_log
+   use mpas_timekeeping, only : mpas_get_clock_time, mpas_get_time
+   use mpas_derived_types, only : MPAS_Clock_type, MPAS_Time_type, MPAS_NOW, MPAS_LOG_CRIT
+
+   logical, parameter :: les_test = .true., les_sas_test = .false.
+   !! real (kind=RKIND), parameter :: tke_heat_flux = 0.03  ! shear case from Moeng et al., first hour
+   ! real (kind=RKIND), parameter :: tke_heat_flux = 0.03
+   !  real (kind=RKIND), parameter :: tke_heat_flux = 0.0
+   !! real (kind=RKIND), parameter :: tke_drag_coefficient = 0.0013 ! ocean roughness length
+   ! real (kind=RKIND), parameter :: tke_drag_coefficient = 0.006
+   ! real (kind=RKIND), parameter :: tke_drag_coefficient = 0.0
+   real (kind=RKIND), parameter :: epsilon_bv = 1.e-06
+   ! real (kind=RKIND), parameter :: c_k = 0.1
+   real (kind=RKIND), parameter :: c_k = 0.25
+
+   integer, parameter :: LES_INVALID_OPT = -1
+
+   integer, parameter :: LES_MODEL_NONE = 0, &
+                         LES_MODEL_3D_SMAGORINSKY = 1, &
+                         LES_MODEL_PROGNOSTIC_15_ORDER = 2
+
+   integer, parameter :: LES_SURFACE_NONE = 0, &
+                         LES_SURFACE_SPECIFIED = 1, &
+                         LES_SURFACE_VARYING = 2
+
+contains
+
+
+   !-----------------------------------------------------------------------
+   !  routine les_model_from_string
+   !
+   !> \brief Converts an LES model option from a string to an integer parameter
+   !> \author Michael Duda
+   !> \date   13 February 2026
+   !> \details
+   !>  Given a string that contains the name of a valid LES model option, this
+   !>  routine returns an integer parameter corresponding to that option.
+   !>
+   !>  If the given string is not recognized as a valid LES model option, the
+   !>  integer parameter LES_INVALID_OPT is returned.
+   !
+   !-----------------------------------------------------------------------
+   pure function les_model_from_string(les_model_str) result(les_model_opt)
+
+      implicit none
+
+      ! Arguments
+      character(len=*), intent(in) :: les_model_str
+
+      ! Return value
+      integer :: les_model_opt
+
+
+      if (trim(les_model_str) == 'none') then
+         les_model_opt = LES_MODEL_NONE
+      else if (trim(les_model_str) == '3d_smagorinsky') then
+         les_model_opt = LES_MODEL_3D_SMAGORINSKY
+      else if (trim(les_model_str) == 'prognostic_1.5_order') then
+         les_model_opt = LES_MODEL_PROGNOSTIC_15_ORDER
+      else
+         les_model_opt = LES_INVALID_OPT
+      end if
+
+   end function les_model_from_string
+
+
+   !-----------------------------------------------------------------------
+   !  routine les_surface_from_string
+   !
+   !> \brief Converts an LES surface option from a string to an integer parameter
+   !> \author Michael Duda
+   !> \date   13 February 2026
+   !> \details
+   !>  Given a string that contains the name of a valid LES surface option, this
+   !>  routine returns an integer parameter corresponding to that option.
+   !>
+   !>  If the given string is not recognized as a valid LES surface option, the
+   !>  integer parameter LES_INVALID_OPT is returned.
+   !
+   !-----------------------------------------------------------------------
+   pure function les_surface_from_string(les_surface_str) result(les_surface_opt)
+
+      implicit none
+
+      ! Arguments
+      character(len=*), intent(in) :: les_surface_str
+
+      ! Return value
+      integer :: les_surface_opt
+
+
+      if (trim(les_surface_str) == 'none') then
+         les_surface_opt = LES_SURFACE_NONE
+      else if (trim(les_surface_str) == 'specified') then
+         les_surface_opt = LES_SURFACE_SPECIFIED
+      else if (trim(les_surface_str) == 'varying') then
+         les_surface_opt = LES_SURFACE_VARYING
+      else
+         les_surface_opt = LES_INVALID_OPT
+      end if
+
+   end function les_surface_from_string
+
+
+   subroutine smagorinsky_2d( kdiff, u, v, c_s, config_len_disp,                                 &
+                              deformation_coef_c2, deformation_coef_s2, deformation_coef_cs,     &
+                              invDt, h_mom_eddy_visc4, config_visc4_2dsmag, h_theta_eddy_visc4,  &
+                              cellStart, cellEnd, nEdgesOnCell, edgesOnCell,                     &
+                              nCells, nEdges                                                    )
+
+      use mpas_atm_dimensions  !  pull nVertLevels and maxEdges from here
+
+      implicit none
+
+      integer, intent(in) :: cellStart, cellEnd, nCells, nEdges
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(in) :: u
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(in) :: v
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: deformation_coef_c2
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: deformation_coef_s2
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: deformation_coef_cs
+      real (kind=RKIND), intent(in) :: c_s, config_len_disp, invDt, config_visc4_2dsmag
+      integer, dimension(nCells+1), intent(in) :: nEdgesOnCell
+      integer, dimension(maxEdges,nCells+1), intent(in) :: edgesOnCell
+
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(out) :: kdiff
+      real (kind=RKIND), intent(out) :: h_mom_eddy_visc4, h_theta_eddy_visc4
+
+      ! local variables
+
+      integer :: iCell, iEdge, k
+      real (kind=RKIND), dimension(nVertLevels) :: d_11, d_22, d_12, dudx, dudy, dvdx, dvdy
+
+
+      DEBUG_WRITE(' begin smagorinsky_2d ')
+
+      !$acc enter data create(dudx, dudy, dvdx, dvdy)
+      !$acc enter data create(d_11, d_22, d_12)
+
+      !$acc parallel default(present)
+
+      !$acc loop gang worker private(dudx, dudy, dvdx, dvdy, d_11, d_22, d_12)
+      do iCell = cellStart,cellEnd
+
+         !$acc loop vector
+         do k = 1, nVertLevels
+            dudx(k) = 0.0_RKIND
+            dudy(k) = 0.0_RKIND
+            dvdx(k) = 0.0_RKIND
+            dvdy(k) = 0.0_RKIND
+         end do
+
+         !$acc loop seq
+         do iEdge=1,nEdgesOnCell(iCell)
+            !$acc loop vector
+            do k=1,nVertLevels
+               dudx(k) = dudx(k) + deformation_coef_c2(iEdge,iCell)*u(k,EdgesOnCell(iEdge,iCell)) &
+                                 - deformation_coef_cs(iEdge,iCell)*v(k,EdgesOnCell(iEdge,iCell))
+               dudy(k) = dudy(k) + deformation_coef_cs(iEdge,iCell)*u(k,EdgesOnCell(iEdge,iCell)) &
+                                 - deformation_coef_s2(iEdge,iCell)*v(k,EdgesOnCell(iEdge,iCell))
+               dvdx(k) = dvdx(k) + deformation_coef_cs(iEdge,iCell)*u(k,EdgesOnCell(iEdge,iCell)) &
+                                 + deformation_coef_c2(iEdge,iCell)*v(k,EdgesOnCell(iEdge,iCell))
+               dvdy(k) = dvdy(k) + deformation_coef_s2(iEdge,iCell)*u(k,EdgesOnCell(iEdge,iCell)) &
+                                 + deformation_coef_cs(iEdge,iCell)*v(k,EdgesOnCell(iEdge,iCell))
+            end do
+         end do
+
+!DIR$ IVDEP
+         !$acc loop vector
+         do k=1, nVertLevels
+            ! here is the Smagorinsky formulation,
+            ! followed by imposition of an upper bound on the eddy viscosity
+            d_11(k) = 2*dudx(k)
+            d_22(k) = 2*dvdy(k)
+            d_12(k) = dudy(k) + dvdx(k)
+            kdiff(k,iCell) = (c_s * config_len_disp)**2 * sqrt(0.25*(d_11(k)-d_22(k))**2 + d_12(k)**2)
+            kdiff(k,iCell) = min(kdiff(k,iCell),(0.01*config_len_disp**2) * invDt)
+         end do
+      end do
+
+      !$acc end parallel
+
+      !$acc exit data delete(dudx, dudy, dvdx, dvdy)
+      !$acc exit data delete(d_11, d_22, d_12)
+
+      h_mom_eddy_visc4   = config_visc4_2dsmag * config_len_disp**3
+      h_theta_eddy_visc4 = h_mom_eddy_visc4
+
+      DEBUG_WRITE(' exiting smagorinsky_2d ')
+
+   end subroutine smagorinsky_2d
+
+!---------------------------------------
+
+   subroutine les_models( les_model_opt, les_surface_opt, dynamics_substep, eddy_visc_horz, eddy_visc_vert, &
+                          u, v, uCell, vCell,                                                &
+                          w, c_s, bv_freq2, zgrid, config_len_disp,                          &
+                          deformation_coef_c2, deformation_coef_s2, deformation_coef_cs,     &
+                          deformation_coef_c, deformation_coef_s, prandtl_3d_inv,            &
+                          invDt, h_mom_eddy_visc4, config_visc4_2dsmag, h_theta_eddy_visc4,  &
+                          scalars, tend_scalars, index_tke, rho_zz, meshScalingDel2,         &
+                          cellStart, cellEnd, nEdgesOnCell, edgesOnCell, cellsOnEdge,        &
+                          nCells, nEdges, nVertLevels, maxEdges, num_scalars                )
+
+      implicit none
+
+      integer, intent(in) :: les_model_opt
+      integer, intent(in) :: les_surface_opt
+
+      integer, intent(in) :: cellStart, cellEnd, nCells, nEdges, nVertLevels, maxEdges, index_tke, num_scalars
+      integer, intent(in) :: dynamics_substep
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(in) :: u
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(in) :: v
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: uCell
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: vCell
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: bv_freq2
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(out) :: prandtl_3d_inv
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: rho_zz
+      real (kind=RKIND), dimension(num_scalars,nVertLevels,nCells+1), intent(inout) :: scalars
+      real (kind=RKIND), dimension(num_scalars,nVertLevels,nCells+1) :: tend_scalars
+      real (kind=RKIND), dimension(nVertLevels+1,nCells+1), intent(in) :: w
+      real (kind=RKIND), dimension(nVertLevels+1,nCells+1), intent(in) :: zgrid
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: deformation_coef_c2
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: deformation_coef_s2
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: deformation_coef_cs
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: deformation_coef_c
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: deformation_coef_s
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: meshScalingDel2
+      real (kind=RKIND), intent(in) :: c_s, config_len_disp, invDt, config_visc4_2dsmag
+      integer, dimension(nCells+1), intent(in) :: nEdgesOnCell
+      integer, dimension(maxEdges,nCells+1), intent(in) :: edgesOnCell
+      integer, dimension(2,nEdges+1), intent(in) :: cellsOnEdge
+
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(out) :: eddy_visc_horz, eddy_visc_vert
+      real (kind=RKIND), intent(out) :: h_mom_eddy_visc4, h_theta_eddy_visc4
+
+      ! local variables
+
+      integer :: iCell, iEdge, k, ie, cell1, cell2
+      real (kind=RKIND), dimension(nVertLevels) :: d_11, d_22, d_33, d_12, d_13, d_23
+      real (kind=RKIND), dimension(nVertLevels) :: dudx, dudy, dvdx, dvdy
+      real (kind=RKIND), dimension(nVertLevels+1) :: dwdx, dwdy
+      real (kind=RKIND), dimension(nVertLevels) :: dudz, dvdz, dwdz
+      real (kind=RKIND) :: rdz, def2, pr_inv, wk
+      real (kind=RKIND) :: shear_production, buoyancy, dissipation, delta_z, delta_s, bv, tke_length, diss_length
+      real (kind=RKIND) :: l_horizontal, l_vertical, c_dissipation
+      real (kind=RKIND) :: prandtl_horizontal_inv
+      real (kind=RKIND) :: eddy_visc_h, eddy_visc_v
+
+      logical, parameter :: test_tke=.true.
+      ! real (kind=RKIND), parameter :: epsilon_bv = 1.e-06
+
+
+      !$acc enter data create(dudx, dudy, dvdx, dvdy, dwdx, dwdy, dudz, dvdz, dwdz)
+      !$acc enter data create(d_11, d_22, d_33, d_12, d_13, d_23)
+
+      pr_inv = 1./prandtl
+
+      ! set up coefficients for 4th-order horizontal background filter
+
+      h_mom_eddy_visc4   = config_visc4_2dsmag * config_len_disp**3
+      h_theta_eddy_visc4 = h_mom_eddy_visc4
+
+      !$acc parallel default(present)
+
+      !$acc loop gang worker private(dudx, dudy, dvdx, dvdy, dwdx, dwdy, dudz, dvdz, dwdz, d_11, d_22, d_33, d_12, d_13, d_23)
+      do iCell = cellStart,cellEnd
+
+         !$acc loop vector
+         do k = 1, nVertLevels
+            dudx(k) = 0.0_RKIND
+            dudy(k) = 0.0_RKIND
+            dvdx(k) = 0.0_RKIND
+            dvdy(k) = 0.0_RKIND
+
+            dudz(k) = 0.0_RKIND
+            dvdz(k) = 0.0_RKIND
+            dwdz(k) = 0.0_RKIND
+         end do
+
+         !$acc loop vector
+         do k = 1, nVertLevels+1
+            dwdx(k) = 0.0_RKIND
+            dwdy(k) = 0.0_RKIND
+         end do
+
+         !$acc loop seq
+         do iEdge=1,nEdgesOnCell(iCell)
+
+            ie = EdgesOnCell(iEdge,iCell)
+            cell1 = cellsOnEdge(1,ie)
+            cell2 = cellsOnEdge(2,ie)
+
+            !$acc loop vector
+            do k=1,nVertLevels
+               dudx(k) = dudx(k) + deformation_coef_c2(iEdge,iCell)*u(k,ie) &
+                                 - deformation_coef_cs(iEdge,iCell)*v(k,ie)
+               dudy(k) = dudy(k) + deformation_coef_cs(iEdge,iCell)*u(k,ie) &
+                                 - deformation_coef_s2(iEdge,iCell)*v(k,ie)
+               dvdx(k) = dvdx(k) + deformation_coef_cs(iEdge,iCell)*u(k,ie) &
+                                 + deformation_coef_c2(iEdge,iCell)*v(k,ie)
+               dvdy(k) = dvdy(k) + deformation_coef_s2(iEdge,iCell)*u(k,ie) &
+                                 + deformation_coef_cs(iEdge,iCell)*v(k,ie)
+            end do
+
+            !$acc loop vector
+            do k=1,nVertLevels+1
+               wk = 0.5*(w(k,cell1)+w(k,cell2))
+               dwdx(k) = dwdx(k) + deformation_coef_c(iEdge,iCell)*wk
+               dwdy(k) = dwdy(k) + deformation_coef_s(iEdge,iCell)*wk
+            end do
+
+         end do
+
+         !$acc loop vector
+         do k=1,nVertLevels
+            rdz = 1./(zgrid(k+1,iCell)-zgrid(k,iCell))
+            dwdz(k) = (w(k+1,iCell)-w(k,iCell))*rdz
+         end do
+
+         !$acc loop vector
+         do k=2,nVertLevels-1
+            rdz = 1./(zgrid(k+2,iCell)+zgrid(k+1,iCell)-zgrid(k,iCell)-zgrid(k-1,iCell))
+            dudz(k) = (uCell(k+1,iCell)-uCell(k-1,iCell))*rdz
+            dvdz(k) = (vCell(k+1,iCell)-vCell(k-1,iCell))*rdz
+         end do
+
+         k = 1
+         rdz = 1./(zgrid(k+1,iCell)-zgrid(k,iCell))
+         dudz(k) = (uCell(k+1,iCell)-uCell(k,iCell))*rdz
+         dvdz(k) = (vCell(k+1,iCell)-vCell(k,iCell))*rdz
+
+         k = nVertLevels-1
+         rdz = 1./(zgrid(k+1,iCell)-zgrid(k,iCell))
+         dudz(k+1) = (uCell(k+1,iCell)-uCell(k,iCell))*rdz
+         dvdz(k+1) = (vCell(k+1,iCell)-vCell(k,iCell))*rdz
+ 
+         !$acc loop vector
+         do k=1, nVertLevels
+            d_11(k) = 2.*dudx(k)
+            d_22(k) = 2.*dvdy(k)
+            d_33(k) = 2.*dwdz(k)
+            d_12(k) = dudy(k) + dvdx(k)
+            d_13(k) = dwdx(k) + dudz(k)
+            d_23(k) = dwdy(k) + dvdz(k)
+         end do
+
+         if (les_model_opt == LES_MODEL_3D_SMAGORINSKY) then
+
+            !$acc loop vector
+            do k=1, nVertLevels
+               def2 = 0.5*(d_11(k)**2 + d_22(k)**2 + d_33(k)**2) + d_12(k)**2 + d_13(k)**2 + d_23(k)**2
+               eddy_visc_horz(k,iCell) = (c_s * config_len_disp)**2 * sqrt(max(0.,def2 - pr_inv*bv_freq2(k,iCell)))
+               eddy_visc_horz(k,iCell) = min(eddy_visc_horz(k,iCell),(0.01*config_len_disp**2) * invDt)
+               delta_z = zgrid(k+1,iCell)-zgrid(k,iCell)
+               eddy_visc_vert(k,iCell) = (c_s * delta_z)**2 * sqrt(max(0.,def2 - pr_inv*bv_freq2(k,iCell)))
+               ! eddy_visc_vert(k,iCell) = eddy_visc_horz(k,iCell)
+            end do
+
+         else if (les_model_opt == LES_MODEL_PROGNOSTIC_15_ORDER) then
+
+            !$acc loop vector
+            do k=1,nVertLevels  ! bound the tke here, currently hardwired
+               ! scalars(index_tke,k,iCell) = max(0.,min(100.,scalars(index_tke,k,iCell)))
+               scalars(index_tke,k,iCell) = max(0.,scalars(index_tke,k,iCell))
+            end do
+
+            !$acc loop vector
+            do k=1,nVertLevels
+
+               delta_z = zgrid(k+1,iCell)-zgrid(k,iCell)
+               delta_s = ((config_len_disp**2)*delta_z)**(1./3.)
+               bv = max( sqrt(abs(bv_freq2(k,iCell))), epsilon_bv )
+               tke_length = delta_s
+               ! isentropic mixing formulation
+               if(bv_freq2(k,iCell) .gt. 1.e-06) &
+                  tke_length = 0.76*sqrt(scalars(index_tke,k,iCell))/bv
+                  tke_length = min(tke_length, delta_z)
+               diss_length = min(delta_s,max(tke_length,0.01*delta_s))
+               if(bv_freq2(k,iCell) <= 0) diss_length = delta_s
+
+               ! non-isotropic mixing
+
+               l_horizontal = config_len_disp
+               l_vertical = min(delta_z,tke_length)
+               if(bv_freq2(k,iCell) <= 0) diss_length = delta_z
+
+               ! isotropic mixing
+
+               ! l_horizontal = min(delta_s,tke_length)
+               ! if(bv_freq2(k,iCell) <= 0) diss_length = delta_s
+               ! l_vertical = l_horizontal
+
+               ! eddy viscocities set here if we are running the 1.5 order prognostic tke scheme
+               eddy_visc_h = c_k*l_horizontal*sqrt(scalars(index_tke,k,iCell))
+               eddy_visc_h = min(eddy_visc_h,(0.01*config_len_disp**2) * invDt)
+               eddy_visc_v = c_k*l_vertical*sqrt(scalars(index_tke,k,iCell))
+               eddy_visc_v = min(eddy_visc_v,(0.01*delta_z**2) * invDt)
+
+               eddy_visc_horz(k,iCell) = eddy_visc_h
+               eddy_visc_vert(k,iCell) = eddy_visc_v
+
+               ! terms for the prognostic tke integration
+
+               shear_production = eddy_visc_h*(d_11(k)**2 + d_22(k)**2 + d_12(k)**2) &
+                                 +eddy_visc_v*(d_33(k)**2 + d_13(k)**2 + d_23(k)**2)
+
+               buoyancy = -eddy_visc_v*bv_freq2(k,iCell)
+
+               ! dissipation
+
+               c_dissipation = 1.9*c_k + max( 0.0, 0.93 - 1.9*c_k )*diss_length/delta_s
+               !  if( (k.eq. 1) .or. (k.eq.nVertLevels) ) c_dissipation = 3.9
+
+               dissipation = -c_dissipation*(scalars(index_tke,k,iCell)**(1.5))/diss_length
+
+               ! computing eddy viscosities *********
+
+               prandtl_horizontal_inv = 3.
+               prandtl_3d_inv(k,iCell) = 1.0+(2.0*l_vertical/delta_z)
+
+
+               ! RHS term for the subgrid ke.
+
+               if(dynamics_substep == 1) &
+                    tend_scalars(index_tke,k,iCell) = rho_zz(k,iCell)*( shear_production + buoyancy + dissipation )
+
+            end do
+
+         else
+
+!MGD            call mpas_log_write(' in les_models, no les scheme for '//trim(config_les_model), messageType=MPAS_LOG_CRIT)
+
+         end if ! end of les_model_opt test
+
+      end do ! loop over all owned cells (columns)
+
+      !$acc end parallel
+
+      !$acc exit data delete(dudx, dudy, dvdx, dvdy, dwdx, dwdy, dudz, dvdz, dwdz)
+      !$acc exit data delete(d_11, d_22, d_33, d_12, d_13, d_23)
+
+      DEBUG_WRITE(' les_models ')
+
+   end subroutine les_models
+
+!---------------------------------------
+
+   subroutine calculate_n2( bn2, theta_m, exner, pressure_b, pp, zgrid, scalars, index_qv, index_qc, qtot, &
+                            cellStart, cellEnd, nCells)
+
+      use mpas_atm_dimensions  !  pull nVertLevels and num_scalars from here
+
+      integer, intent(in) :: cellStart, cellEnd, nCells
+      integer, intent(in) :: index_qv, index_qc
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(out) :: bn2
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: theta_m, exner, pressure_b, pp, qtot
+      real (kind=RKIND), dimension(nVertLevels+1,nCells+1), intent(in) :: zgrid
+      real (kind=RKIND), dimension(num_scalars,nVertLevels,nCells+1), intent(in) :: scalars
+! local
+      integer :: iCell, k
+      real (kind=RKIND) :: dz, rdz, esw, p
+      real (kind=RKIND), parameter :: qc_cr = 0.00001  ! in kg/kg
+      real (kind=RKIND), dimension(nVertLevels) :: theta, qvsw, temp, coefa
+      logical :: dry_bv_frequency
+
+
+      DEBUG_WRITE(' begin BV frequency calculations ')
+
+      !$acc enter data create(theta, temp, qvsw, coefa)
+
+      !$acc parallel default(present)
+
+      !$acc loop gang worker private(theta, temp, qvsw, coefa)
+      do iCell = cellStart,cellEnd
+
+!DIR$ IVDEP
+         !$acc loop vector
+         do k=1, nVertLevels
+
+            theta(k) = theta_m(k,iCell) / (1._RKIND + rvord * scalars(index_qv,k,iCell))
+
+            temp(k) = exner(k,iCell) * theta(k)
+
+            p = pressure_b(k,iCell) + pp(k,iCell)
+            esw = 1000. * svp1 * exp(svp2 * (temp(k) - svpt0) / (temp(k) - svp3))
+            if (p < esw) esw = p * 0.99     ! fix for pressure < esw
+            qvsw(k) = ep_2 * esw / (p - esw)
+
+            coefa(k) = ( 1.0 + xlv * qvsw(k)/ R_d / temp(k) ) / &
+               ( 1.0 + xlv * xlv *qvsw(k) / Cp / R_v / temp(k) / temp(k) )
+
+         end do
+
+         !$acc loop vector
+         do k=2, nVertLevels-1
+            dz = 0.5 * (zgrid(k+2,iCell)+zgrid(k+1,iCell)) - 0.5 * (zgrid(k,iCell)+zgrid(k-1,iCell))
+            rdz = 1.0/dz
+
+            ! if ( scalars(index_qc,k,iCell) < qc_cr ) then
+            !   ! Dry Brunt-Vaisala frequency
+            !   bn2(k,iCell) = gravity * ((theta(k+1) - theta(k-1) ) / theta(k)  / dz  &
+            !                + rvord * (scalars(index_qv,k+1,iCell) - scalars(index_qv,k-1,iCell)) / dz &
+            !               - ( qtot(k+1, iCell) - qtot(k-1, iCell) ) / dz )
+            ! else
+            !   ! Moist Brunt-Vaisala frequency according to Durran and Klemp (1982) Eq. 36
+            !   bn2(k,iCell) = gravity * ( coefa(k) * ((theta(k+1) - theta(k-1) ) / theta(k) / dz &
+            !               + xlv / cp / temp(k) * ( qvsw(k+1) - qvsw(k-1)) / dz ) &
+            !               - ( qtot(k+1, iCell) - qtot(k-1, iCell) ) / dz )
+            ! endif
+
+            dry_bv_frequency = .true.
+            if(index_qc .gt. 0) then  ! if moist simulation, qc exists
+               if ( scalars(index_qc,k,iCell) .ge. qc_cr ) dry_bv_frequency = .false.
+            end if
+
+            if (dry_bv_frequency) then
+               ! Dry Brunt-Vaisala frequency
+               bn2(k,iCell) = gravity * ((theta(k+1) - theta(k-1) ) / theta(k)  * rdz  &
+                            + rvord * (scalars(index_qv,k+1,iCell) - scalars(index_qv,k-1,iCell)) * rdz &
+                           - ( qtot(k+1, iCell) - qtot(k-1, iCell) ) * rdz )
+            else
+               ! Moist Brunt-Vaisala frequency according to Durran and Klemp (1982) Eq. 36
+               bn2(k,iCell) = gravity * ( coefa(k) * ((theta(k+1) - theta(k-1) ) / theta(k) * rdz &
+                           + xlv / cp / temp(k) * ( qvsw(k+1) - qvsw(k-1)) * rdz ) &
+                           - ( qtot(k+1, iCell) - qtot(k-1, iCell) ) * rdz )
+            endif
+
+         end do
+
+         bn2(1,iCell) = bn2(2,iCell)
+         bn2(nVertLevels,iCell) = bn2(nVertLevels-1,iCell)
+
+      end do
+
+      !$acc end parallel
+
+      !$acc exit data delete(theta, temp, qvsw, coefa)
+
+      DEBUG_WRITE(' exiting BV frequency calculations ')
+
+   end subroutine calculate_n2
+
+!---------------------------------------
+
+   subroutine u_dissipation_3d( edgeStart, edgeEnd, edgeSolveStart, edgeSolveEnd, vertexStart, vertexEnd,    &
+                                cellStart, cellEnd, nCells, nEdges, nVertices, vertexDegree,                 &
+                                cellsOnEdge, verticesOnEdge, edgesOnCell, edgesOnVertex,                     &
+                                nEdgesOnCell, edgesOnCell_sign, edgesOnVertex_sign,                          &
+                                invAreaCell, invAreaTriangle, invDvEdge, invDcEdge,                          &
+                                angleEdge, dcEdge, dvEdge, meshScalingDel2, meshScalingDel4,                 &
+                                config_mix_full, h_mom_eddy_visc4, v_mom_eddy_visc2,                         &
+                                config_del4u_div_factor, zgrid,                                              &
+                                eddy_visc_horz, eddy_visc_vert, zz, rdzu, rdzw,                              &
+                                fzm, fzp, les_model_opt, les_surface_opt,                                    &
+                                config_surface_drag_coefficient,                                             &
+                                delsq_u, delsq_vorticity, delsq_divergence,                                  &
+                                u, v, divergence, vorticity, rho_edge, rho_zz, u_init, v_init, ustm,         &
+                                tend_u_euler                                                                )
+
+      use mpas_atm_dimensions  !  pull nVertLevels and maxEdges from here
+
+      implicit none
+
+      integer, intent(in) :: edgeStart, edgeEnd, edgeSolveStart, edgeSolveEnd
+      integer, intent(in) :: vertexStart, vertexEnd, vertexDegree
+      integer, intent(in) :: cellStart, cellEnd
+      integer, intent(in) :: nCells, nEdges, nVertices
+      logical, intent(in) :: config_mix_full
+
+      integer, intent(in) :: les_model_opt
+      integer, intent(in) :: les_surface_opt
+
+      integer, dimension(2,nEdges+1), intent(in) :: cellsOnEdge
+      integer, dimension(2,nEdges+1), intent(in) :: verticesOnEdge
+      integer, dimension(maxEdges,nCells+1), intent(in) :: edgesOnCell
+      integer, dimension(nCells+1), intent(in) :: nEdgesOnCell
+      integer, dimension(vertexDegree,nVertices+1), intent(in) :: edgesOnVertex
+
+      real (kind=RKIND), intent(in) :: h_mom_eddy_visc4
+      real (kind=RKIND), intent(in) :: v_mom_eddy_visc2
+      real (kind=RKIND), intent(in) :: config_del4u_div_factor
+      real (kind=RKIND), intent(in) :: config_surface_drag_coefficient
+
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: edgesOnCell_sign
+      real (kind=RKIND), dimension(vertexDegree,nVertices+1), intent(in) :: edgesOnVertex_sign
+      real (kind=RKIND), dimension(nVertices+1), intent(in) :: invAreaTriangle
+      real (kind=RKIND), dimension(nCells+1), intent(in) :: invAreaCell
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: invDcEdge
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: invDvEdge
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: angleEdge
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: dcEdge
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: dvEdge
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: meshScalingDel2
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: meshScalingDel4
+      real (kind=RKIND), dimension(nVertLevels+1,nCells+1), intent(in) :: zgrid
+
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(in) :: u
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(in) :: v
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: divergence
+      real (kind=RKIND), dimension(nVertLevels,nVertices+1), intent(in) :: vorticity
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(in) :: rho_edge
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: eddy_visc_horz
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: eddy_visc_vert
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: zz
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: rho_zz
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: rdzu
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: rdzw
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: fzm
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: fzp
+
+
+      !  scratch space from calling routine
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1) :: delsq_u
+      real (kind=RKIND), dimension(nVertLevels,nVertices+1) :: delsq_vorticity
+      real (kind=RKIND), dimension(nVertLevels,nCells+1) :: delsq_divergence
+
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: u_init, v_init
+      real (kind=RKIND), dimension(:), intent(in) :: ustm
+
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(out) :: tend_u_euler
+
+      ! local variables
+
+      integer :: iEdge, cell1, cell2, vertex1, vertex2, iVertex, iCell, i, k
+      real (kind=RKIND) :: r_dc, r_dv, u_diffusion, u_diffusion_les, kdiffu, r, edge_sign, u_mix_scale
+      real (kind=RKIND) :: z1, z2, z3, z4, zm, z0, zp
+      real (kind=RKIND), dimension(nVertLevels) :: u_mix
+
+      real (kind=RKIND), dimension(nVertLevels+1) :: turb_vflux
+      real (kind=RKIND) :: rho_k_cell1, rho_k_cell2, rho_k_at_w
+      real (kind=RKIND) :: zz_cell1, zz_cell2, zz_at_w
+      real (kind=RKIND) :: ust_edge
+
+      real (kind=RKIND) :: velocity_magnitude
+      real (kind=RKIND) :: tau_12_factor
+
+
+      DEBUG_WRITE(' begin u_dissipation_3d ')
+      DEBUG_WRITE(' 4th order hyperviscosity is $r ' COMMA realArgs=(/h_mom_eddy_visc4/))
+      DEBUG_WRITE(' 4th order divergence factor is $r ' COMMA realArgs=(/config_del4u_div_factor/))
+
+!$OMP BARRIER
+
+      ! del^4 horizontal filter.  We compute this as del^2 ( del^2 (u) ).
+      ! First, storage to hold the result from the first del^2 computation.
+
+      !$acc enter data create(u_mix)
+      !$acc enter data create(turb_vflux)
+
+      !$acc parallel default(present)
+
+      tau_12_factor = 0.0
+      if(les_model_opt /= LES_MODEL_NONE) tau_12_factor = 1.0
+
+      !$acc loop gang worker
+      do iEdge=edgeStart,edgeEnd
+         cell1 = cellsOnEdge(1,iEdge)
+         cell2 = cellsOnEdge(2,iEdge)
+         vertex1 = verticesOnEdge(1,iEdge)
+         vertex2 = verticesOnEdge(2,iEdge)
+         r_dc = invDcEdge(iEdge)
+         r_dv = min(invDvEdge(iEdge), 4*invDcEdge(iEdge))
+
+         !$acc loop vector
+         do k = 1, nVertLevels
+            delsq_u(k,iEdge) = 0.0_RKIND
+         end do
+
+!DIR$ IVDEP
+         !$acc loop vector
+         do k=1,nVertLevels
+
+            ! Compute diffusion, computed as \nabla divergence - k \times \nabla vorticity
+            !                    only valid for h_mom_eddy_visc4 == constant
+            u_diffusion =   ( divergence(k,cell2)  - divergence(k,cell1) ) * r_dc  &
+                           -( vorticity(k,vertex2) - vorticity(k,vertex1) ) * r_dv
+            ! for LES models we need 2 times the gradient of divergence, in contrast to what is
+            ! saved and used to calculate the 4th-order horizontal filter
+            u_diffusion_les = u_diffusion + tau_12_factor * ( divergence(k,cell2)  - divergence(k,cell1)  ) * r_dc
+
+            delsq_u(k,iEdge) = delsq_u(k,iEdge) + u_diffusion
+
+            kdiffu = 0.5*(eddy_visc_horz(k,cell1)+eddy_visc_horz(k,cell2))
+
+            ! include 2nd-order diffusion here
+            tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) &
+                                    + rho_edge(k,iEdge)* kdiffu * u_diffusion_les * meshScalingDel2(iEdge)
+
+         end do
+      end do
+
+      !$acc end parallel
+
+      if (h_mom_eddy_visc4 > 0.0) then  ! 4th order mixing is active
+
+!$OMP BARRIER
+
+         !$acc parallel default(present)
+
+         !$acc loop gang worker
+         do iVertex=vertexStart,vertexEnd
+            !$acc loop vector
+            do k = 1, nVertLevels
+               delsq_vorticity(k,iVertex) = 0.0_RKIND
+            end do
+
+            !$acc loop seq
+            do i=1,vertexDegree
+               iEdge = edgesOnVertex(i,iVertex)
+               edge_sign = invAreaTriangle(iVertex) * dcEdge(iEdge) * edgesOnVertex_sign(i,iVertex)
+
+               !$acc loop vector
+               do k=1,nVertLevels
+                  delsq_vorticity(k,iVertex) = delsq_vorticity(k,iVertex) + edge_sign * delsq_u(k,iEdge)
+               end do
+            end do
+         end do
+
+         !$acc loop gang worker
+         do iCell=cellStart,cellEnd
+            !$acc loop vector
+            do k = 1, nVertLevels
+               delsq_divergence(k,iCell) = 0.0_RKIND
+            end do
+
+            r = invAreaCell(iCell)
+
+            !$acc loop seq
+            do i=1,nEdgesOnCell(iCell)
+               iEdge = edgesOnCell(i,iCell)
+               edge_sign = r * dvEdge(iEdge) * edgesOnCell_sign(i,iCell)
+
+               !$acc loop vector
+               do k=1,nVertLevels
+                  delsq_divergence(k,iCell) = delsq_divergence(k,iCell) + edge_sign * delsq_u(k,iEdge)
+               end do
+            end do
+         end do
+
+         !$acc end parallel
+
+!$OMP BARRIER
+
+         !$acc parallel default(present)
+
+         !$acc loop gang worker
+         do iEdge=edgeSolveStart,edgeSolveEnd
+            cell1 = cellsOnEdge(1,iEdge)
+            cell2 = cellsOnEdge(2,iEdge)
+            vertex1 = verticesOnEdge(1,iEdge)
+            vertex2 = verticesOnEdge(2,iEdge)
+
+            u_mix_scale = meshScalingDel4(iEdge)*h_mom_eddy_visc4
+            r_dc = u_mix_scale * config_del4u_div_factor * invDcEdge(iEdge)
+            r_dv = u_mix_scale * min(invDvEdge(iEdge), 4*invDcEdge(iEdge))
+
+!DIR$ IVDEP
+            !$acc loop vector
+            do k=1,nVertLevels
+
+               ! Compute diffusion, computed as \nabla divergence - k \times \nabla vorticity
+               !                    only valid for h_mom_eddy_visc4 == constant
+               !
+               ! Here, we scale the diffusion on the divergence part a factor of config_del4u_div_factor
+               !    relative to the rotational part.  The stability constraint on the divergence component is much less
+               !    stringent than the rotational part, and this flexibility may be useful.
+               !
+               u_diffusion =  rho_edge(k,iEdge) *  ( ( delsq_divergence(k,cell2)  - delsq_divergence(k,cell1) ) * r_dc  &
+                                                    -( delsq_vorticity(k,vertex2) - delsq_vorticity(k,vertex1) ) * r_dv )
+               tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) - u_diffusion
+
+            end do
+         end do
+
+         !$acc end parallel
+
+      end if ! 4th order mixing is active
+
+      !
+      !  vertical mixing for u - 2nd order filter in physical (z) space
+      !
+      if ( v_mom_eddy_visc2 > 0.0 ) then
+
+         if (config_mix_full) then  ! mix full state
+
+            !$acc parallel default(present)
+
+            !$acc loop gang worker
+            do iEdge=edgeSolveStart,edgeSolveEnd
+
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
+
+               !$acc loop vector
+               do k=2,nVertLevels-1
+
+                  z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2))
+                  z2 = 0.5*(zgrid(k  ,cell1)+zgrid(k  ,cell2))
+                  z3 = 0.5*(zgrid(k+1,cell1)+zgrid(k+1,cell2))
+                  z4 = 0.5*(zgrid(k+2,cell1)+zgrid(k+2,cell2))
+
+                  zm = 0.5*(z1+z2)
+                  z0 = 0.5*(z2+z3)
+                  zp = 0.5*(z3+z4)
+
+                  tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &
+                                     (u(k+1,iEdge)-u(k  ,iEdge))/(zp-z0)                      &
+                                    -(u(k  ,iEdge)-u(k-1,iEdge))/(z0-zm) )/(0.5*(zp-zm))
+               end do
+            end do
+
+            !$acc end parallel
+
+         else  ! idealized cases where we mix on the perturbation from the initial 1-D state
+
+            !$acc parallel default(present)
+
+            !$acc loop gang worker private(u_mix)
+            do iEdge=edgeSolveStart,edgeSolveEnd
+
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
+
+               !$acc loop vector
+               do k=1,nVertLevels
+                  u_mix(k) = u(k,iEdge) - u_init(k) * cos( angleEdge(iEdge) ) &
+                                        + v_init(k) * sin( angleEdge(iEdge) )
+               end do
+
+               !$acc loop vector
+               do k=2,nVertLevels-1
+
+                  z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2))
+                  z2 = 0.5*(zgrid(k  ,cell1)+zgrid(k  ,cell2))
+                  z3 = 0.5*(zgrid(k+1,cell1)+zgrid(k+1,cell2))
+                  z4 = 0.5*(zgrid(k+2,cell1)+zgrid(k+2,cell2))
+
+                  zm = 0.5*(z1+z2)
+                  z0 = 0.5*(z2+z3)
+                  zp = 0.5*(z3+z4)
+
+                  tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &
+                                     (u_mix(k+1)-u_mix(k  ))/(zp-z0)                      &
+                                    -(u_mix(k  )-u_mix(k-1))/(z0-zm) )/(0.5*(zp-zm))
+               end do
+            end do
+
+            !$acc end parallel
+
+         end if  ! mix perturbation state
+
+      end if  ! vertical mixing of horizontal momentum for les formulation
+
+      if ( les_model_opt /= LES_MODEL_NONE ) then
+
+         !$acc parallel default(present)
+
+         !$acc loop gang worker private(turb_vflux)
+         do iEdge=edgeSolveStart,edgeSolveEnd
+
+            cell1 = cellsOnEdge(1,iEdge)
+            cell2 = cellsOnEdge(2,iEdge)
+            turb_vflux(nVertlevels+1) = 0.0_RKIND  !  no turbulent flux out of the domain
+            turb_vflux(1) = 0.0_RKIND  !  lower bc flux handled where ???
+
+            !$acc loop vector
+            do k=2,nVertLevels
+               rho_k_cell1 =  fzm(k)*rho_zz(k  ,cell1)*zz(k  ,cell1)*eddy_visc_vert(k  ,cell1) &
+                             +fzp(k)*rho_zz(k-1,cell1)*zz(k-1,cell1)*eddy_visc_vert(k-1,cell1)
+               rho_k_cell2 =  fzm(k)*rho_zz(k  ,cell2)*zz(k  ,cell2)*eddy_visc_vert(k  ,cell2) &
+                             +fzp(k)*rho_zz(k-1,cell2)*zz(k-1,cell2)*eddy_visc_vert(k-1,cell2)
+               rho_k_at_w = 0.5*(rho_k_cell1+rho_k_cell2)
+
+               zz_cell1 = fzm(k)*zz(k,cell1)+fzp(k)*zz(k-1,cell1)
+               zz_cell2 = fzm(k)*zz(k,cell2)+fzp(k)*zz(k-1,cell2)
+               zz_at_w = 0.5*(zz_cell1+zz_cell2)
+               turb_vflux(k) = - rho_k_at_w*zz_at_w*rdzu(k)*(u(k,iEdge)-u(k-1,iEdge))
+            end do
+
+            if( les_surface_opt == LES_SURFACE_SPECIFIED ) then
+               velocity_magnitude = sqrt(u(1,iEdge)**2 + v(1,iEdge)**2)
+               turb_vflux(1) = -rho_edge(1,iEdge)*config_surface_drag_coefficient*u(1,iEdge)*velocity_magnitude
+               turb_vflux(nVertLevels+1) = turb_vflux(nVertLevels)
+            else if ( les_surface_opt == LES_SURFACE_VARYING ) then
+               ust_edge = 0.5*(ustm(cell1) + ustm(cell2))
+               velocity_magnitude = max(sqrt(u(1,iEdge)**2 + v(1,iEdge)**2),0.1)
+               turb_vflux(1) = -rho_edge(1,iEdge)*ust_edge*ust_edge*(u(1,iEdge)/velocity_magnitude)
+               turb_vflux(nVertLevels+1) = turb_vflux(nVertLevels)
+               ! end test conditions
+            else
+               ! test conditions for supercell case
+               turb_vflux(1) = turb_vflux(2)
+               turb_vflux(nVertLevels+1) = turb_vflux(nVertLevels)
+               ! end test conditions
+            end if
+
+            !$acc loop vector
+            do k=1,nVertLevels
+               tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) - rdzw(k)*(turb_vflux(k+1)-turb_vflux(k))
+            end do
+
+         end do
+
+         !$acc end parallel
+
+      end if
+
+      !$acc exit data delete(turb_vflux)
+      !$acc exit data delete(u_mix)
+
+      DEBUG_WRITE(' exiting u_dissipation_3d ')
+
+   end subroutine u_dissipation_3d
+
+!------------------------
+
+   subroutine w_dissipation_3d( cellStart, cellEnd, cellSolveStart, cellSolveEnd,         &
+                                nCells, nEdges,                                           &
+                                nEdgesOnCell, edgesOnCell, cellsOnEdge, edgesOnCell_sign, &
+                                invAreaCell, invDcEdge, dvEdge,                           &
+                                meshScalingDel2, meshScalingDel4,                         &
+                                rdzw, rdzu,                                               &
+                                v_mom_eddy_visc2, h_mom_eddy_visc4,                       &
+                                delsq_w,                                                  &
+                                w, rho_edge, rho_zz, divergence, zz,                      &
+                                eddy_visc_horz, eddy_visc_vert,                           &
+                                les_model_opt, les_surface_opt,                           &
+                                tend_w_euler                                             )
+
+
+      ! 3D w dissipation using the 3D smagorinsky eddy viscosities.
+      ! This routine also includes the simpler mixing models, and the 4th-order horizontal filter
+
+      use mpas_atm_dimensions  !  pull nVertLevels and maxEdges from here
+
+      implicit none
+
+      integer, intent(in) :: cellStart, cellEnd
+      integer, intent(in) :: cellSolveStart, cellSolveEnd
+      integer, intent(in) :: nCells, nEdges
+
+      integer, dimension(nCells+1), intent(in) :: nEdgesOnCell
+      integer, dimension(maxEdges,nCells+1), intent(in) :: EdgesOnCell
+
+      integer, dimension(2,nEdges+1), intent(in) :: cellsOnEdge
+
+      integer, intent(in) :: les_model_opt
+      integer, intent(in) :: les_surface_opt
+
+      real (kind=RKIND), intent(in) :: h_mom_eddy_visc4
+      real (kind=RKIND), intent(in) :: v_mom_eddy_visc2
+
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: edgesOnCell_sign
+      real (kind=RKIND), dimension(nCells+1), intent(in) :: invAreaCell
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: dvEdge
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: invDcEdge
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: meshScalingDel2
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: meshScalingDel4
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: rdzw
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: rdzu
+
+      real (kind=RKIND), dimension(nVertLevels+1,nCells+1), intent(in) :: w
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: eddy_visc_horz
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: eddy_visc_vert
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: rho_zz
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: divergence
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: zz
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(in) :: rho_edge
+
+      real (kind=RKIND), dimension(nVertLevels+1,nCells+1), intent(inout) :: tend_w_euler
+
+      ! storage passed in from calling routine
+      real (kind=RKIND), dimension(nVertLevels,nCells+1) :: delsq_w
+      real (kind=RKIND), dimension(nVertLevels+1) :: turb_vflux
+
+      ! local variables
+
+      integer :: cell1, cell2, iEdge, iCell, i, k
+      real (kind=RKIND) :: r_areaCell, edge_sign, w_turb_flux
+
+
+!  !OMP BARRIER  why is this openmp barrier here???
+
+      ! del^4 horizontal filter.  We compute this as del^2 ( del^2 (w) ).
+      !
+      ! First, storage to hold the result from the first del^2 computation.
+      !  we copied code from the theta mixing, hence the theta* names.
+
+
+      DEBUG_WRITE(' begin w_dissipation_3d ')
+      DEBUG_WRITE(' 4th order hyperviscosity is $r ' COMMA realArgs=(/h_mom_eddy_visc4/))
+
+      !$acc enter data create(turb_vflux)
+
+      !$acc parallel default(present)
+
+      !$acc loop gang worker
+      do iCell=cellStart,cellEnd
+
+         !$acc loop vector
+         do k = 1, nVertLevels
+            delsq_w(k,iCell) = 0.0_RKIND
+         end do
+
+         !$acc loop vector
+         do k = 1, nVertLevels+1
+            tend_w_euler(k,iCell) = 0.0_RKIND
+         end do
+
+         r_areaCell = invAreaCell(iCell)
+
+         !$acc loop seq
+         do i=1,nEdgesOnCell(iCell)
+            iEdge = edgesOnCell(i,iCell)
+
+            edge_sign = 0.5 * r_areaCell*edgesOnCell_sign(i,iCell) * dvEdge(iEdge) * invDcEdge(iEdge)
+
+            cell1 = cellsOnEdge(1,iEdge)
+            cell2 = cellsOnEdge(2,iEdge)
+
+!DIR$ IVDEP
+            !$acc loop vector
+            do k=2,nVertLevels
+
+               w_turb_flux =  edge_sign*(rho_edge(k,iEdge)+rho_edge(k-1,iEdge))*(w(k,cell2) - w(k,cell1))
+               delsq_w(k,iCell) = delsq_w(k,iCell) + w_turb_flux
+               w_turb_flux = w_turb_flux * meshScalingDel2(iEdge) * 0.25 * &
+                               ( eddy_visc_horz(k  ,cell1)+eddy_visc_horz(k  ,cell2)      &
+                                +eddy_visc_horz(k-1,cell1)+eddy_visc_horz(k-1,cell2) )
+               tend_w_euler(k,iCell) = tend_w_euler(k,iCell) + w_turb_flux
+            end do
+         end do
+      end do
+
+      !$acc end parallel
+
+!$OMP BARRIER
+
+      if (h_mom_eddy_visc4 > 0.0) then  ! 4th order mixing is active
+
+         !$acc parallel default(present)
+
+         !$acc loop gang worker
+         do iCell=cellSolveStart,cellSolveEnd    ! Technically updating fewer cells than before...
+
+            r_areaCell = h_mom_eddy_visc4 * invAreaCell(iCell)
+
+            !$acc loop seq
+            do i=1,nEdgesOnCell(iCell)
+               iEdge = edgesOnCell(i,iCell)
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
+
+               edge_sign = meshScalingDel4(iEdge)*r_areaCell*dvEdge(iEdge)*edgesOnCell_sign(i,iCell) * invDcEdge(iEdge)
+
+               !$acc loop vector
+               do k=2,nVertLevels
+                  tend_w_euler(k,iCell) = tend_w_euler(k,iCell) - edge_sign * (delsq_w(k,cell2) - delsq_w(k,cell1))
+               end do
+
+            end do
+         end do
+
+         !$acc end parallel
+
+      end if ! 4th order mixing is active
+
+      if ( v_mom_eddy_visc2 > 0.0 ) then  ! vertical mixing
+
+         !$acc parallel default(present)
+
+         !$acc loop gang worker
+         do iCell=cellSolveStart,cellSolveEnd
+!DIR$ IVDEP
+            !$acc loop vector
+            do k=2,nVertLevels
+               tend_w_euler(k,iCell) = tend_w_euler(k,iCell) + v_mom_eddy_visc2*0.5*(rho_zz(k,iCell)+rho_zz(k-1,iCell))*(  &
+                                           (w(k+1,iCell)-w(k  ,iCell))*rdzw(k)                              &
+                                          -(w(k  ,iCell)-w(k-1,iCell))*rdzw(k-1) )*rdzu(k)
+            end do
+         end do
+
+         !$acc end parallel
+
+      end if
+
+      if ( les_model_opt /= LES_MODEL_NONE ) then
+
+         !$acc parallel default(present)
+
+         !$acc loop gang worker private(turb_vflux)
+         do iCell = cellSolveStart,cellSolveEnd ! vertical mixing for each column
+            ! compute turbulent fluxes
+     
+            !$acc loop vector
+            do k=1,nVertLevels
+               turb_vflux(k) = - rho_zz(k,iCell)*eddy_visc_vert(k,iCell)*zz(k,iCell)*(      &
+                                          2.0*zz(k,iCell)*rdzw(k)*(w(k+1,iCell)-w(k,iCell))  &
+                                              + divergence(k,iCell)                         )
+            end do
+
+            turb_vflux(nVertLevels+1) = 0.0
+
+            !$acc loop vector
+            do k=2,nVertLevels
+               tend_w_euler(k,iCell) = tend_w_euler(k,iCell) &
+                                           - rdzu(k)*(turb_vflux(k)-turb_vflux(k-1))
+            end do
+         end do
+
+         !$acc end parallel
+
+      end if
+
+      !$acc exit data delete(turb_vflux)
+
+      DEBUG_WRITE(' exiting w_dissipation_3d ')
+
+   end subroutine w_dissipation_3d
+
+!-----------------------------------------------------
+
+   subroutine scalar_dissipation_3d_les( cellStart, cellEnd, cellSolveStart, cellSolveEnd,         &
+                                         nCells, nEdges,                                           &
+                                         nEdgesOnCell, edgesOnCell, cellsOnEdge, edgesOnCell_sign, &
+                                         invAreaCell, invDcEdge, dvEdge,                           &
+                                         meshScalingDel2, meshScalingDel4,                         &
+                                         config_mix_full, t_init, zgrid,                           &
+                                         rdzw, rdzu, fzm, fzp,                                     &
+                                         v_theta_eddy_visc2, h_theta_eddy_visc4, prandtl_inv,      &
+                                         prandtl_3d_inv,                                           &
+                                         delsq_theta,                                              &
+                                         theta_m, rho_edge, rho_zz, zz,                            &
+                                         eddy_visc_horz, eddy_visc_vert,                           &
+                                         bv_freq2, config_len_disp, scalars, tend_scalars,         &
+                                         index_tke, index_qv, num_scalars_dummy, mix_scalars,      &
+                                         les_model_opt, les_surface_opt, clock, dt,                &
+                                         config_surface_heat_flux, config_surface_moisture_flux,   &
+                                         uReconstructZonal, uReconstructMeridional,                &
+                                         hfx, qfx,                                                 &
+                                         tend_theta_euler, dynamics_substep                       )
+
+
+      ! 3D theta_m dissipation using the 3D smagorinsky eddy viscosities.
+      ! This routine also includes the simpler mixing models, and the 4th-order horizontal filter
+
+      use mpas_atm_dimensions  !  pull nVertLevels and maxEdges from here
+
+      implicit none
+
+      integer, intent(in) :: cellStart, cellEnd
+      integer, intent(in) :: cellSolveStart, cellSolveEnd
+      integer, intent(in) :: nCells, nEdges
+      integer, intent(in) :: num_scalars_dummy
+      integer, intent(in) :: index_tke, index_qv
+      integer, intent(in) :: dynamics_substep
+
+      real (kind=RKIND), intent(in) :: config_surface_heat_flux
+      real (kind=RKIND), intent(in) :: config_surface_moisture_flux
+
+      logical, intent(in) :: config_mix_full, mix_scalars
+
+      integer, intent(in) :: les_model_opt
+      integer, intent(in) :: les_surface_opt
+
+      type (MPAS_Clock_type), intent(in) :: clock
+      real (kind=RKIND), intent(in) :: dt
+
+      integer, dimension(nCells+1), intent(in) :: nEdgesOnCell
+      integer, dimension(maxEdges,nCells+1), intent(in) :: EdgesOnCell
+
+      integer, dimension(2,nEdges+1), intent(in) :: cellsOnEdge
+
+      real (kind=RKIND), intent(in) :: h_theta_eddy_visc4
+      real (kind=RKIND), intent(in) :: v_theta_eddy_visc2
+      real (kind=RKIND), intent(in) :: prandtl_inv
+      real (kind=RKIND), intent(in) :: config_len_disp
+
+      real (kind=RKIND), dimension(maxEdges,nCells+1), intent(in) :: edgesOnCell_sign
+      real (kind=RKIND), dimension(nCells+1), intent(in) :: invAreaCell
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: dvEdge
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: invDcEdge
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: meshScalingDel2
+      real (kind=RKIND), dimension(nEdges+1), intent(in) :: meshScalingDel4
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: rdzw
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: rdzu
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: fzm
+      real (kind=RKIND), dimension(nVertLevels), intent(in) :: fzp
+      real (kind=RKIND), dimension(nVertLevels+1, nCells+1), intent(in) :: zgrid
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: zz
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: uReconstructZonal, uReconstructMeridional
+
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: t_init
+
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: bv_freq2
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: prandtl_3d_inv
+      real (kind=RKIND), dimension(num_scalars,nVertLevels,nCells+1), intent(in) :: scalars
+      real (kind=RKIND), dimension(num_scalars,nVertLevels,nCells+1), intent(inout) :: tend_scalars
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: theta_m
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: eddy_visc_horz
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: eddy_visc_vert
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(in) :: rho_zz
+      real (kind=RKIND), dimension(nVertLevels,nEdges+1), intent(in) :: rho_edge
+      real (kind=RKIND), dimension(:), intent(in) :: hfx, qfx
+
+      real (kind=RKIND), dimension(nVertLevels,nCells+1), intent(inout) :: tend_theta_euler
+
+      ! storage passed in from calling routine
+      real (kind=RKIND), dimension(nVertLevels,nCells+1) :: delsq_theta
+
+      ! local variables
+      integer :: cell1, cell2, iEdge, iCell, i, k, iScalar
+      real (kind=RKIND) :: r_areaCell, edge_sign, theta_turb_flux, pr_scale
+      real (kind=RKIND) :: z1, z2, z3, z4, zm, z0, zp
+      real (kind=RKIND), dimension(nVertLevels+1) :: turb_vflux, prandtl_1d_inverse
+      real (kind=RKIND), dimension(num_scalars,nVertLevels+1) :: turb_vflux_scalars
+      real (kind=RKIND), dimension(nVertLevels) :: rho_k_at_w, zz_at_w
+
+      real (kind=RKIND) :: moisture_flux, heat_flux, theta_m_flux
+      real (kind=RKIND) :: qv_cell, theta_m_cell, theta_cell
+
+
+      DEBUG_WRITE(' begin scalar_dissipation_3d ')
+      DEBUG_WRITE(' 4th order hyperviscosity is $r ' COMMA realArgs=(/h_theta_eddy_visc4/))
+
+      if( mix_scalars .and. (dynamics_substep == 1)) call mpas_log_write(' scalar mixing on ')
+
+      !$acc enter data create(turb_vflux_scalars)
+      !$acc enter data create(turb_vflux, prandtl_1d_inverse)
+
+      !$acc parallel default(present)
+
+      !$acc loop gang worker
+      do iCell=cellStart,cellEnd
+
+         !$acc loop vector
+         do k = 1, nVertLevels
+            delsq_theta(k,iCell) = 0.0_RKIND
+            tend_theta_euler(k,iCell) = 0.0_RKIND
+         end do
+
+         r_areaCell = invAreaCell(iCell)
+
+         !$acc loop seq
+         do i=1,nEdgesOnCell(iCell)
+            iEdge = edgesOnCell(i,iCell)
+            edge_sign = r_areaCell*edgesOnCell_sign(i,iCell) * dvEdge(iEdge) * invDcEdge(iEdge)
+            pr_scale = prandtl_inv * meshScalingDel2(iEdge)
+            cell1 = cellsOnEdge(1,iEdge)
+            cell2 = cellsOnEdge(2,iEdge)
+
+!DIR$ IVDEP
+            !$acc loop vector
+            do k=1,nVertLevels
+
+!  we are computing the Smagorinsky filter at more points than needed here so as to pick up the delsq_theta for 4th order filter below.
+!  This is in conservative form.
+
+               theta_turb_flux = edge_sign*(theta_m(k,cell2) - theta_m(k,cell1))*rho_edge(k,iEdge)
+               delsq_theta(k,iCell) = delsq_theta(k,iCell) + theta_turb_flux
+               theta_turb_flux = theta_turb_flux*0.5*(eddy_visc_horz(k,cell1)+eddy_visc_horz(k,cell2)) * pr_scale
+               tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + theta_turb_flux
+
+            end do
+         end do
+      end do
+
+      !$acc end parallel
+
+!$OMP BARRIER
+
+      if (h_theta_eddy_visc4 > 0.0) then  ! 4th order mixing is active
+
+         !$acc parallel default(present)
+
+         !$acc loop gang worker
+         do iCell=cellSolveStart,cellSolveEnd    ! Technically updating fewer cells than before...
+            r_areaCell = h_theta_eddy_visc4 * prandtl_inv * invAreaCell(iCell)
+
+            !$acc loop seq
+            do i=1,nEdgesOnCell(iCell)
+
+               iEdge = edgesOnCell(i,iCell)
+               edge_sign = meshScalingDel4(iEdge)*r_areaCell*dvEdge(iEdge)*edgesOnCell_sign(i,iCell)*invDcEdge(iEdge)
+
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
+
+               !$acc loop vector
+               do k=1,nVertLevels
+                  tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) - edge_sign*(delsq_theta(k,cell2) - delsq_theta(k,cell1))
+               end do
+            end do
+         end do
+
+         !$acc end parallel
+
+      end if ! 4th order mixing is active
+
+      if(mix_scalars .and. (dynamics_substep == 1)) then  ! dissipation for scalars, including 4th-order filter.  Likely needs optimization
+
+         do iScalar=1,num_scalars
+
+            !$acc parallel default(present)
+
+            !$acc loop gang worker
+            do iCell=cellStart,cellEnd
+               !$acc loop vector
+               do k = 1, nVertLevels
+                  delsq_theta(k,iCell) = 0.0_RKIND
+               end do
+
+               ! tend_theta_euler(1:nVertLevels,iCell) = 0.0
+               r_areaCell = invAreaCell(iCell)
+
+               !$acc loop seq
+               do i=1,nEdgesOnCell(iCell)
+                  iEdge = edgesOnCell(i,iCell)
+                  edge_sign = r_areaCell*edgesOnCell_sign(i,iCell) * dvEdge(iEdge) * invDcEdge(iEdge)
+                  pr_scale = prandtl_inv * meshScalingDel2(iEdge)
+                  cell1 = cellsOnEdge(1,iEdge)
+                  cell2 = cellsOnEdge(2,iEdge)
+
+!DIR$ IVDEP
+                  !$acc loop vector
+                  do k=1,nVertLevels
+
+!  we are computing the Smagorinsky filter at more points than needed here so as to pick up the delsq_theta for 4th order filter below.
+!  This is in conservative form.
+
+                     theta_turb_flux = edge_sign*(scalars(iScalar,k,cell2) - scalars(iScalar,k,cell1))*rho_edge(k,iEdge)
+                     delsq_theta(k,iCell) = delsq_theta(k,iCell) + theta_turb_flux
+                     theta_turb_flux = theta_turb_flux*0.5*(eddy_visc_horz(k,cell1)+eddy_visc_horz(k,cell2)) * pr_scale
+                     tend_scalars(iScalar,k,iCell) = tend_scalars(iScalar,k,iCell) + theta_turb_flux
+
+                  end do
+               end do
+            end do
+
+            !$acc end parallel
+
+!$OMP BARRIER
+
+            if (h_theta_eddy_visc4 > 0.0) then  ! 4th order mixing is active
+
+               !$acc parallel default(present)
+
+               !$acc loop gang worker
+               do iCell=cellSolveStart,cellSolveEnd    ! Technically updating fewer cells than before...
+
+                  r_areaCell = h_theta_eddy_visc4 * prandtl_inv * invAreaCell(iCell)
+
+                  !$acc loop seq
+                  do i=1,nEdgesOnCell(iCell)
+
+                     iEdge = edgesOnCell(i,iCell)
+                     edge_sign = meshScalingDel4(iEdge)*r_areaCell*dvEdge(iEdge)*edgesOnCell_sign(i,iCell)*invDcEdge(iEdge)
+
+                     cell1 = cellsOnEdge(1,iEdge)
+                     cell2 = cellsOnEdge(2,iEdge)
+
+                     !$acc loop vector
+                     do k=1,nVertLevels
+                        tend_scalars(iScalar,k,iCell) = tend_scalars(iScalar,k,iCell) - edge_sign*(delsq_theta(k,cell2) - delsq_theta(k,cell1))
+                     end do
+                  end do
+               end do
+
+               !$acc end parallel
+
+            end if ! 4th order mixing is active
+
+         end do ! loop over scalars for horizontal mixing
+
+      end if ! horizontal scalar mixing
+
+
+      !  idealized case vertical mixing.  No scalar mixing here.
+
+      if ( v_theta_eddy_visc2 > 0.0 ) then  ! vertical mixing for theta_m
+
+         if (config_mix_full) then
+
+            !$acc parallel default(present)
+
+            !$acc loop gang worker
+            do iCell = cellSolveStart,cellSolveEnd
+
+               !$acc loop vector
+               do k=2,nVertLevels-1
+                  z1 = zgrid(k-1,iCell)
+                  z2 = zgrid(k  ,iCell)
+                  z3 = zgrid(k+1,iCell)
+                  z4 = zgrid(k+2,iCell)
+
+                  zm = 0.5*(z1+z2)
+                  z0 = 0.5*(z2+z3)
+                  zp = 0.5*(z3+z4)
+
+                  tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&
+                                           (theta_m(k+1,iCell)-theta_m(k  ,iCell))/(zp-z0)                 &
+                                          -(theta_m(k  ,iCell)-theta_m(k-1,iCell))/(z0-zm) )/(0.5*(zp-zm))
+               end do
+            end do
+
+            !$acc end parallel
+
+         else  ! idealized cases where we mix on the perturbation from the initial 1-D state
+
+            !$acc parallel default(present)
+
+            !$acc loop gang worker
+            do iCell = cellSolveStart,cellSolveEnd
+
+               !$acc loop vector
+               do k=2,nVertLevels-1
+                  z1 = zgrid(k-1,iCell)
+                  z2 = zgrid(k  ,iCell)
+                  z3 = zgrid(k+1,iCell)
+                  z4 = zgrid(k+2,iCell)
+
+                  zm = 0.5*(z1+z2)
+                  z0 = 0.5*(z2+z3)
+                  zp = 0.5*(z3+z4)
+
+                  tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&
+                                           ((theta_m(k+1,iCell)-t_init(k+1,iCell))-(theta_m(k  ,iCell)-t_init(k,iCell)))/(zp-z0)      &
+                                          -((theta_m(k  ,iCell)-t_init(k,iCell))-(theta_m(k-1,iCell)-t_init(k-1,iCell)))/(z0-zm) )/(0.5*(zp-zm))
+               end do
+            end do
+
+            !$acc end parallel
+
+         end if
+
+      end if
+
+      if ( les_model_opt /= LES_MODEL_NONE ) then
+
+         !$acc parallel default(present)
+
+         !$acc loop gang worker private(turb_vflux, turb_vflux_scalars, prandtl_1d_inverse, rho_k_at_w, zz_at_w)
+         do iCell = cellSolveStart,cellSolveEnd ! vertical mixing for each column
+            ! compute turbulent fluxes
+
+            turb_vflux(nVertlevels+1) = 0.  !  no turbulent flux out of the domain
+            turb_vflux(1) = 0.  !  lower bc flux handled where ???
+
+            if ( les_model_opt == LES_MODEL_3D_SMAGORINSKY ) then
+               !$acc loop vector
+               do k=2,nVertLevels
+                  prandtl_1d_inverse(k) = prandtl_inv
+               end do
+            else   ! prognostic_1.5_order, isentropic mixing length
+               ! do k=2,nVertLevels
+               !   delta_z = 0.5*(zgrid(k+1,iCell)-zgrid(k-1,iCell))
+               !   delta_s = ((config_len_disp**2)*delta_z)**(1./3.)
+               !   bv_frequency2 = 0.5*(bv_freq2(k,iCell)+bv_freq2(k-1,iCell))
+               !   tke_length = delta_s
+               !   if(bv_frequency2 .gt. 1.e-06) &
+               !       tke_length = 0.76*sqrt(scalars(index_tke,k,iCell))/sqrt(bv_frequency2)
+               !       tke_length = min(delta_z,tke_length)
+               !   prandtl_inverse(k) = 1. + 2.*tke_length/delta_z
+               ! end do
+
+               !$acc loop vector
+               do k=2,nVertLevels
+                  ! prandtl_1d_inverse(k) = 0.5*(prandtl_3d_inv(k,iCell)+prandtl_3d_inv(k-1,iCell))
+                  prandtl_1d_inverse(k) = fzm(k)*prandtl_3d_inv(k,iCell)+fzp(k)*prandtl_3d_inv(k-1,iCell)
+               end do
+
+            end if
+
+            !$acc loop vector
+            do k=2,nVertLevels
+
+               ! delta_z = 0.5*(zgrid(k+1,iCell)-zgrid(k-1,iCell))
+               ! delta_s = ((config_len_disp**2)*delta_z)**(1./3.)
+               ! bv_frequency2 = 0.5*(bv_freq2(k)+bv_freq(k-1))
+               ! bv = max( sqrt(abs(bv_frequency2)), epsilon_bv )
+               rho_k_at_w(k) =  fzm(k)*rho_zz(k  ,iCell)*zz(k  ,iCell)*zz(k  ,iCell)*eddy_visc_vert(k  ,iCell) &
+                             +fzp(k)*rho_zz(k-1,iCell)*zz(k-1,iCell)*zz(k-1,iCell)*eddy_visc_vert(k-1,iCell)
+               zz_at_w(k) = fzm(k)*zz(k,iCell)+fzp(k)*zz(k-1,iCell)
+               turb_vflux(k) = - prandtl_1d_inverse(k)*rho_k_at_w(k)*zz_at_w(k)*rdzu(k)*(theta_m(k,iCell)-theta_m(k-1,iCell))
+            end do
+
+            ! test boundary conditions for supercell and les test cases
+
+            if( les_surface_opt == LES_SURFACE_SPECIFIED .or. les_surface_opt == LES_SURFACE_VARYING ) then
+
+               if( les_surface_opt == LES_SURFACE_SPECIFIED ) then
+                  moisture_flux = config_surface_moisture_flux
+                  heat_flux = config_surface_heat_flux
+
+!                 place holder routine for time-varying specified
+!                 call flux_les_sas( heat_flux, moisture_flux, clock, dt )
+
+               else if ( les_surface_opt == LES_SURFACE_VARYING ) then
+                  heat_flux = hfx(iCell)/rho_zz(1,iCell)/cp
+                  moisture_flux = qfx(iCell)/rho_zz(1,iCell)
+               endif
+
+               qv_cell = scalars(index_qv,1,iCell)
+               theta_m_cell = theta_m(1,iCell)
+               theta_cell = theta_m_cell/(1.0+(rv/rgas)*qv_cell)
+
+               theta_m_flux = heat_flux*(1.0+(rv/rgas)*qv_cell)+(rv/rgas)*theta_cell*moisture_flux
+               turb_vflux(1) = theta_m_flux*rho_zz(1,iCell)
+               moisture_flux = moisture_flux*rho_zz(1,iCell)
+               turb_vflux(nVertLevels+1) = turb_vflux(nVertLevels)
+
+            else
+
+               turb_vflux(1) = turb_vflux(2)
+               turb_vflux(nVertLevels+1) = turb_vflux(nVertLevels)
+
+            end if
+
+            !$acc loop vector
+            do k=1,nVertLevels
+               tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) &
+                                           - rdzw(k)*(turb_vflux(k+1)-turb_vflux(k))
+            end do
+
+            if (mix_scalars ) then
+
+               ! compute turbulent fluxes
+               !$acc loop vector
+               do iScalar=1,num_scalars
+                  turb_vflux_scalars(iScalar,nVertlevels+1) = 0.0_RKIND  !  no turbulent flux out of the domain
+                  turb_vflux_scalars(iScalar,1) = 0.0_RKIND  !  lower bc flux handled where ???
+               end do
+
+               !$acc loop vector
+               do k=2,nVertLevels
+                  rho_k_at_w(k) = fzm(k)*rho_zz(k  ,iCell)*zz(k  ,iCell)*zz(k  ,iCell)*eddy_visc_vert(k  ,iCell) &
+                                + fzp(k)*rho_zz(k-1,iCell)*zz(k-1,iCell)*zz(k-1,iCell)*eddy_visc_vert(k-1,iCell)
+                  zz_at_w(k) = fzm(k)*zz(k,iCell)+fzp(k)*zz(k-1,iCell)
+               end do
+
+               !$acc loop vector collapse(2)
+               do k=2,nVertLevels
+                  do iScalar=1,num_scalars
+                     turb_vflux_scalars(iScalar,k) = - prandtl_1d_inverse(k)*rho_k_at_w(k)*zz_at_w(k)*rdzu(k)*  &
+                                                      (scalars(iScalar,k,iCell)-scalars(iScalar,k-1,iCell))
+                  end do
+               end do
+
+               if( les_surface_opt == LES_SURFACE_SPECIFIED .or. les_surface_opt == LES_SURFACE_VARYING ) turb_vflux_scalars(index_qv,1) = moisture_flux ! lower b.c. for qv
+
+               !$acc loop vector collapse(2)
+               do k=1,nVertLevels
+                  do iScalar=1,num_scalars
+                     tend_scalars(iScalar,k,iCell) = tend_scalars(iScalar,k,iCell) &
+                          - rdzw(k)*(turb_vflux_scalars(iScalar,k+1)-turb_vflux_scalars(iScalar,k))
+                  end do
+               end do
+
+            end if ! mix scalars
+
+         end do  ! loop over cells (columns)
+
+         !$acc end parallel
+
+      end if
+
+      !$acc exit data delete(turb_vflux_scalars)
+      !$acc exit data delete(turb_vflux, prandtl_1d_inverse)
+
+      DEBUG_WRITE(' exiting scalar_dissipation_3d ')
+
+   end subroutine scalar_dissipation_3d_les
+
+!-----------
+
+!  subroutine flux_les_sas(heat_flux, moisture_flux, clock, dt)
+
+!     implicit none
+
+!     real (kind=RKIND), intent(out) :: heat_flux, moisture_flux
+!     type (MPAS_Clock_type), intent(in) :: clock
+!     real (kind=RKIND), intent(in) :: dt
+
+!     real (kind=RKIND), parameter:: t_start_t_flux = 3600.*6.0
+!     real (kind=RKIND), parameter:: t_end_t_flux = 3600.*19.50
+!     real (kind=RKIND), parameter:: t_start_q_flux = 3600.*7.0
+!     real (kind=RKIND), parameter:: t_end_q_flux = 3600.*19.50
+!     real (kind=RKIND) :: rel_time_t_flux, rel_time_q_flux
+!     real (kind=RKIND) :: time_of_day_seconds
+!     type (MPAS_Time_type) :: currTime
+!     integer :: H, M, S, S_n, S_d
+!     integer :: ierr
+
+!     currTime = mpas_get_clock_time(clock, MPAS_NOW, ierr)
+!     call mpas_get_time(curr_time=currTime, H=H, M=M, S=S, S_n=S_n, S_d=S_d)
+!     time_of_day_seconds = real(H)*3600. + real(M)*60. + real(S) + real(S_n)/real(S_d) + 0.5*dt
+!     call mpas_log_write(' les integration, timestep midpoint time of day in seconds, $r ', realArgs=(/time_of_day_seconds/))
+
+!     rel_time_t_flux = max(0.,(time_of_day_seconds - t_start_t_flux)/(t_end_t_flux - t_start_t_flux))
+!     rel_time_q_flux = max(0.,(time_of_day_seconds - t_start_q_flux)/(t_end_q_flux - t_start_q_flux))
+
+!     heat_flux = max(0., 0.1*sin(pii*rel_time_t_flux))
+!     moisture_flux = max(0., 0.15*sin(pii*rel_time_q_flux))/1000.
+
+!  end subroutine flux_les_sas
+
+end module mpas_atm_dissipation_models
diff --git a/src/core_atmosphere/dynamics/mpas_atm_iau.F b/src/core_atmosphere/dynamics/mpas_atm_iau.F
index 654fd3ae82..7459de89b4 100644
--- a/src/core_atmosphere/dynamics/mpas_atm_iau.F
+++ b/src/core_atmosphere/dynamics/mpas_atm_iau.F
@@ -5,6 +5,15 @@
 ! Additional copyright and license information can be found in the LICENSE file
 ! distributed with this code, or at http://mpas-dev.github.com/license.html
 !
+
+#ifdef MPAS_OPENACC
+#define MPAS_ACC_TIMER_START(X) call mpas_timer_start(X)
+#define MPAS_ACC_TIMER_STOP(X) call mpas_timer_stop(X)
+#else
+#define MPAS_ACC_TIMER_START(X)
+#define MPAS_ACC_TIMER_STOP(X)
+#endif
+
 module mpas_atm_iau
 
  use mpas_derived_types
@@ -13,9 +22,10 @@ module mpas_atm_iau
  use mpas_dmpar
  use mpas_constants   
  use mpas_log, only : mpas_log_write
+ use mpas_timer
+
+ !public :: atm_compute_iau_coef, atm_add_tend_anal_incr    
 
- !public :: atm_compute_iau_coef, atm_add_tend_anal_incr  
-      
  contains
 
 !==================================================================================================
@@ -76,6 +86,39 @@ real (kind=RKIND) function atm_iau_coef(configs, itimestep, dt) result(wgt_iau)
       end if  
 
  end function atm_iau_coef
+
+!==================================================================================================
+ subroutine update_d2h_pre_add_tend_anal_incr(configs,structs)
+!==================================================================================================
+  
+      implicit none
+
+      type (mpas_pool_type), intent(in) :: configs
+      type (mpas_pool_type), intent(inout) :: structs
+
+      type (mpas_pool_type), pointer :: tend
+      type (mpas_pool_type), pointer :: state
+      type (mpas_pool_type), pointer :: diag
+          
+      real (kind=RKIND), dimension(:,:), pointer :: rho_edge, rho_zz, theta_m
+      real(kind=RKIND),dimension(:,:,:), pointer :: scalars, tend_scalars
+
+      call mpas_pool_get_subpool(structs, 'tend', tend)
+      call mpas_pool_get_subpool(structs, 'state', state)
+      call mpas_pool_get_subpool(structs, 'diag', diag)
+
+      MPAS_ACC_TIMER_START('atm_srk3: physics ACC_data_xfer')
+      call mpas_pool_get_array(state, 'theta_m', theta_m, 1)
+      call mpas_pool_get_array(state, 'scalars', scalars, 1)
+      call mpas_pool_get_array(state, 'rho_zz',   rho_zz, 2) 
+      call mpas_pool_get_array(diag , 'rho_edge', rho_edge)
+      !$acc update self(theta_m, scalars, rho_zz, rho_edge)
+
+      call mpas_pool_get_array(tend, 'scalars_tend', tend_scalars)
+      !$acc update self(tend_scalars)
+      MPAS_ACC_TIMER_STOP('atm_srk3: physics ACC_data_xfer')
+
+ end subroutine update_d2h_pre_add_tend_anal_incr
       
 !==================================================================================================
  subroutine atm_add_tend_anal_incr (configs, structs, itimestep, dt, tend_ru, tend_rtheta, tend_rho)
diff --git a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F
index 4fe2faefc4..786f3a3416 100644
--- a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F
+++ b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F
@@ -16,7 +16,6 @@
 
 module atm_time_integration
 
-   use mpas_derived_types
    use mpas_pool_routines
    use mpas_kind_types
    use mpas_constants
@@ -29,6 +28,7 @@ module atm_time_integration
 
 #ifdef DO_PHYSICS
    use mpas_atmphys_driver_microphysics
+   use mpas_atmphys_interface, only: update_d2h_pre_microphysics, update_h2d_post_microphysics
    use mpas_atmphys_todynamics
    use mpas_atmphys_utilities
 #endif
@@ -36,18 +36,20 @@ module atm_time_integration
    use mpas_atm_boundaries, only : nSpecZone, nRelaxZone, nBdyZone, mpas_atm_get_bdy_state, mpas_atm_get_bdy_tend  ! regional_MPAS addition
    
    use mpas_atm_iau  
+   use mpas_atm_dissipation_models
 
    !
    ! Abstract interface for routine used to communicate halos of fields
    ! in a named group
    !
    abstract interface
-      subroutine halo_exchange_routine(domain, halo_group, ierr)
+      subroutine halo_exchange_routine(domain, halo_group, withGPUAwareMPI, ierr)
 
          use mpas_derived_types, only : domain_type
 
          type (domain_type), intent(inout) :: domain
          character(len=*), intent(in) :: halo_group
+         logical, intent(in), optional :: withGPUAwareMPI
          integer, intent(out), optional :: ierr
 
       end subroutine halo_exchange_routine
@@ -85,6 +87,7 @@ end subroutine halo_exchange_routine
    !$acc declare create(s_max_arr, s_min_arr)
    !$acc declare create(flux_array, flux_upwind_tmp_arr)
    !$acc declare create(flux_tmp_arr, wdtn_arr)
+   !$acc declare create(rho_zz_int)
 
    real (kind=RKIND), dimension(:,:), allocatable :: ru_driving_tend ! regional_MPAS addition 
    real (kind=RKIND), dimension(:,:), allocatable :: rt_driving_tend ! regional_MPAS addition 
@@ -272,6 +275,13 @@ subroutine mpas_atm_dynamics_init(domain)
       real (kind=RKIND), dimension(:), pointer :: angleEdge
       real (kind=RKIND), dimension(:), pointer :: meshScalingDel2
       real (kind=RKIND), dimension(:), pointer :: meshScalingDel4
+      real (kind=RKIND), dimension(:), pointer :: u_init, v_init, qv_init
+      real (kind=RKIND), dimension(:,:), pointer :: t_init
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_c2
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_s2
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_cs
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_c
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_s
 #endif
 
 #ifdef MPAS_CAM_DYCORE
@@ -292,6 +302,7 @@ subroutine mpas_atm_dynamics_init(domain)
       nullify(mesh)
       call mpas_pool_get_subpool(domain % blocklist % structs, 'mesh', mesh)
 
+      MPAS_ACC_TIMER_START('mpas_dynamics_init [ACC_data_xfer]')
       call mpas_pool_get_array(mesh, 'dvEdge', dvEdge)
       !$acc enter data copyin(dvEdge)
 
@@ -456,9 +467,971 @@ subroutine mpas_atm_dynamics_init(domain)
 
       call mpas_pool_get_array(mesh, 'meshScalingDel4', meshScalingDel4)
       !$acc enter data copyin(meshScalingDel4)
+      
+      call mpas_pool_get_array(mesh, 'u_init', u_init)
+      !$acc enter data copyin(u_init)
+      call mpas_pool_get_array(mesh, 'v_init', v_init)
+      !$acc enter data copyin(v_init)
+      call mpas_pool_get_array(mesh, 't_init', t_init)
+      !$acc enter data copyin(t_init)
+      call mpas_pool_get_array(mesh, 'qv_init', qv_init)
+      !$acc enter data copyin(qv_init)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_c2', deformation_coef_c2)
+      !$acc enter data copyin(deformation_coef_c2)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_s2', deformation_coef_s2)
+      !$acc enter data copyin(deformation_coef_s2)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_cs', deformation_coef_cs)
+      !$acc enter data copyin(deformation_coef_cs)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_c', deformation_coef_c)
+      !$acc enter data copyin(deformation_coef_c)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_s', deformation_coef_s)
+      !$acc enter data copyin(deformation_coef_s)
+
+      MPAS_ACC_TIMER_STOP('mpas_dynamics_init [ACC_data_xfer]')
+#endif
+
+   end subroutine mpas_atm_dynamics_init
+
+   subroutine mpas_atm_pre_computesolvediag_h2d(block)
+
+      implicit none
+
+      type (block_type), intent(inout) :: block
+
+
+#ifdef MPAS_OPENACC
+      type (mpas_pool_type), pointer :: mesh
+      type (mpas_pool_type), pointer :: diag
+      type (mpas_pool_type), pointer :: state
+      type (mpas_pool_type), pointer :: tend_physics
+      real (kind=RKIND), dimension(:,:), pointer :: rthdynten
+
+      real (kind=RKIND), dimension(:,:), pointer :: h_edge, v, vorticity, ke, pv_edge, &
+                                                   pv_vertex, pv_cell, gradPVn, gradPVt, divergence
+      real (kind=RKIND), dimension(:,:), pointer :: u, h
+
+      real (kind=RKIND), dimension(:,:), pointer :: zz
+      real (kind=RKIND), dimension(:,:,:), pointer :: zb_cell
+      real (kind=RKIND), dimension(:,:,:), pointer :: zb3_cell
+      real (kind=RKIND), dimension(:), pointer :: fzm
+      real (kind=RKIND), dimension(:), pointer :: fzp
+      real (kind=RKIND), dimension(:,:,:), pointer :: zb
+      real (kind=RKIND), dimension(:,:,:), pointer :: zb3
+
+
+      real (kind=RKIND), dimension(:), pointer :: dvEdge
+      integer, dimension(:,:), pointer :: cellsOnCell
+      integer, dimension(:,:), pointer :: cellsOnEdge
+      integer, dimension(:,:), pointer :: advCellsForEdge
+      integer, dimension(:,:), pointer :: edgesOnCell
+      integer, dimension(:), pointer :: nAdvCellsForEdge
+      integer, dimension(:), pointer :: nEdgesOnCell
+      real (kind=RKIND), dimension(:,:), pointer :: adv_coefs
+      real (kind=RKIND), dimension(:,:), pointer :: adv_coefs_3rd
+      real (kind=RKIND), dimension(:,:), pointer :: edgesOnCell_sign
+      real (kind=RKIND), dimension(:), pointer :: invAreaCell
+      integer, dimension(:), pointer :: bdyMaskCell
+      integer, dimension(:), pointer :: bdyMaskEdge
+      real (kind=RKIND), dimension(:), pointer :: specZoneMaskEdge
+      real (kind=RKIND), dimension(:), pointer :: invDvEdge
+      real (kind=RKIND), dimension(:), pointer :: dcEdge
+      real (kind=RKIND), dimension(:), pointer :: invDcEdge
+      integer, dimension(:,:), pointer :: edgesOnEdge
+      integer, dimension(:,:), pointer :: edgesOnVertex
+      real (kind=RKIND), dimension(:,:), pointer :: edgesOnVertex_sign
+      integer, dimension(:), pointer :: nEdgesOnEdge
+      real (kind=RKIND), dimension(:,:), pointer :: weightsOnEdge
+      integer, dimension(:,:), pointer :: cellsOnVertex
+      integer, dimension(:,:), pointer :: verticesOnCell
+      integer, dimension(:,:), pointer :: verticesOnEdge
+      real (kind=RKIND), dimension(:), pointer :: invAreaTriangle
+      integer, dimension(:,:), pointer :: kiteForCell
+      real (kind=RKIND), dimension(:,:), pointer :: kiteAreasOnVertex
+      real (kind=RKIND), dimension(:), pointer :: fEdge
+      real (kind=RKIND), dimension(:), pointer :: fVertex
+
+      nullify(mesh)
+      call mpas_pool_get_subpool(block % structs, 'mesh', mesh)
+      nullify(state)
+      call mpas_pool_get_subpool(block % structs, 'state', state)
+      nullify(diag)
+      call mpas_pool_get_subpool(block % structs, 'diag', diag)
+
+      MPAS_ACC_TIMER_START('first_compute_solve_diagnostics [ACC_data_xfer]')
+      call mpas_pool_get_array(state, 'rho_zz', h, 1)
+      !$acc enter data create(h)
+      call mpas_pool_get_array(state, 'u', u, 1)
+      !$acc enter data copyin(u)
+
+      call mpas_pool_get_array(diag, 'v', v)
+      !$acc enter data copyin(v)
+      call mpas_pool_get_array(diag, 'rho_edge', h_edge)
+      !$acc enter data copyin(h_edge)
+      call mpas_pool_get_array(diag, 'vorticity', vorticity)
+      !$acc enter data copyin(vorticity)
+      call mpas_pool_get_array(diag, 'divergence', divergence)
+      !$acc enter data copyin(divergence)
+      call mpas_pool_get_array(diag, 'ke', ke)
+      !$acc enter data copyin(ke)
+      call mpas_pool_get_array(diag, 'pv_edge', pv_edge)
+      !$acc enter data copyin(pv_edge)
+      call mpas_pool_get_array(diag, 'pv_vertex', pv_vertex)
+      !$acc enter data copyin(pv_vertex)
+      call mpas_pool_get_array(diag, 'pv_cell', pv_cell)
+      !$acc enter data copyin(pv_cell)
+      call mpas_pool_get_array(diag, 'gradPVn', gradPVn)
+      !$acc enter data copyin(gradPVn)
+      call mpas_pool_get_array(diag, 'gradPVt', gradPVt)
+      !$acc enter data copyin(gradPVt)
+
+      ! Required by atm_init_coupled_diagnostics
+      call mpas_pool_get_array(mesh, 'zz', zz)
+      !$acc enter data copyin(zz)
+
+      call mpas_pool_get_array(mesh, 'zb_cell', zb_cell)
+      !$acc enter data copyin(zb_cell)
+
+      call mpas_pool_get_array(mesh, 'zb3_cell', zb3_cell)
+      !$acc enter data copyin(zb3_cell)
+
+      call mpas_pool_get_array(mesh, 'fzm', fzm)
+      !$acc enter data copyin(fzm)
+
+      call mpas_pool_get_array(mesh, 'fzp', fzp)
+      !$acc enter data copyin(fzp)
+
+      call mpas_pool_get_array(mesh, 'zb', zb)
+      !$acc enter data copyin(zb)
+
+      call mpas_pool_get_array(mesh, 'zb3', zb3)
+      !$acc enter data copyin(zb3)
+
+      ! Required by atm_compute_solve_diagnostics
+      call mpas_pool_get_array(mesh, 'dvEdge', dvEdge)
+      !$acc enter data copyin(dvEdge)
+
+      call mpas_pool_get_array(mesh, 'cellsOnEdge', cellsOnEdge)
+      !$acc enter data copyin(cellsOnEdge)
+
+      call mpas_pool_get_array(mesh, 'edgesOnCell', edgesOnCell)
+      !$acc enter data copyin(edgesOnCell)
+
+      call mpas_pool_get_array(mesh, 'nEdgesOnCell', nEdgesOnCell)
+      !$acc enter data copyin(nEdgesOnCell)
+
+      call mpas_pool_get_array(mesh, 'edgesOnCell_sign', edgesOnCell_sign)
+      !$acc enter data copyin(edgesOnCell_sign)
+
+      call mpas_pool_get_array(mesh, 'invAreaCell', invAreaCell)
+      !$acc enter data copyin(invAreaCell)
+
+      call mpas_pool_get_array(mesh, 'invDvEdge', invDvEdge)
+      !$acc enter data copyin(invDvEdge)
+
+      call mpas_pool_get_array(mesh, 'dcEdge', dcEdge)
+      !$acc enter data copyin(dcEdge)
+
+      call mpas_pool_get_array(mesh, 'invDcEdge', invDcEdge)
+      !$acc enter data copyin(invDcEdge)
+
+      call mpas_pool_get_array(mesh, 'edgesOnEdge', edgesOnEdge)
+      !$acc enter data copyin(edgesOnEdge)
+
+      call mpas_pool_get_array(mesh, 'edgesOnVertex', edgesOnVertex)
+      !$acc enter data copyin(edgesOnVertex)
+
+      call mpas_pool_get_array(mesh, 'edgesOnVertex_sign', edgesOnVertex_sign)
+      !$acc enter data copyin(edgesOnVertex_sign)
+
+      call mpas_pool_get_array(mesh, 'nEdgesOnEdge', nEdgesOnEdge)
+      !$acc enter data copyin(nEdgesOnEdge)
+
+      call mpas_pool_get_array(mesh, 'weightsOnEdge', weightsOnEdge)
+      !$acc enter data copyin(weightsOnEdge)
+
+      call mpas_pool_get_array(mesh, 'verticesOnCell', verticesOnCell)
+      !$acc enter data copyin(verticesOnCell)
+
+      call mpas_pool_get_array(mesh, 'verticesOnEdge', verticesOnEdge)
+      !$acc enter data copyin(verticesOnEdge)
+
+      call mpas_pool_get_array(mesh, 'invAreaTriangle', invAreaTriangle)
+      !$acc enter data copyin(invAreaTriangle)
+
+      call mpas_pool_get_array(mesh, 'kiteForCell', kiteForCell)
+      !$acc enter data copyin(kiteForCell)
+
+      call mpas_pool_get_array(mesh, 'kiteAreasOnVertex', kiteAreasOnVertex)
+      !$acc enter data copyin(kiteAreasOnVertex)
+
+      call mpas_pool_get_array(mesh, 'fVertex', fVertex)
+      !$acc enter data copyin(fVertex)
+
+      MPAS_ACC_TIMER_STOP('first_compute_solve_diagnostics [ACC_data_xfer]')
+#endif
+
+   end subroutine mpas_atm_pre_computesolvediag_h2d
+
+
+   subroutine mpas_atm_post_computesolvediag_d2h(block)
+
+      implicit none
+
+      type (block_type), intent(inout) :: block
+
+
+#ifdef MPAS_OPENACC
+      type (mpas_pool_type), pointer :: mesh
+      type (mpas_pool_type), pointer :: diag
+      type (mpas_pool_type), pointer :: state
+      type (mpas_pool_type), pointer :: tend_physics
+      real (kind=RKIND), dimension(:,:), pointer :: rthdynten
+
+      real (kind=RKIND), dimension(:,:), pointer :: h_edge, v, vorticity, ke, pv_edge, &
+                                                   pv_vertex, pv_cell, gradPVn, gradPVt, divergence
+      real (kind=RKIND), dimension(:,:), pointer :: u, h
+
+      real (kind=RKIND), dimension(:,:), pointer :: zz
+      real (kind=RKIND), dimension(:,:,:), pointer :: zb_cell
+      real (kind=RKIND), dimension(:,:,:), pointer :: zb3_cell
+      real (kind=RKIND), dimension(:), pointer :: fzm
+      real (kind=RKIND), dimension(:), pointer :: fzp
+      real (kind=RKIND), dimension(:,:,:), pointer :: zb
+      real (kind=RKIND), dimension(:,:,:), pointer :: zb3
+
+
+      real (kind=RKIND), dimension(:), pointer :: dvEdge
+      integer, dimension(:,:), pointer :: cellsOnCell
+      integer, dimension(:,:), pointer :: cellsOnEdge
+      integer, dimension(:,:), pointer :: advCellsForEdge
+      integer, dimension(:,:), pointer :: edgesOnCell
+      integer, dimension(:), pointer :: nAdvCellsForEdge
+      integer, dimension(:), pointer :: nEdgesOnCell
+      real (kind=RKIND), dimension(:,:), pointer :: adv_coefs
+      real (kind=RKIND), dimension(:,:), pointer :: adv_coefs_3rd
+      real (kind=RKIND), dimension(:,:), pointer :: edgesOnCell_sign
+      real (kind=RKIND), dimension(:), pointer :: invAreaCell
+      integer, dimension(:), pointer :: bdyMaskCell
+      integer, dimension(:), pointer :: bdyMaskEdge
+      real (kind=RKIND), dimension(:), pointer :: specZoneMaskEdge
+      real (kind=RKIND), dimension(:), pointer :: invDvEdge
+      real (kind=RKIND), dimension(:), pointer :: dcEdge
+      real (kind=RKIND), dimension(:), pointer :: invDcEdge
+      integer, dimension(:,:), pointer :: edgesOnEdge
+      integer, dimension(:,:), pointer :: edgesOnVertex
+      real (kind=RKIND), dimension(:,:), pointer :: edgesOnVertex_sign
+      integer, dimension(:), pointer :: nEdgesOnEdge
+      real (kind=RKIND), dimension(:,:), pointer :: weightsOnEdge
+      integer, dimension(:,:), pointer :: cellsOnVertex
+      integer, dimension(:,:), pointer :: verticesOnCell
+      integer, dimension(:,:), pointer :: verticesOnEdge
+      real (kind=RKIND), dimension(:), pointer :: invAreaTriangle
+      integer, dimension(:,:), pointer :: kiteForCell
+      real (kind=RKIND), dimension(:,:), pointer :: kiteAreasOnVertex
+      real (kind=RKIND), dimension(:), pointer :: fEdge
+      real (kind=RKIND), dimension(:), pointer :: fVertex
+
+      nullify(mesh)
+      call mpas_pool_get_subpool(block % structs, 'mesh', mesh)
+      nullify(state)
+      call mpas_pool_get_subpool(block % structs, 'state', state)
+      nullify(diag)
+      call mpas_pool_get_subpool(block % structs, 'diag', diag)
+
+      MPAS_ACC_TIMER_START('first_compute_solve_diagnostics [ACC_data_xfer]')
+
+      call mpas_pool_get_array(state, 'rho_zz', h, 1)
+      !$acc exit data copyout(h)
+      call mpas_pool_get_array(state, 'u', u, 1)
+      !$acc exit data copyout(u)
+
+      call mpas_pool_get_array(diag, 'v', v)
+      !$acc exit data copyout(v)
+      call mpas_pool_get_array(diag, 'rho_edge', h_edge)
+      !$acc exit data copyout(h_edge)
+      call mpas_pool_get_array(diag, 'vorticity', vorticity)
+      !$acc exit data copyout(vorticity)
+      call mpas_pool_get_array(diag, 'divergence', divergence)
+      !$acc exit data copyout(divergence)
+      call mpas_pool_get_array(diag, 'ke', ke)
+      !$acc exit data copyout(ke)
+      call mpas_pool_get_array(diag, 'pv_edge', pv_edge)
+      !$acc exit data copyout(pv_edge)
+      call mpas_pool_get_array(diag, 'pv_vertex', pv_vertex)
+      !$acc exit data copyout(pv_vertex)
+      call mpas_pool_get_array(diag, 'pv_cell', pv_cell)
+      !$acc exit data copyout(pv_cell)
+      call mpas_pool_get_array(diag, 'gradPVn', gradPVn)
+      !$acc exit data copyout(gradPVn)
+      call mpas_pool_get_array(diag, 'gradPVt', gradPVt)
+      !$acc exit data copyout(gradPVt)
+
+      ! Required by atm_init_coupled_diagnostics
+      call mpas_pool_get_array(mesh, 'zz', zz)
+      !$acc exit data delete(zz)
+
+      call mpas_pool_get_array(mesh, 'zb_cell', zb_cell)
+      !$acc exit data delete(zb_cell)
+
+      call mpas_pool_get_array(mesh, 'zb3_cell', zb3_cell)
+      !$acc exit data delete(zb3_cell)
+
+      call mpas_pool_get_array(mesh, 'fzm', fzm)
+      !$acc exit data delete(fzm)
+
+      call mpas_pool_get_array(mesh, 'fzp', fzp)
+      !$acc exit data delete(fzp)
+
+      call mpas_pool_get_array(mesh, 'zb', zb)
+      !$acc exit data delete(zb)
+
+      call mpas_pool_get_array(mesh, 'zb3', zb3)
+      !$acc exit data delete(zb3)
+
+
+      call mpas_pool_get_array(mesh, 'dvEdge', dvEdge)
+      !$acc exit data delete(dvEdge)
+
+      call mpas_pool_get_array(mesh, 'cellsOnEdge', cellsOnEdge)
+      !$acc exit data delete(cellsOnEdge)
+
+      call mpas_pool_get_array(mesh, 'edgesOnCell', edgesOnCell)
+      !$acc exit data delete(edgesOnCell)
+
+      call mpas_pool_get_array(mesh, 'nEdgesOnCell', nEdgesOnCell)
+      !$acc exit data delete(nEdgesOnCell)
+
+      call mpas_pool_get_array(mesh, 'edgesOnCell_sign', edgesOnCell_sign)
+      !$acc exit data delete(edgesOnCell_sign)
+
+      call mpas_pool_get_array(mesh, 'invAreaCell', invAreaCell)
+      !$acc exit data delete(invAreaCell)
+
+      call mpas_pool_get_array(mesh, 'invDvEdge', invDvEdge)
+      !$acc exit data delete(invDvEdge)
+
+      call mpas_pool_get_array(mesh, 'dcEdge', dcEdge)
+      !$acc exit data delete(dcEdge)
+
+      call mpas_pool_get_array(mesh, 'invDcEdge', invDcEdge)
+      !$acc exit data delete(invDcEdge)
+
+      call mpas_pool_get_array(mesh, 'edgesOnEdge', edgesOnEdge)
+      !$acc exit data delete(edgesOnEdge)
+
+      call mpas_pool_get_array(mesh, 'edgesOnVertex', edgesOnVertex)
+      !$acc exit data delete(edgesOnVertex)
+
+      call mpas_pool_get_array(mesh, 'edgesOnVertex_sign', edgesOnVertex_sign)
+      !$acc exit data delete(edgesOnVertex_sign)
+
+      call mpas_pool_get_array(mesh, 'nEdgesOnEdge', nEdgesOnEdge)
+      !$acc exit data delete(nEdgesOnEdge)
+
+      call mpas_pool_get_array(mesh, 'weightsOnEdge', weightsOnEdge)
+      !$acc exit data delete(weightsOnEdge)
+
+      call mpas_pool_get_array(mesh, 'verticesOnCell', verticesOnCell)
+      !$acc exit data delete(verticesOnCell)
+
+      call mpas_pool_get_array(mesh, 'verticesOnEdge', verticesOnEdge)
+      !$acc exit data delete(verticesOnEdge)
+
+      call mpas_pool_get_array(mesh, 'invAreaTriangle', invAreaTriangle)
+      !$acc exit data delete(invAreaTriangle)
+
+      call mpas_pool_get_array(mesh, 'kiteForCell', kiteForCell)
+      !$acc exit data delete(kiteForCell)
+
+      call mpas_pool_get_array(mesh, 'kiteAreasOnVertex', kiteAreasOnVertex)
+      !$acc exit data delete(kiteAreasOnVertex)
+
+      call mpas_pool_get_array(mesh, 'fVertex', fVertex)
+      !$acc exit data delete(fVertex)
+
+      MPAS_ACC_TIMER_STOP('first_compute_solve_diagnostics [ACC_data_xfer]')
+#endif
+
+   end subroutine mpas_atm_post_computesolvediag_d2h
+
+   subroutine mpas_atm_pre_dynamics_h2d(domain)
+
+      implicit none
+
+      type (domain_type), intent(inout) :: domain
+
+
+#ifdef MPAS_OPENACC
+      type (mpas_pool_type), pointer :: mesh
+      type (mpas_pool_type), pointer :: state
+      type (mpas_pool_type), pointer :: diag
+      type (mpas_pool_type), pointer :: tend
+      type (mpas_pool_type), pointer :: tend_physics
+      type (mpas_pool_type), pointer :: lbc
+
+      logical, pointer :: config_apply_lbcs_ptr
+      logical :: config_apply_lbcs
+
+      real (kind=RKIND), dimension(:,:), pointer :: ru, ru_p
+      real (kind=RKIND), dimension(:,:), pointer :: ru_save
+      real (kind=RKIND), dimension(:,:), pointer :: rw, rw_p
+      real (kind=RKIND), dimension(:,:), pointer :: rw_save
+      real (kind=RKIND), dimension(:,:), pointer :: rtheta_p
+      real (kind=RKIND), dimension(:,:), pointer :: exner, exner_base
+      real (kind=RKIND), dimension(:,:), pointer :: rtheta_base, rho_base
+      real (kind=RKIND), dimension(:,:), pointer :: rtheta_p_save
+      real (kind=RKIND), dimension(:,:), pointer :: rho_p, rho_pp, rho, theta, theta_base
+      real (kind=RKIND), dimension(:,:), pointer :: rho_p_save
+      real (kind=RKIND), dimension(:,:), pointer :: rho_zz_old_split
+      real (kind=RKIND), dimension(:,:), pointer :: cqw, rtheta_pp_old, rtheta_pp
+      real (kind=RKIND), dimension(:,:), pointer :: cqu, pressure_base, pressure_p, pressure, v
+      real (kind=RKIND), dimension(:,:), pointer :: kdiff, pv_edge, pv_vertex, pv_cell, rho_edge, h_divergence, ke
+      real (kind=RKIND), dimension(:,:), pointer :: cofwr, cofwz, coftz, cofwt, a_tri, alpha_tri, gamma_tri
+      real (kind=RKIND), dimension(:), pointer :: cofrz
+      real (kind=RKIND), dimension(:,:), pointer :: gradPVn, gradPVt
+
+
+      real (kind=RKIND), dimension(:,:), pointer :: u_1, u_2
+      real (kind=RKIND), dimension(:,:), pointer :: w_1, w_2
+      real (kind=RKIND), dimension(:,:), pointer :: theta_m_1, theta_m_2
+      real (kind=RKIND), dimension(:,:), pointer :: rho_zz_1, rho_zz_2
+      real (kind=RKIND), dimension(:,:,:), pointer :: scalars_1, scalars_2
+      real (kind=RKIND), dimension(:,:), pointer :: ruAvg, wwAvg, ruAvg_split, wwAvg_split
+
+      integer, pointer :: nCells_ptr
+      integer :: nCells
+      real (kind=RKIND), dimension(:,:), pointer :: uReconstructZonal, uReconstructMeridional, uReconstructX, uReconstructY, uReconstructZ
+
+      real (kind=RKIND), dimension(:,:), pointer :: tend_ru, tend_rt, tend_rho, tend_rw, rt_diabatic_tend
+      real (kind=RKIND), dimension(:,:), pointer :: tend_u_euler, tend_w_euler, tend_theta_euler
+      real(kind=RKIND), dimension(:,:), pointer :: tend_w_pgf, tend_w_buoy
+      real(kind=RKIND), dimension(:,:,:), pointer :: scalar_tend_save
+
+      real (kind=RKIND), dimension(:,:), pointer :: rthdynten, divergence, vorticity
+
+      real (kind=RKIND), dimension(:,:), pointer :: lbc_u, lbc_w, lbc_ru, lbc_rho_edge, lbc_rho, lbc_rtheta_m, lbc_rho_zz, lbc_theta
+      real (kind=RKIND), dimension(:,:), pointer :: lbc_tend_u, lbc_tend_w, lbc_tend_ru, lbc_tend_rho_edge, lbc_tend_rho
+      real (kind=RKIND), dimension(:,:), pointer :: lbc_tend_rtheta_m, lbc_tend_rho_zz, lbc_tend_theta
+
+      real (kind=RKIND), dimension(:,:,:), pointer :: lbc_scalars, lbc_tend_scalars
+
+      nullify(mesh)
+      nullify(state)
+      nullify(diag)
+      nullify(tend)
+      nullify(tend_physics)
+      nullify(lbc)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'mesh', mesh)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'state', state)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'diag', diag)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'tend', tend)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'tend_physics', tend_physics)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'lbc', lbc)
+
+      call mpas_pool_get_config(domain % blocklist % configs, 'config_apply_lbcs', config_apply_lbcs_ptr)
+      config_apply_lbcs = config_apply_lbcs_ptr
+
+      MPAS_ACC_TIMER_START('atm_srk3 [ACC_data_xfer]')
+      call mpas_pool_get_array(diag, 'ru', ru)
+      !$acc enter data copyin(ru) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'ru_p', ru_p)
+      !$acc enter data copyin(ru_p)
+      call mpas_pool_get_array(diag, 'ru_save', ru_save)
+      !$acc enter data copyin(ru_save)
+      call mpas_pool_get_array(diag, 'rw', rw)
+      !$acc enter data copyin(rw) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rw_p', rw_p)
+      !$acc enter data copyin(rw_p)
+      call mpas_pool_get_array(diag, 'rw_save', rw_save)
+      !$acc enter data copyin(rw_save)
+      call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p)
+      !$acc enter data copyin(rtheta_p) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rtheta_p_save', rtheta_p_save)
+      !$acc enter data copyin(rtheta_p_save)
+      call mpas_pool_get_array(diag, 'exner', exner)
+      !$acc enter data copyin(exner) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'exner_base', exner_base)
+      !$acc enter data copyin(exner_base) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rtheta_base', rtheta_base)
+      !$acc enter data copyin(rtheta_base) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rho_base', rho_base)
+      !$acc enter data copyin(rho_base)
+      call mpas_pool_get_array(diag, 'rho', rho)
+      !$acc enter data copyin(rho)
+      call mpas_pool_get_array(diag, 'theta', theta)
+      !$acc enter data copyin(theta)
+      call mpas_pool_get_array(diag, 'theta_base', theta_base)
+      !$acc enter data copyin(theta_base)
+      call mpas_pool_get_array(diag, 'rho_p', rho_p)
+      !$acc enter data copyin(rho_p) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rho_p_save', rho_p_save)
+      !$acc enter data copyin(rho_p_save)
+      call mpas_pool_get_array(diag, 'rho_pp', rho_pp)
+      !$acc enter data copyin(rho_pp)
+      call mpas_pool_get_array(diag, 'rho_zz_old_split', rho_zz_old_split)
+      !$acc enter data copyin(rho_zz_old_split)
+      call mpas_pool_get_array(diag, 'cqw', cqw)
+      !$acc enter data copyin(cqw)
+      call mpas_pool_get_array(diag, 'cqu', cqu)
+      !$acc enter data copyin(cqu)
+      call mpas_pool_get_array(diag, 'pressure_p', pressure_p)
+      !$acc enter data copyin(pressure_p) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'pressure_base', pressure_base)
+      !$acc enter data copyin(pressure_base) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'pressure', pressure)
+      !$acc enter data copyin(pressure)
+      call mpas_pool_get_array(diag, 'v', v)
+      !$acc enter data copyin(v) ! use values from atm_compute_solve_diagnostics
+      call mpas_pool_get_array(diag, 'rtheta_pp', rtheta_pp)
+      !$acc enter data copyin(rtheta_pp)
+      call mpas_pool_get_array(diag, 'rtheta_pp_old', rtheta_pp_old)
+      !$acc enter data copyin(rtheta_pp_old)
+      call mpas_pool_get_array(diag, 'kdiff', kdiff)
+      !$acc enter data copyin(kdiff)
+      call mpas_pool_get_array(diag, 'pv_edge', pv_edge)
+      !$acc enter data copyin(pv_edge) ! use values from atm_compute_solve_diagnostics
+      call mpas_pool_get_array(diag, 'pv_vertex', pv_vertex)
+      !$acc enter data copyin(pv_vertex) 
+      call mpas_pool_get_array(diag, 'pv_cell', pv_cell)
+      !$acc enter data copyin(pv_cell) 
+      call mpas_pool_get_array(diag, 'rho_edge', rho_edge)
+      !$acc enter data copyin(rho_edge) ! use values from atm_compute_solve_diagnostics
+      call mpas_pool_get_array(diag, 'h_divergence', h_divergence)
+      !$acc enter data copyin(h_divergence)
+      call mpas_pool_get_array(diag, 'ke', ke)
+      !$acc enter data copyin(ke) ! use values from atm_compute_solve_diagnostics
+      call mpas_pool_get_array(diag, 'gradPVn', gradPVn)
+      !$acc enter data copyin(gradPVn)
+      call mpas_pool_get_array(diag, 'gradPVt', gradPVt)
+      !$acc enter data copyin(gradPVt)
+
+      call mpas_pool_get_array(diag, 'alpha_tri', alpha_tri)
+      !$acc enter data copyin(alpha_tri)
+      call mpas_pool_get_array(diag, 'gamma_tri', gamma_tri)
+      !$acc enter data copyin(gamma_tri)
+      call mpas_pool_get_array(diag, 'a_tri', a_tri)
+      !$acc enter data copyin(a_tri)
+      call mpas_pool_get_array(diag, 'cofwr', cofwr)
+      !$acc enter data copyin(cofwr)
+      call mpas_pool_get_array(diag, 'cofwz', cofwz)
+      !$acc enter data copyin(cofwz)
+      call mpas_pool_get_array(diag, 'coftz', coftz)
+      !$acc enter data copyin(coftz)
+      call mpas_pool_get_array(diag, 'cofwt', cofwt)
+      !$acc enter data copyin(cofwt)
+      call mpas_pool_get_array(diag, 'cofrz', cofrz)
+      !$acc enter data copyin(cofrz)
+      call mpas_pool_get_array(diag, 'vorticity', vorticity)
+      !$acc enter data copyin(vorticity)
+      call mpas_pool_get_array(diag, 'divergence', divergence)
+      !$acc enter data copyin(divergence)
+      call mpas_pool_get_array(diag, 'ruAvg', ruAvg)
+      !$acc enter data copyin(ruAvg)
+      call mpas_pool_get_array(diag, 'ruAvg_split', ruAvg_split)
+      !$acc enter data copyin(ruAvg_split)
+      call mpas_pool_get_array(diag, 'wwAvg', wwAvg)
+      !$acc enter data copyin(wwAvg)
+      call mpas_pool_get_array(diag, 'wwAvg_split', wwAvg_split)
+      !$acc enter data copyin(wwAvg_split)
+
+      call mpas_pool_get_dimension(mesh, 'nCellsSolve', nCells_ptr)
+      nCells = nCells_ptr
+      call mpas_pool_get_array(diag, 'uReconstructX', uReconstructX)
+      !$acc enter data create(uReconstructX(:,1:nCells))
+      call mpas_pool_get_array(diag, 'uReconstructY', uReconstructY)
+      !$acc enter data create(uReconstructY(:,1:nCells))
+      call mpas_pool_get_array(diag, 'uReconstructZ', uReconstructZ)
+      !$acc enter data create(uReconstructZ(:,1:nCells))
+      call mpas_pool_get_array(diag, 'uReconstructZonal', uReconstructZonal)
+      !$acc enter data copyin(uReconstructZonal(:,1:nCells))
+      call mpas_pool_get_array(diag, 'uReconstructMeridional', uReconstructMeridional)
+      !$acc enter data copyin(uReconstructMeridional(:,1:nCells))
+
+      call mpas_pool_get_array(state, 'u', u_1, 1)
+      !$acc enter data copyin(u_1)
+      call mpas_pool_get_array(state, 'u', u_2, 2)
+      !$acc enter data copyin(u_2)
+      call mpas_pool_get_array(state, 'w', w_1, 1)
+      !$acc enter data copyin(w_1)
+      call mpas_pool_get_array(state, 'w', w_2, 2)
+      !$acc enter data copyin(w_2)
+      call mpas_pool_get_array(state, 'theta_m', theta_m_1, 1)  
+      !$acc enter data copyin(theta_m_1)   ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(state, 'theta_m', theta_m_2, 2)
+      !$acc enter data copyin(theta_m_2)
+      call mpas_pool_get_array(state, 'rho_zz', rho_zz_1, 1)
+      !$acc enter data copyin(rho_zz_1)
+      call mpas_pool_get_array(state, 'rho_zz', rho_zz_2, 2)
+      !$acc enter data copyin(rho_zz_2)
+      call mpas_pool_get_array(state, 'scalars', scalars_1, 1)
+      !$acc enter data copyin(scalars_1)
+      call mpas_pool_get_array(state, 'scalars', scalars_2, 2)
+      !$acc enter data copyin(scalars_2)
+
+
+      call mpas_pool_get_array(tend, 'u', tend_ru)
+      !$acc enter data copyin(tend_ru)
+      call mpas_pool_get_array(tend, 'rho_zz', tend_rho)
+      !$acc enter data copyin(tend_rho)
+      call mpas_pool_get_array(tend, 'theta_m', tend_rt)
+      !$acc enter data copyin(tend_rt)
+      call mpas_pool_get_array(tend, 'w', tend_rw)
+      !$acc enter data copyin(tend_rw)
+      call mpas_pool_get_array(tend, 'rt_diabatic_tend', rt_diabatic_tend)
+      !$acc enter data copyin(rt_diabatic_tend)
+      call mpas_pool_get_array(tend, 'u_euler', tend_u_euler)
+      !$acc enter data copyin(tend_u_euler)
+      call mpas_pool_get_array(tend, 'theta_euler', tend_theta_euler)
+      !$acc enter data copyin(tend_theta_euler)
+      call mpas_pool_get_array(tend, 'w_euler', tend_w_euler)
+      !$acc enter data copyin(tend_w_euler)
+      call mpas_pool_get_array(tend, 'w_pgf', tend_w_pgf)
+      !$acc enter data copyin(tend_w_pgf)
+      call mpas_pool_get_array(tend, 'w_buoy', tend_w_buoy)
+      !$acc enter data copyin(tend_w_buoy)
+      call mpas_pool_get_array(tend, 'scalars_tend', scalar_tend_save)
+      !$acc enter data copyin(scalar_tend_save)
+
+
+      if(config_apply_lbcs) then
+         call mpas_pool_get_array(lbc, 'lbc_u', lbc_u, 2)
+         !$acc enter data copyin(lbc_u)
+         call mpas_pool_get_array(lbc, 'lbc_w', lbc_w, 2)
+         !$acc enter data copyin(lbc_w)
+         call mpas_pool_get_array(lbc, 'lbc_ru', lbc_ru, 2)
+         !$acc enter data copyin(lbc_ru)
+         call mpas_pool_get_array(lbc, 'lbc_rho_edge', lbc_rho_edge, 2)
+         !$acc enter data copyin(lbc_rho_edge)
+         call mpas_pool_get_array(lbc, 'lbc_theta', lbc_theta, 2)
+         !$acc enter data copyin(lbc_theta)
+         call mpas_pool_get_array(lbc, 'lbc_rtheta_m', lbc_rtheta_m, 2)
+         !$acc enter data copyin(lbc_rtheta_m)
+         call mpas_pool_get_array(lbc, 'lbc_rho_zz', lbc_rho_zz, 2)
+         !$acc enter data copyin(lbc_rho_zz)
+         call mpas_pool_get_array(lbc, 'lbc_rho', lbc_rho, 2)
+         !$acc enter data copyin(lbc_rho)
+         call mpas_pool_get_array(lbc, 'lbc_scalars', lbc_scalars, 2)
+         !$acc enter data copyin(lbc_scalars)
+
+         
+         call mpas_pool_get_array(lbc, 'lbc_u', lbc_tend_u, 1)
+         !$acc enter data copyin(lbc_tend_u)
+         call mpas_pool_get_array(lbc, 'lbc_ru', lbc_tend_ru, 1)
+         !$acc enter data copyin(lbc_tend_ru)
+         call mpas_pool_get_array(lbc, 'lbc_rho_edge', lbc_tend_rho_edge, 1)
+         !$acc enter data copyin(lbc_tend_rho_edge)
+         call mpas_pool_get_array(lbc, 'lbc_w', lbc_tend_w, 1)
+         !$acc enter data copyin(lbc_tend_w)
+         call mpas_pool_get_array(lbc, 'lbc_theta', lbc_tend_theta, 1)
+         !$acc enter data copyin(lbc_tend_theta)
+         call mpas_pool_get_array(lbc, 'lbc_rtheta_m', lbc_tend_rtheta_m, 1)
+         !$acc enter data copyin(lbc_tend_rtheta_m)
+         call mpas_pool_get_array(lbc, 'lbc_rho_zz', lbc_tend_rho_zz, 1)
+         !$acc enter data copyin(lbc_tend_rho_zz)
+         call mpas_pool_get_array(lbc, 'lbc_rho', lbc_tend_rho, 1)
+         !$acc enter data copyin(lbc_tend_rho)
+         call mpas_pool_get_array(lbc, 'lbc_scalars', lbc_tend_scalars, 1)
+         !$acc enter data copyin(lbc_tend_scalars)
+      end if
+
+      call mpas_pool_get_array(tend_physics, 'rthdynten', rthdynten)
+      !$acc enter data copyin(rthdynten)
+
+      MPAS_ACC_TIMER_STOP('atm_srk3 [ACC_data_xfer]')
+#endif
+
+   end subroutine mpas_atm_pre_dynamics_h2d
+
+
+   subroutine mpas_atm_post_dynamics_d2h(domain)
+
+      implicit none
+
+      type (domain_type), intent(inout) :: domain
+
+
+#ifdef MPAS_OPENACC
+      type (mpas_pool_type), pointer :: mesh
+      type (mpas_pool_type), pointer :: state
+      type (mpas_pool_type), pointer :: diag
+      type (mpas_pool_type), pointer :: tend
+      type (mpas_pool_type), pointer :: tend_physics
+      type (mpas_pool_type), pointer :: lbc
+
+      logical, pointer :: config_apply_lbcs_ptr
+      logical :: config_apply_lbcs
+
+      real (kind=RKIND), dimension(:,:), pointer :: ru, ru_p
+      real (kind=RKIND), dimension(:,:), pointer :: ru_save
+      real (kind=RKIND), dimension(:,:), pointer :: rw, rw_p
+      real (kind=RKIND), dimension(:,:), pointer :: rw_save
+      real (kind=RKIND), dimension(:,:), pointer :: rtheta_p
+      real (kind=RKIND), dimension(:,:), pointer :: exner, exner_base
+      real (kind=RKIND), dimension(:,:), pointer :: rtheta_base, rho_base
+      real (kind=RKIND), dimension(:,:), pointer :: rtheta_p_save
+      real (kind=RKIND), dimension(:,:), pointer :: rho_p, rho_pp, rho, theta, theta_base
+      real (kind=RKIND), dimension(:,:), pointer :: rho_p_save
+      real (kind=RKIND), dimension(:,:), pointer :: rho_zz_old_split
+      real (kind=RKIND), dimension(:,:), pointer :: cqw, rtheta_pp_old, rtheta_pp
+      real (kind=RKIND), dimension(:,:), pointer :: cqu, pressure_base, pressure_p, pressure, v
+      real (kind=RKIND), dimension(:,:), pointer :: kdiff, pv_edge, pv_vertex, pv_cell, rho_edge, h_divergence, ke
+      real (kind=RKIND), dimension(:,:), pointer :: cofwr, cofwz, coftz, cofwt, a_tri, alpha_tri, gamma_tri
+      real (kind=RKIND), dimension(:), pointer :: cofrz
+      real (kind=RKIND), dimension(:,:), pointer :: gradPVn, gradPVt
+
+
+      real (kind=RKIND), dimension(:,:), pointer :: u_1, u_2
+      real (kind=RKIND), dimension(:,:), pointer :: w_1, w_2
+      real (kind=RKIND), dimension(:,:), pointer :: theta_m_1, theta_m_2
+      real (kind=RKIND), dimension(:,:), pointer :: rho_zz_1, rho_zz_2
+      real (kind=RKIND), dimension(:,:,:), pointer :: scalars_1, scalars_2
+      real (kind=RKIND), dimension(:,:), pointer :: ruAvg, wwAvg, ruAvg_split, wwAvg_split
+
+      integer, pointer :: nCells_ptr
+      integer :: nCells
+      real (kind=RKIND), dimension(:,:), pointer :: uReconstructZonal, uReconstructMeridional, uReconstructX, uReconstructY, uReconstructZ
+
+      real (kind=RKIND), dimension(:,:), pointer :: tend_ru, tend_rt, tend_rho, tend_rw, rt_diabatic_tend
+      real (kind=RKIND), dimension(:,:), pointer :: tend_u_euler, tend_w_euler, tend_theta_euler
+      real(kind=RKIND), dimension(:,:), pointer :: tend_w_pgf, tend_w_buoy
+      real(kind=RKIND), dimension(:,:,:), pointer :: scalar_tend_save
+
+      real (kind=RKIND), dimension(:,:), pointer :: rthdynten, divergence, vorticity
+
+      real (kind=RKIND), dimension(:,:), pointer :: lbc_u, lbc_w, lbc_ru, lbc_rho_edge, lbc_rho, lbc_rtheta_m, lbc_rho_zz, lbc_theta
+      real (kind=RKIND), dimension(:,:), pointer :: lbc_tend_u, lbc_tend_w, lbc_tend_ru, lbc_tend_rho_edge, lbc_tend_rho
+      real (kind=RKIND), dimension(:,:), pointer :: lbc_tend_rtheta_m, lbc_tend_rho_zz, lbc_tend_theta
+
+      real (kind=RKIND), dimension(:,:,:), pointer :: lbc_scalars, lbc_tend_scalars
+
+      nullify(mesh)
+      nullify(state)
+      nullify(diag)
+      nullify(tend)
+      nullify(tend_physics)
+      nullify(lbc)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'mesh', mesh)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'state', state)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'diag', diag)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'tend', tend)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'tend_physics', tend_physics)
+      call mpas_pool_get_subpool(domain % blocklist % structs, 'lbc', lbc)
+
+      call mpas_pool_get_config(domain % blocklist % configs, 'config_apply_lbcs', config_apply_lbcs_ptr)
+      config_apply_lbcs = config_apply_lbcs_ptr
+
+      MPAS_ACC_TIMER_START('atm_srk3 [ACC_data_xfer]')
+      call mpas_pool_get_array(diag, 'ru', ru)
+      !$acc exit data copyout(ru) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'ru_p', ru_p)
+      !$acc exit data copyout(ru_p)
+      call mpas_pool_get_array(diag, 'ru_save', ru_save)
+      !$acc exit data delete(ru_save)
+      call mpas_pool_get_array(diag, 'rw', rw)
+      !$acc exit data copyout(rw) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rw_p', rw_p)
+      !$acc exit data copyout(rw_p)
+      call mpas_pool_get_array(diag, 'rw_save', rw_save)
+      !$acc exit data delete(rw_save)
+      call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p)
+      !$acc exit data copyout(rtheta_p) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rtheta_p_save', rtheta_p_save)
+      !$acc exit data delete(rtheta_p_save)
+      call mpas_pool_get_array(diag, 'exner', exner)
+      !$acc exit data copyout(exner) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'exner_base', exner_base)
+      !$acc exit data copyout(exner_base) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rtheta_base', rtheta_base)
+      !$acc exit data copyout(rtheta_base) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rho_base', rho_base)
+      !$acc exit data copyout(rho_base)
+      call mpas_pool_get_array(diag, 'rho', rho)
+      !$acc exit data copyout(rho)
+      call mpas_pool_get_array(diag, 'theta', theta)
+      !$acc exit data copyout(theta)
+      call mpas_pool_get_array(diag, 'theta_base', theta_base)
+      !$acc exit data copyout(theta_base)
+      call mpas_pool_get_array(diag, 'rho_p', rho_p)
+      !$acc exit data copyout(rho_p) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'rho_p_save', rho_p_save)
+      !$acc exit data delete(rho_p_save)
+      call mpas_pool_get_array(diag, 'rho_pp', rho_pp)
+      !$acc exit data copyout(rho_pp)
+      call mpas_pool_get_array(diag, 'rho_zz_old_split', rho_zz_old_split)
+      !$acc exit data delete(rho_zz_old_split)
+      call mpas_pool_get_array(diag, 'cqw', cqw)
+      !$acc exit data delete(cqw)
+      call mpas_pool_get_array(diag, 'cqu', cqu)
+      !$acc exit data copyout(cqu)
+      call mpas_pool_get_array(diag, 'pressure_p', pressure_p)
+      !$acc exit data copyout(pressure_p) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'pressure_base', pressure_base)
+      !$acc exit data copyout(pressure_base) ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(diag, 'pressure', pressure)
+      !$acc exit data copyout(pressure)
+      call mpas_pool_get_array(diag, 'v', v)
+      !$acc exit data copyout(v) ! use values from atm_compute_solve_diagnostics
+      call mpas_pool_get_array(diag, 'rtheta_pp', rtheta_pp)
+      !$acc exit data copyout(rtheta_pp)
+      call mpas_pool_get_array(diag, 'rtheta_pp_old', rtheta_pp_old)
+      !$acc exit data copyout(rtheta_pp_old)
+      call mpas_pool_get_array(diag, 'kdiff', kdiff)
+      !$acc exit data copyout(kdiff)
+      call mpas_pool_get_array(diag, 'pv_edge', pv_edge)
+      !$acc exit data copyout(pv_edge) ! use values from atm_compute_solve_diagnostics
+      call mpas_pool_get_array(diag, 'pv_vertex', pv_vertex)
+      !$acc exit data copyout(pv_vertex) 
+      call mpas_pool_get_array(diag, 'pv_cell', pv_cell)
+      !$acc exit data delete(pv_cell) 
+      call mpas_pool_get_array(diag, 'rho_edge', rho_edge)
+      !$acc exit data copyout(rho_edge) ! use values from atm_compute_solve_diagnostics
+      call mpas_pool_get_array(diag, 'h_divergence', h_divergence)
+      !$acc exit data copyout(h_divergence)
+      call mpas_pool_get_array(diag, 'ke', ke)
+      !$acc exit data copyout(ke) ! use values from atm_compute_solve_diagnostics
+      call mpas_pool_get_array(diag, 'gradPVn', gradPVn)
+      !$acc exit data delete(gradPVn)
+      call mpas_pool_get_array(diag, 'gradPVt', gradPVt)
+      !$acc exit data delete(gradPVt)
+
+      call mpas_pool_get_array(diag, 'alpha_tri', alpha_tri)
+      !$acc exit data delete(alpha_tri)
+      call mpas_pool_get_array(diag, 'gamma_tri', gamma_tri)
+      !$acc exit data delete(gamma_tri)
+      call mpas_pool_get_array(diag, 'a_tri', a_tri)
+      !$acc exit data delete(a_tri)
+      call mpas_pool_get_array(diag, 'cofwr', cofwr)
+      !$acc exit data delete(cofwr)
+      call mpas_pool_get_array(diag, 'cofwz', cofwz)
+      !$acc exit data delete(cofwz)
+      call mpas_pool_get_array(diag, 'coftz', coftz)
+      !$acc exit data delete(coftz)
+      call mpas_pool_get_array(diag, 'cofwt', cofwt)
+      !$acc exit data delete(cofwt)
+      call mpas_pool_get_array(diag, 'cofrz', cofrz)
+      !$acc exit data delete(cofrz)
+      call mpas_pool_get_array(diag, 'vorticity', vorticity)
+      !$acc exit data copyout(vorticity)
+      call mpas_pool_get_array(diag, 'divergence', divergence)
+      !$acc exit data copyout(divergence)
+      call mpas_pool_get_array(diag, 'ruAvg', ruAvg)
+      !$acc exit data copyout(ruAvg)
+      call mpas_pool_get_array(diag, 'ruAvg_split', ruAvg_split)
+      !$acc exit data copyout(ruAvg_split)
+      call mpas_pool_get_array(diag, 'wwAvg', wwAvg)
+      !$acc exit data copyout(wwAvg)
+      call mpas_pool_get_array(diag, 'wwAvg_split', wwAvg_split)
+      !$acc exit data copyout(wwAvg_split)
+
+      call mpas_pool_get_dimension(mesh, 'nCellsSolve', nCells_ptr)
+      nCells = nCells_ptr
+      call mpas_pool_get_array(diag, 'uReconstructX', uReconstructX)
+      !$acc exit data copyout(uReconstructX(:,1:nCells))
+      call mpas_pool_get_array(diag, 'uReconstructY', uReconstructY)
+      !$acc exit data copyout(uReconstructY(:,1:nCells))
+      call mpas_pool_get_array(diag, 'uReconstructZ', uReconstructZ)
+      !$acc exit data copyout(uReconstructZ(:,1:nCells))
+      call mpas_pool_get_array(diag, 'uReconstructZonal', uReconstructZonal)
+      !$acc exit data copyout(uReconstructZonal(:,1:nCells))
+      call mpas_pool_get_array(diag, 'uReconstructMeridional', uReconstructMeridional)
+      !$acc exit data copyout(uReconstructMeridional(:,1:nCells))
+
+      call mpas_pool_get_array(state, 'u', u_1, 1)
+      !$acc exit data copyout(u_1)
+      call mpas_pool_get_array(state, 'u', u_2, 2)
+      !$acc exit data copyout(u_2)
+      call mpas_pool_get_array(state, 'w', w_1, 1)
+      !$acc exit data copyout(w_1)
+      call mpas_pool_get_array(state, 'w', w_2, 2)
+      !$acc exit data copyout(w_2)
+      call mpas_pool_get_array(state, 'theta_m', theta_m_1, 1)  
+      !$acc exit data copyout(theta_m_1)   ! use values from atm_init_coupled_diagnostics
+      call mpas_pool_get_array(state, 'theta_m', theta_m_2, 2)
+      !$acc exit data copyout(theta_m_2)   ! Delete gives incorrect results
+      call mpas_pool_get_array(state, 'rho_zz', rho_zz_1, 1)
+      !$acc exit data copyout(rho_zz_1)
+      call mpas_pool_get_array(state, 'rho_zz', rho_zz_2, 2)
+      !$acc exit data copyout(rho_zz_2)
+      call mpas_pool_get_array(state, 'scalars', scalars_1, 1)
+      !$acc exit data copyout(scalars_1)
+      call mpas_pool_get_array(state, 'scalars', scalars_2, 2)
+      !$acc exit data copyout(scalars_2)   ! Delete gives incorrect results
+
+
+      call mpas_pool_get_array(tend, 'u', tend_ru)
+      !$acc exit data copyout(tend_ru)
+      call mpas_pool_get_array(tend, 'rho_zz', tend_rho)
+      !$acc exit data copyout(tend_rho)
+      call mpas_pool_get_array(tend, 'theta_m', tend_rt)
+      !$acc exit data copyout(tend_rt)
+      call mpas_pool_get_array(tend, 'w', tend_rw)
+      !$acc exit data copyout(tend_rw)
+      call mpas_pool_get_array(tend, 'rt_diabatic_tend', rt_diabatic_tend)
+      !$acc exit data copyout(rt_diabatic_tend)
+      call mpas_pool_get_array(tend, 'u_euler', tend_u_euler)
+      !$acc exit data copyout(tend_u_euler)
+      call mpas_pool_get_array(tend, 'theta_euler', tend_theta_euler)
+      !$acc exit data copyout(tend_theta_euler)
+      call mpas_pool_get_array(tend, 'w_euler', tend_w_euler)
+      !$acc exit data copyout(tend_w_euler)
+      call mpas_pool_get_array(tend, 'w_pgf', tend_w_pgf)
+      !$acc exit data copyout(tend_w_pgf)
+      call mpas_pool_get_array(tend, 'w_buoy', tend_w_buoy)
+      !$acc exit data copyout(tend_w_buoy)
+      call mpas_pool_get_array(tend, 'scalars_tend', scalar_tend_save)
+      !$acc exit data copyout(scalar_tend_save)
+
+      if(config_apply_lbcs) then
+         call mpas_pool_get_array(lbc, 'lbc_u', lbc_u, 2)
+         !$acc exit data delete(lbc_u)
+         call mpas_pool_get_array(lbc, 'lbc_w', lbc_w, 2)
+         !$acc exit data delete(lbc_w)
+         call mpas_pool_get_array(lbc, 'lbc_ru', lbc_ru, 2)
+         !$acc exit data delete(lbc_ru)
+         call mpas_pool_get_array(lbc, 'lbc_rho_edge', lbc_rho_edge, 2)
+         !$acc exit data delete(lbc_rho_edge)
+         call mpas_pool_get_array(lbc, 'lbc_theta', lbc_theta, 2)
+         !$acc exit data delete(lbc_theta)
+         call mpas_pool_get_array(lbc, 'lbc_rtheta_m', lbc_rtheta_m, 2)
+         !$acc exit data delete(lbc_rtheta_m)
+         call mpas_pool_get_array(lbc, 'lbc_rho_zz', lbc_rho_zz, 2)
+         !$acc exit data delete(lbc_rho_zz)
+         call mpas_pool_get_array(lbc, 'lbc_rho', lbc_rho, 2)
+         !$acc exit data delete(lbc_rho)
+         call mpas_pool_get_array(lbc, 'lbc_scalars', lbc_scalars, 2)
+         !$acc exit data delete(lbc_scalars)
+      
+         call mpas_pool_get_array(lbc, 'lbc_u', lbc_tend_u, 1)
+         !$acc exit data delete(lbc_tend_u)
+         call mpas_pool_get_array(lbc, 'lbc_ru', lbc_tend_ru, 1)
+         !$acc exit data delete(lbc_tend_ru)
+         call mpas_pool_get_array(lbc, 'lbc_rho_edge', lbc_tend_rho_edge, 1)
+         !$acc exit data delete(lbc_tend_rho_edge)
+         call mpas_pool_get_array(lbc, 'lbc_w', lbc_tend_w, 1)
+         !$acc exit data delete(lbc_tend_w)
+         call mpas_pool_get_array(lbc, 'lbc_theta', lbc_tend_theta, 1)
+         !$acc exit data delete(lbc_tend_theta)
+         call mpas_pool_get_array(lbc, 'lbc_rtheta_m', lbc_tend_rtheta_m, 1)
+         !$acc exit data delete(lbc_tend_rtheta_m)
+         call mpas_pool_get_array(lbc, 'lbc_rho_zz', lbc_tend_rho_zz, 1)
+         !$acc exit data delete(lbc_tend_rho_zz)
+         call mpas_pool_get_array(lbc, 'lbc_rho', lbc_tend_rho, 1)
+         !$acc exit data delete(lbc_tend_rho)
+         call mpas_pool_get_array(lbc, 'lbc_scalars', lbc_tend_scalars, 1)
+         !$acc exit data delete(lbc_tend_scalars)
+      end if
+
+      call mpas_pool_get_array(tend_physics, 'rthdynten', rthdynten)
+      !$acc exit data copyout(rthdynten)
+      MPAS_ACC_TIMER_STOP('atm_srk3 [ACC_data_xfer]')
 #endif
 
-   end subroutine mpas_atm_dynamics_init
+   end subroutine mpas_atm_post_dynamics_d2h
 
 
    !----------------------------------------------------------------------------
@@ -546,6 +1519,11 @@ subroutine mpas_atm_dynamics_finalize(domain)
       real (kind=RKIND), dimension(:), pointer :: angleEdge
       real (kind=RKIND), dimension(:), pointer :: meshScalingDel2
       real (kind=RKIND), dimension(:), pointer :: meshScalingDel4
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_c2
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_s2
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_cs
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_c
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_s
 #endif
 
 
@@ -731,6 +1709,21 @@ subroutine mpas_atm_dynamics_finalize(domain)
 
       call mpas_pool_get_array(mesh, 'meshScalingDel4', meshScalingDel4)
       !$acc exit data delete(meshScalingDel4)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_c2', deformation_coef_c2)
+      !$acc exit data delete(deformation_coef_c2)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_s2', deformation_coef_s2)
+      !$acc exit data delete(deformation_coef_s2)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_cs', deformation_coef_cs)
+      !$acc exit data delete(deformation_coef_cs)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_c', deformation_coef_c)
+      !$acc exit data delete(deformation_coef_c)
+
+      call mpas_pool_get_array(mesh, 'deformation_coef_s', deformation_coef_s)
+      !$acc exit data delete(deformation_coef_s)
 #endif
 
    end subroutine mpas_atm_dynamics_finalize
@@ -774,12 +1767,14 @@ subroutine atm_timestep(domain, dt, nowTime, itimestep, exchange_halo_group)
 
       config_apply_lbcs = config_apply_lbcs_ptr
 
+      call mpas_atm_pre_dynamics_h2d(domain)
       if (trim(config_time_integration) == 'SRK3') then
          call atm_srk3(domain, dt, itimestep, exchange_halo_group)
       else
          call mpas_log_write('Unknown time integration option '//trim(config_time_integration), messageType=MPAS_LOG_ERR)
          call mpas_log_write('Currently, only ''SRK3'' is supported.', messageType=MPAS_LOG_CRIT)
       end if
+      call mpas_atm_post_dynamics_d2h(domain)
 
       call mpas_set_timeInterval(dtInterval, dt=dt)
       currTime = nowTime + dtInterval
@@ -850,6 +1845,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       logical, pointer :: config_scalar_advection
       logical, pointer :: config_positive_definite
       logical, pointer :: config_monotonic
+      logical, pointer :: config_gpu_aware_mpi
       character (len=StrKIND), pointer :: config_microp_scheme
       character (len=StrKIND), pointer :: config_convection_scheme
 
@@ -873,6 +1869,8 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       real (kind=RKIND), dimension(:,:,:), pointer :: scalars, scalars_1, scalars_2
 
       real (kind=RKIND), dimension(:,:), pointer :: rqvdynten, rthdynten, theta_m
+      real (kind=RKIND), dimension(:,:), pointer :: pressure_p, rtheta_p, exner, tend_u
+      real (kind=RKIND), dimension(:,:), pointer :: rho_pp, rtheta_pp, ru_p, rw_p, pv_edge, rho_edge
       real (kind=RKIND) :: theta_local, fac_m
 
 #ifndef MPAS_CAM_DYCORE
@@ -892,6 +1890,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       call mpas_pool_get_config(block % configs, 'config_scalar_advection', config_scalar_advection)
       call mpas_pool_get_config(block % configs, 'config_positive_definite', config_positive_definite)
       call mpas_pool_get_config(block % configs, 'config_monotonic', config_monotonic)
+      call mpas_pool_get_config(block % configs, 'config_gpu_aware_mpi', config_gpu_aware_mpi)
       call mpas_pool_get_config(block % configs, 'config_IAU_option', config_IAU_option)
       !  config variables for dynamics-transport splitting, WCS 18 November 2014
       call mpas_pool_get_config(block % configs, 'config_split_dynamics_transport', config_split_dynamics_transport)
@@ -1040,7 +2039,15 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       !
       ! Communicate halos for theta_m, scalars, pressure_p, and rtheta_p
       !
-      call exchange_halo_group(domain, 'dynamics:theta_m,scalars,pressure_p,rtheta_p')
+      MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+      call mpas_pool_get_array(state, 'theta_m', theta_m, 1)
+      call mpas_pool_get_array(state, 'scalars', scalars_1, 1)
+      call mpas_pool_get_array(diag, 'pressure_p', pressure_p)
+      call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p)
+      !$acc update self(theta_m,scalars_1,pressure_p,rtheta_p) if (.not. config_gpu_aware_mpi)
+      call exchange_halo_group(domain, 'dynamics:theta_m,scalars,pressure_p,rtheta_p', config_gpu_aware_mpi)
+      !$acc update device(theta_m,scalars_1,pressure_p,rtheta_p) if (.not. config_gpu_aware_mpi)
+      MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
       call mpas_timer_start('atm_rk_integration_setup')
 
@@ -1075,6 +2082,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       call mpas_timer_stop('atm_compute_moist_coefficients')
 
 #ifdef DO_PHYSICS
+      call update_d2h_pre_physics_get_tend(block % configs, state, diag, tend)
       call mpas_timer_start('physics_get_tend')
       rk_step = 1
       dynamics_substep = 1
@@ -1083,6 +2091,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
                              tend_ru_physics, tend_rtheta_physics, tend_rho_physics, &
                              exchange_halo_group )
       call mpas_timer_stop('physics_get_tend')
+      call update_h2d_post_physics_get_tend(block % configs, state, diag, tend)
 #else
 #ifndef MPAS_CAM_DYCORE
       !
@@ -1098,10 +2107,13 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       ! IAU - Incremental Analysis Update
       !
       if (trim(config_IAU_option) /= 'off') then
+         call update_d2h_pre_add_tend_anal_incr(block % configs, block % structs)
          call atm_add_tend_anal_incr(block % configs, block % structs, itimestep, dt, &
                                      tend_ru_physics, tend_rtheta_physics, tend_rho_physics)
       end if
 
+      !$acc enter data copyin(tend_rtheta_physics,tend_rho_physics,tend_ru_physics)
+
 
       DYNAMICS_SUBSTEPS : do dynamics_substep = 1, dynamics_split
 
@@ -1121,8 +2133,12 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 !$OMP END PARALLEL DO
          call mpas_timer_stop('atm_compute_vert_imp_coefs')
 
-         call exchange_halo_group(domain, 'dynamics:exner')
-
+      MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+      call mpas_pool_get_array(diag, 'exner', exner)      
+      !$acc update self(exner) if (.not. config_gpu_aware_mpi)
+      call exchange_halo_group(domain, 'dynamics:exner', config_gpu_aware_mpi)
+      !$acc update device(exner) if (.not. config_gpu_aware_mpi)
+      MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
          !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
          ! BEGIN Runge-Kutta loop 
@@ -1171,7 +2187,8 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 
 !$OMP PARALLEL DO
             do thread=1,nThreads
-               call atm_compute_dyn_tend( tend, tend_physics, state, diag, mesh, block % configs, nVertLevels, rk_step, dt, & 
+               call atm_compute_dyn_tend( tend, tend_physics, state, diag, mesh, diag_physics, &
+                                          block % configs, nVertLevels, rk_step, dynamics_substep, dt, & 
                                           cellThreadStart(thread), cellThreadEnd(thread), &
                                           vertexThreadStart(thread), vertexThreadEnd(thread), &
                                           edgeThreadStart(thread), edgeThreadEnd(thread), &
@@ -1200,7 +2217,12 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
             !***********************************
 
 ! tend_u
-            call exchange_halo_group(domain, 'dynamics:tend_u')
+            MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+            call mpas_pool_get_array(tend, 'u', tend_u)
+            !$acc update self(tend_u) if (.not. config_gpu_aware_mpi)
+            call exchange_halo_group(domain, 'dynamics:tend_u', config_gpu_aware_mpi)
+            !$acc update device(tend_u) if (.not. config_gpu_aware_mpi)
+            MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
    
             call mpas_timer_start('small_step_prep')
    
@@ -1276,7 +2298,12 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 
             do small_step = 1, number_sub_steps(rk_step)
 
-               call exchange_halo_group(domain, 'dynamics:rho_pp')
+               MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+               call mpas_pool_get_array(diag, 'rho_pp', rho_pp)
+               !$acc update self(rho_pp) if (.not. config_gpu_aware_mpi)
+               call exchange_halo_group(domain, 'dynamics:rho_pp', config_gpu_aware_mpi)
+               !$acc update device(rho_pp) if (.not. config_gpu_aware_mpi)
+               MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
                call mpas_timer_start('atm_advance_acoustic_step')
 
@@ -1298,8 +2325,12 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
   
 ! rtheta_pp
 ! This is the only communications needed during the acoustic steps because we solve for u on all edges of owned cells
-
-               call exchange_halo_group(domain, 'dynamics:rtheta_pp')
+               MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+               call mpas_pool_get_array(diag, 'rtheta_pp', rtheta_pp)
+               !$acc update self(rtheta_pp) if (.not. config_gpu_aware_mpi)
+               call exchange_halo_group(domain, 'dynamics:rtheta_pp', config_gpu_aware_mpi)
+               !$acc update device(rtheta_pp) if (.not. config_gpu_aware_mpi)
+               MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
 !  complete update of horizontal momentum by including 3d divergence damping at the end of the acoustic step
 
@@ -1319,7 +2350,15 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
             !
             ! Communicate halos for rw_p[1,2], ru_p[1,2], rho_pp[1,2], rtheta_pp[2]
             !
-            call exchange_halo_group(domain, 'dynamics:rw_p,ru_p,rho_pp,rtheta_pp')
+            MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+            call mpas_pool_get_array(diag, 'ru_p', ru_p)
+            call mpas_pool_get_array(diag, 'rw_p', rw_p)
+            call mpas_pool_get_array(diag, 'rho_pp', rho_pp)
+            call mpas_pool_get_array(diag, 'rtheta_pp', rtheta_pp)
+            !$acc update self(rw_p,ru_p,rho_pp,rtheta_pp) if (.not. config_gpu_aware_mpi)
+            call exchange_halo_group(domain, 'dynamics:rw_p,ru_p,rho_pp,rtheta_pp', config_gpu_aware_mpi)
+            !$acc update device(rw_p,ru_p,rho_pp,rtheta_pp) if (.not. config_gpu_aware_mpi)
+            MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
             call mpas_timer_start('atm_recover_large_step_variables')
 
@@ -1354,7 +2393,6 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
                call mpas_atm_get_bdy_state(clock, block, nVertLevels, nEdges, 'u', time_dyn_step, ru_driving_values)
 
                ! do this inline at present - it is simple enough
-               !$acc enter data copyin(u)
                !$acc parallel default(present)
                !$acc loop gang worker
                do iEdge = 1, nEdgesSolve
@@ -1366,12 +2404,10 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
                   end if
                end do
                !$acc end parallel
-               !$acc exit data copyout(u)
 
                call mpas_atm_get_bdy_state(clock, block, nVertLevels, nEdges, 'ru', time_dyn_step, ru_driving_values)
                call mpas_pool_get_array(diag, 'ru', u)
                ! do this inline at present - it is simple enough
-               !$acc enter data copyin(u)
                !$acc parallel default(present)
                !$acc loop gang worker
                do iEdge = 1, nEdges
@@ -1383,7 +2419,6 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
                   end if
                end do
                !$acc end parallel
-               !$acc exit data copyout(u)
 
                deallocate(ru_driving_values)
 
@@ -1391,12 +2426,17 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 
 !-------------------------------------------------------------------
             
+            MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+            call mpas_pool_get_array(state, 'u', u, 2)
+            !$acc update self(u) if (.not. config_gpu_aware_mpi)
             ! u
             if (config_apply_lbcs) then
-               call exchange_halo_group(domain, 'dynamics:u_123')
+               call exchange_halo_group(domain, 'dynamics:u_123', config_gpu_aware_mpi)
             else
-               call exchange_halo_group(domain, 'dynamics:u_3')
+               call exchange_halo_group(domain, 'dynamics:u_3', config_gpu_aware_mpi)
             end if
+            !$acc update device(u) if (.not. config_gpu_aware_mpi)
+            MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
             ! scalar advection: RK3 scheme of Skamarock and Gassmann (2011). 
             ! PD or monotonicity constraints applied only on the final Runge-Kutta substep.
@@ -1404,11 +2444,16 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
             if (config_scalar_advection .and. (.not. config_split_dynamics_transport) ) then
 
                call advance_scalars('scalars', domain, rk_step, rk_timestep, config_monotonic, config_positive_definite, &
-                                    config_time_integration_order, config_split_dynamics_transport, exchange_halo_group)
+                                    config_time_integration_order, config_split_dynamics_transport, config_gpu_aware_mpi, exchange_halo_group)
 
                if (config_apply_lbcs) then  ! adjust boundary tendencies for regional_MPAS scalar transport
 
-                  call exchange_halo_group(domain, 'dynamics:scalars')
+                  MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+                  call mpas_pool_get_array(state, 'scalars', scalars_2, 2)
+                  !$acc update self(scalars_2) if (.not. config_gpu_aware_mpi)
+                  call exchange_halo_group(domain, 'dynamics:scalars', config_gpu_aware_mpi)
+                  !$acc update device(scalars_2) if (.not. config_gpu_aware_mpi)
+                  MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
                   allocate(scalars_driving(num_scalars,nVertLevels,nCells+1))
 
@@ -1460,17 +2505,27 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 
             call mpas_timer_stop('atm_compute_solve_diagnostics')
 
+            MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+            call mpas_pool_get_array(state, 'w', w, 2)
+            call mpas_pool_get_array(diag, 'pv_edge', pv_edge)
+            call mpas_pool_get_array(diag, 'rho_edge', rho_edge)            
+            !$acc update self(w,pv_edge,rho_edge) if (.not. config_gpu_aware_mpi)
             if (config_scalar_advection .and. (.not. config_split_dynamics_transport) ) then
                !
                ! Communicate halos for w[1,2], pv_edge[1,2], rho_edge[1,2], scalars[1,2]
                !
-               call exchange_halo_group(domain, 'dynamics:w,pv_edge,rho_edge,scalars')
+               call mpas_pool_get_array(state, 'scalars', scalars_2, 2)
+               !$acc update self(scalars_2) if (.not. config_gpu_aware_mpi)
+               call exchange_halo_group(domain, 'dynamics:w,pv_edge,rho_edge,scalars', config_gpu_aware_mpi)
+               !$acc update device(scalars_2) if (.not. config_gpu_aware_mpi)
             else
                !
                ! Communicate halos for w[1,2], pv_edge[1,2], rho_edge[1,2]
                !
-               call exchange_halo_group(domain, 'dynamics:w,pv_edge,rho_edge')
+               call exchange_halo_group(domain, 'dynamics:w,pv_edge,rho_edge', config_gpu_aware_mpi)
             end if
+            !$acc update device(w,pv_edge,rho_edge) if (.not. config_gpu_aware_mpi)
+            MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
             ! set the zero-gradient condition on w for regional_MPAS
             
@@ -1483,8 +2538,13 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
               end do
 !$OMP END PARALLEL DO
 
+              MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
               ! w halo values needs resetting after regional boundary update
-              call exchange_halo_group(domain, 'dynamics:w')
+              call mpas_pool_get_array(state, 'w', w, 2)
+              !$acc update self(w) if (.not. config_gpu_aware_mpi)
+              call exchange_halo_group(domain, 'dynamics:w', config_gpu_aware_mpi)
+              !$acc update device(w) if (.not. config_gpu_aware_mpi)
+              MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
             end if ! end of regional_MPAS addition 
 
@@ -1495,7 +2555,14 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
             !
             ! Communicate halos for theta_m[1,2], pressure_p[1,2], and rtheta_p[1,2]
             !
-            call exchange_halo_group(domain, 'dynamics:theta_m,pressure_p,rtheta_p')
+            MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+            call mpas_pool_get_array(state, 'theta_m', theta_m, 2)
+            call mpas_pool_get_array(diag, 'pressure_p', pressure_p)
+            call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p)
+            !$acc update self(theta_m,pressure_p,rtheta_p) if (.not. config_gpu_aware_mpi)
+            call exchange_halo_group(domain, 'dynamics:theta_m,pressure_p,rtheta_p', config_gpu_aware_mpi)
+            !$acc update device(theta_m,pressure_p,rtheta_p) if (.not. config_gpu_aware_mpi)
+            MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
             !
             ! Note: A halo exchange for 'exner' here as well as after the call
@@ -1532,6 +2599,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 
       deallocate(qtot)  !  we are finished with these now
 
+      !$acc exit data delete(tend_rtheta_physics,tend_rho_physics,tend_ru_physics)
 #ifndef MPAS_CAM_DYCORE
       call mpas_deallocate_scratch_field(tend_rtheta_physicsField)
       call mpas_deallocate_scratch_field(tend_rho_physicsField)
@@ -1555,12 +2623,17 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 
 
             call advance_scalars('scalars', domain, rk_step, rk_timestep, config_monotonic, config_positive_definite, &
-                                 config_time_integration_order, config_split_dynamics_transport, exchange_halo_group)
+                                 config_time_integration_order, config_split_dynamics_transport, config_gpu_aware_mpi, exchange_halo_group)
 
             if (config_apply_lbcs) then  ! adjust boundary tendencies for regional_MPAS scalar transport
 
+               MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
                ! need to fill halo for horizontal filter
-               call exchange_halo_group(domain, 'dynamics:scalars')
+               call mpas_pool_get_array(state, 'scalars', scalars_2, 2)
+               !$acc update self(scalars_2) if (.not. config_gpu_aware_mpi)
+               call exchange_halo_group(domain, 'dynamics:scalars', config_gpu_aware_mpi)
+               !$acc update device(scalars_2) if (.not. config_gpu_aware_mpi)
+               MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
    
                allocate(scalars_driving(num_scalars,nVertLevels,nCells+1))
 
@@ -1586,7 +2659,12 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 !------------------------------------------------------------------------------------------------------------------------
 
             if (rk_step < 3) then
-               call exchange_halo_group(domain, 'dynamics:scalars')
+               MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+               call mpas_pool_get_array(state, 'scalars', scalars_2, 2)
+               !$acc update self(scalars_2) if (.not. config_gpu_aware_mpi)
+               call exchange_halo_group(domain, 'dynamics:scalars', config_gpu_aware_mpi)
+               !$acc update device(scalars_2) if (.not. config_gpu_aware_mpi)
+               MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
             end if
 
          end do RK3_SPLIT_TRANSPORT
@@ -1608,7 +2686,8 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
                             uReconstructY,          &
                             uReconstructZ,          &
                             uReconstructZonal,      &
-                            uReconstructMeridional  &
+                            uReconstructMeridional, &
+                            lACC = .true.           &
                            )
 
 
@@ -1618,16 +2697,24 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       !
 
 #ifdef DO_PHYSICS
+      MPAS_ACC_TIMER_START('atm_srk3: physics ACC_data_xfer')
       call mpas_pool_get_array(state, 'scalars', scalars_1, 1)
+      !$acc update self(scalars_1)
       call mpas_pool_get_array(state, 'scalars', scalars_2, 2)
+      !$acc update self(scalars_2)
+      MPAS_ACC_TIMER_STOP('atm_srk3: physics ACC_data_xfer')
 
       if(config_convection_scheme == 'cu_grell_freitas' .or. &
          config_convection_scheme == 'cu_ntiedtke') then
 
+         MPAS_ACC_TIMER_START('atm_srk3: physics ACC_data_xfer')
          call mpas_pool_get_array(tend_physics, 'rqvdynten', rqvdynten)
          call mpas_pool_get_array(state, 'theta_m', theta_m, 2)
+         !$acc update self(theta_m)
 
          call mpas_pool_get_array(tend_physics, 'rthdynten', rthdynten)
+         !$acc update self(rthdynten)
+         MPAS_ACC_TIMER_STOP('atm_srk3: physics ACC_data_xfer')
 
          !NOTE: The calculation of the tendency due to horizontal and vertical advection for the water vapor mixing ratio
          !requires that the subroutine atm_advance_scalars_mono was called on the third Runge Kutta step, so that a halo
@@ -1652,8 +2739,12 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       where ( scalars_2(:,:,:) < 0.0)  &
          scalars_2(:,:,:) = 0.0
 
+      MPAS_ACC_TIMER_START('atm_srk3: physics ACC_data_xfer')
+      !$acc update device(scalars_2, rthdynten)
+      MPAS_ACC_TIMER_STOP('atm_srk3: physics ACC_data_xfer')
       !call microphysics schemes:
       if (trim(config_microp_scheme) /= 'off')  then
+        call update_d2h_pre_microphysics( block % configs, state, diag, 2)
         call mpas_timer_start('microphysics')
 !$OMP PARALLEL DO
         do thread=1,nThreads
@@ -1662,6 +2753,7 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
         end do
 !$OMP END PARALLEL DO
         call mpas_timer_stop('microphysics')
+        call update_h2d_post_microphysics( block % configs, state, diag, tend, 2)
       end if
 
       !
@@ -1699,7 +2791,12 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 
       if (config_apply_lbcs) then  ! adjust boundary values for regional_MPAS scalar transport
 
-         call exchange_halo_group(domain, 'dynamics:scalars')
+         MPAS_ACC_TIMER_START('atm_srk3: halo_exchanges + ACC_data_xfer')
+         call mpas_pool_get_array(state, 'scalars', scalars_2, 2)
+         !$acc update self(scalars_2) if (.not. config_gpu_aware_mpi)
+         call exchange_halo_group(domain, 'dynamics:scalars', config_gpu_aware_mpi)
+         !$acc update device(scalars_2) if (.not. config_gpu_aware_mpi)
+         MPAS_ACC_TIMER_STOP('atm_srk3: halo_exchanges + ACC_data_xfer')
 
          allocate(scalars_driving(num_scalars,nVertLevels,nCells+1))
 
@@ -1752,7 +2849,7 @@ end subroutine atm_srk3
    !
    !-----------------------------------------------------------------------
    subroutine advance_scalars(field_name, domain, rk_step, rk_timestep, config_monotonic, config_positive_definite, &
-                              config_time_integration_order, config_split_dynamics_transport, exchange_halo_group)
+                              config_time_integration_order, config_split_dynamics_transport, config_gpu_aware_mpi, exchange_halo_group)
 
       implicit none
 
@@ -1765,6 +2862,7 @@ subroutine advance_scalars(field_name, domain, rk_step, rk_timestep, config_mono
       logical, intent(in) :: config_positive_definite
       integer, intent(in) :: config_time_integration_order
       logical, intent(in) :: config_split_dynamics_transport
+      logical, intent(in) :: config_gpu_aware_mpi
       procedure (halo_exchange_routine) :: exchange_halo_group
 
       ! Local variables
@@ -1896,7 +2994,7 @@ subroutine advance_scalars(field_name, domain, rk_step, rk_timestep, config_mono
                                           edgeThreadStart(thread), edgeThreadEnd(thread), &
                                           cellSolveThreadStart(thread), cellSolveThreadEnd(thread), &
                                           scalar_old_arr, scalar_new_arr, s_max_arr, s_min_arr, wdtn_arr, &
-                                          flux_array, flux_upwind_tmp_arr, flux_tmp_arr, &
+                                          flux_array, flux_upwind_tmp_arr, flux_tmp_arr, config_gpu_aware_mpi, &
                                           exchange_halo_group, &
                                           advance_density=config_split_dynamics_transport, rho_zz_int=rho_zz_int)
          end if
@@ -1976,12 +3074,6 @@ subroutine atm_rk_integration_setup( state, diag, nVertLevels, num_scalars, &
       call mpas_pool_get_array(state, 'scalars', scalars_1, 1)
       call mpas_pool_get_array(state, 'scalars', scalars_2, 2)
 
-      MPAS_ACC_TIMER_START('atm_rk_integration_setup [ACC_data_xfer]')
-      !$acc enter data create(ru_save, u_2, rw_save, rtheta_p_save, rho_p_save, &
-      !$acc                   w_2, theta_m_2, rho_zz_2, rho_zz_old_split, scalars_2) &
-      !$acc            copyin(ru, rw, rtheta_p, rho_p, u_1, w_1, theta_m_1, &
-      !$acc                   rho_zz_1, scalars_1)
-      MPAS_ACC_TIMER_STOP('atm_rk_integration_setup [ACC_data_xfer]')
 
       !$acc kernels
       theta_m_2(:,cellEnd+1) = 0.0_RKIND
@@ -2029,12 +3121,6 @@ subroutine atm_rk_integration_setup( state, diag, nVertLevels, num_scalars, &
       end do
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_rk_integration_setup [ACC_data_xfer]')
-      !$acc exit data copyout(ru_save, rw_save, rtheta_p_save, rho_p_save, u_2, &
-      !$acc                   w_2, theta_m_2, rho_zz_2, rho_zz_old_split, scalars_2) &
-      !$acc            delete(ru, rw, rtheta_p, rho_p, u_1, w_1, theta_m_1, &
-      !$acc                  rho_zz_1, scalars_1)
-      MPAS_ACC_TIMER_STOP('atm_rk_integration_setup [ACC_data_xfer]')
 
    end subroutine atm_rk_integration_setup
 
@@ -2085,11 +3171,6 @@ subroutine atm_compute_moist_coefficients( dims, state, diag, mesh, &
       moist_start = moist_start_ptr
       moist_end = moist_end_ptr
 
-      MPAS_ACC_TIMER_START('atm_compute_moist_coefficients [ACC_data_xfer]')
-      !$acc enter data create(cqw, cqu) & 
-      !$acc            copyin(scalars)
-      MPAS_ACC_TIMER_STOP('atm_compute_moist_coefficients [ACC_data_xfer]')
-
       !$acc parallel default(present)
       !$acc loop gang worker
 !      do iCell = cellSolveStart,cellSolveEnd
@@ -2138,10 +3219,6 @@ subroutine atm_compute_moist_coefficients( dims, state, diag, mesh, &
       end do
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_compute_moist_coefficients [ACC_data_xfer]')
-      !$acc exit data copyout(cqw, cqu) &
-      !$acc           delete(scalars)
-      MPAS_ACC_TIMER_STOP('atm_compute_moist_coefficients [ACC_data_xfer]')
 
    end subroutine atm_compute_moist_coefficients
 
@@ -2273,11 +3350,7 @@ subroutine atm_compute_vert_imp_coefs_work(nCells, moist_start, moist_end, dts,
       real (kind=RKIND) :: dtseps, c2, qtotal, rcv
       real (kind=RKIND), dimension( nVertLevels ) :: b_tri, c_tri
 
-      MPAS_ACC_TIMER_START('atm_compute_vert_imp_coefs_work [ACC_data_xfer]')
-      !$acc enter data copyin(cqw, p, t, rb, rtb, rt, pb)
-      !$acc enter data create(cofrz, cofwr, cofwz, coftz, cofwt, a_tri, b_tri, &
-      !$acc                   c_tri, alpha_tri, gamma_tri)
-      MPAS_ACC_TIMER_STOP('atm_compute_vert_imp_coefs_work [ACC_data_xfer]')
+      !$acc enter data create(b_tri, c_tri)
 
       !  set coefficients
       dtseps = .5*dts*(1.+epssm)
@@ -2357,11 +3430,7 @@ subroutine atm_compute_vert_imp_coefs_work(nCells, moist_start, moist_end, dts,
       end do ! loop over cells
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_compute_vert_imp_coefs_work [ACC_data_xfer]')
-      !$acc exit data copyout(cofrz, cofwr, cofwz, coftz, cofwt, a_tri, b_tri, &
-      !$acc                   c_tri, alpha_tri, gamma_tri)
-      !$acc exit data delete(cqw, p, t, rb, rtb, rt, pb)
-      MPAS_ACC_TIMER_STOP('atm_compute_vert_imp_coefs_work [ACC_data_xfer]')
+      !$acc exit data delete(b_tri, c_tri)
 
    end subroutine atm_compute_vert_imp_coefs_work
 
@@ -2465,9 +3534,6 @@ subroutine atm_set_smlstep_pert_variables_work(nCells, nEdges, &
       integer :: iCell, iEdge, i, k
       real (kind=RKIND) :: flux
 
-      MPAS_ACC_TIMER_START('atm_set_smlstep_pert_variables [ACC_data_xfer]')
-      !$acc enter data copyin(u_tend, w_tend)
-      MPAS_ACC_TIMER_STOP('atm_set_smlstep_pert_variables [ACC_data_xfer]')
 
       ! we solve for omega instead of w (see Klemp et al MWR 2007),
       ! so here we change the w_p tendency to an omega_p tendency
@@ -2500,10 +3566,6 @@ subroutine atm_set_smlstep_pert_variables_work(nCells, nEdges, &
       end do
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_set_smlstep_pert_variables [ACC_data_xfer]')
-      !$acc exit data delete(u_tend)
-      !$acc exit data copyout(w_tend)
-      MPAS_ACC_TIMER_STOP('atm_set_smlstep_pert_variables [ACC_data_xfer]')
 
    end subroutine atm_set_smlstep_pert_variables_work
 
@@ -2736,17 +3798,6 @@ subroutine atm_advance_acoustic_step_work(nCells, nEdges, nCellsSolve, cellStart
       resm = (1.0 - epssm) / (1.0 + epssm)
       rdts = 1./dts
 
-      MPAS_ACC_TIMER_START('atm_advance_acoustic_step [ACC_data_xfer]')
-      !$acc enter data copyin(exner,cqu,cofwt,coftz,cofrz,cofwr,cofwz, &
-      !$acc                a_tri,alpha_tri,gamma_tri,rho_zz,theta_m,w, &
-      !$acc                tend_ru,tend_rho,tend_rt,tend_rw,rw,rw_save)
-      !$acc enter data create(rtheta_pp_old)
-      if(small_step == 1) then
-         !$acc enter data create(ru_p,ruAvg,rho_pp,rtheta_pp,wwAvg,rw_p)
-      else
-         !$acc enter data copyin(ru_p,ruAvg,rho_pp,rtheta_pp,wwAvg,rw_p)
-      end if
-      MPAS_ACC_TIMER_STOP('atm_advance_acoustic_step [ACC_data_xfer]')
 
       if(small_step /= 1) then  !  not needed on first small step
 
@@ -2973,13 +4024,6 @@ subroutine atm_advance_acoustic_step_work(nCells, nEdges, nCellsSolve, cellStart
       end do !  end of loop over cells
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_advance_acoustic_step [ACC_data_xfer]')
-      !$acc exit data delete(exner,cqu,cofwt,coftz,cofrz,cofwr,cofwz, &
-      !$acc                a_tri,alpha_tri,gamma_tri,rho_zz,theta_m,w, &
-      !$acc                tend_ru,tend_rho,tend_rt,tend_rw,rw,rw_save)
-      !$acc exit data copyout(rtheta_pp_old,ru_p,ruAvg,rho_pp, &
-      !$acc                   rtheta_pp,wwAvg,rw_p)
-      MPAS_ACC_TIMER_STOP('atm_advance_acoustic_step [ACC_data_xfer]')
 
    end subroutine atm_advance_acoustic_step_work
 
@@ -3031,9 +4075,6 @@ subroutine atm_divergence_damping_3d( state, diag, mesh, configs, dts, edgeStart
       nCellsSolve = nCellsSolve_ptr
       nVertLevels = nVertLevels_ptr
 
-      MPAS_ACC_TIMER_START('atm_divergence_damping_3d [ACC_data_xfer]')
-      !$acc enter data copyin(ru_p, rtheta_pp, rtheta_pp_old, theta_m)
-      MPAS_ACC_TIMER_STOP('atm_divergence_damping_3d [ACC_data_xfer]')
 
       !$acc parallel default(present)
       !$acc loop gang worker
@@ -3066,10 +4107,6 @@ subroutine atm_divergence_damping_3d( state, diag, mesh, configs, dts, edgeStart
       end do ! end loop over edges
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_divergence_damping_3d [ACC_data_xfer]')
-      !$acc exit data copyout(ru_p) &
-      !$acc           delete(rtheta_pp, rtheta_pp_old, theta_m)
-      MPAS_ACC_TIMER_STOP('atm_divergence_damping_3d [ACC_data_xfer]')
       
 
    end subroutine atm_divergence_damping_3d
@@ -3260,17 +4297,6 @@ subroutine atm_recover_large_step_variables_work(nCells, nEdges, nCellsSolve, nE
       integer :: i, iCell, iEdge, k, cell1, cell2
       real (kind=RKIND) :: invNs, rcv, p0, flux
 
-      MPAS_ACC_TIMER_START('atm_recover_large_step_variables [ACC_data_xfer]')
-      !$acc enter data copyin(rho_p_save,rho_pp,rho_base,rw_save,rw_p, &
-      !$acc                   rtheta_p_save,rtheta_pp,rtheta_base, &
-      !$acc                   ru_save,ru_p,wwAvg,ruAvg) &
-      !$acc            create(rho_zz,rho_p,rw,w,rtheta_p,theta_m, &
-      !$acc                   ru,u)
-      if (rk_step == 3) then
-         !$acc enter data copyin(rt_diabatic_tend,exner_base) &
-         !$acc            create(exner,pressure_p)
-      end if
-      MPAS_ACC_TIMER_STOP('atm_recover_large_step_variables [ACC_data_xfer]')
 
       rcv = rgas/(cp-rgas)
       p0 = 1.0e+05  ! this should come from somewhere else...
@@ -3416,17 +4442,6 @@ subroutine atm_recover_large_step_variables_work(nCells, nEdges, nCellsSolve, nE
       end do
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_recover_large_step_variables [ACC_data_xfer]')
-      !$acc exit data delete(rho_p_save,rho_pp,rho_base,rw_save,rw_p, &
-      !$acc                   rtheta_p_save,rtheta_pp,rtheta_base, &
-      !$acc                   ru_save,ru_p) &
-      !$acc            copyout(rho_zz,rho_p,rw,w,rtheta_p,theta_m, &
-      !$acc                   ru,u,wwAvg,ruAvg)
-      if (rk_step == 3) then
-         !$acc exit data delete(rt_diabatic_tend,exner_base) &
-         !$acc            copyout(exner,pressure_p)
-      end if
-      MPAS_ACC_TIMER_STOP('atm_recover_large_step_variables [ACC_data_xfer]')
 
    end subroutine atm_recover_large_step_variables_work
 
@@ -3661,10 +4676,6 @@ subroutine atm_advance_scalars_work(nCells, num_scalars, dt, &
       weight_time_old = 1. - weight_time_new
 
 
-      MPAS_ACC_TIMER_START('atm_advance_scalars [ACC_data_xfer]')
-      !$acc enter data copyin(uhAvg, scalar_new)
-      MPAS_ACC_TIMER_STOP('atm_advance_scalars [ACC_data_xfer]')
-
       !$acc parallel async
       !$acc loop gang worker private(scalar_weight2, ica)
       do iEdge=edgeStart,edgeEnd
@@ -3759,12 +4770,6 @@ subroutine atm_advance_scalars_work(nCells, num_scalars, dt, &
       !
 
       MPAS_ACC_TIMER_START('atm_advance_scalars [ACC_data_xfer]')
-#ifndef DO_PHYSICS
-      !$acc enter data create(scalar_tend_save)
-#else
-      !$acc enter data copyin(scalar_tend_save)
-#endif
-      !$acc enter data copyin(scalar_old, fnm, fnp, rdnw, wwAvg, rho_zz_old, rho_zz_new)
       !$acc enter data create(scalar_tend_column)
       MPAS_ACC_TIMER_STOP('atm_advance_scalars [ACC_data_xfer]')
 
@@ -3847,9 +4852,7 @@ subroutine atm_advance_scalars_work(nCells, num_scalars, dt, &
       !$acc end parallel
 
       MPAS_ACC_TIMER_START('atm_advance_scalars [ACC_data_xfer]')
-      !$acc exit data copyout(scalar_new)
-      !$acc exit data delete(scalar_tend_column, uhAvg, wwAvg, scalar_old, fnm, fnp, &
-      !$acc                  rdnw, rho_zz_old, rho_zz_new, scalar_tend_save)
+      !$acc exit data delete(scalar_tend_column)
       MPAS_ACC_TIMER_STOP('atm_advance_scalars [ACC_data_xfer]')
 
    end subroutine atm_advance_scalars_work
@@ -3870,7 +4873,7 @@ subroutine atm_advance_scalars_mono(field_name, block, tend, state, diag, mesh,
                                        cellStart, cellEnd, edgeStart, edgeEnd, &
                                        cellSolveStart, cellSolveEnd, &
                                        scalar_old, scalar_new, s_max, s_min, wdtn, flux_arr, &
-                                       flux_upwind_tmp, flux_tmp, exchange_halo_group, advance_density, rho_zz_int)
+                                       flux_upwind_tmp, flux_tmp, config_gpu_aware_mpi, exchange_halo_group, advance_density, rho_zz_int)
 
       implicit none
 
@@ -3891,6 +4894,7 @@ subroutine atm_advance_scalars_mono(field_name, block, tend, state, diag, mesh,
       real (kind=RKIND), dimension(:,:), intent(inout) :: wdtn
       real (kind=RKIND), dimension(:,:), intent(inout) :: flux_arr
       real (kind=RKIND), dimension(:,:), intent(inout) :: flux_upwind_tmp, flux_tmp
+      logical, intent(in) :: config_gpu_aware_mpi
       procedure (halo_exchange_routine) :: exchange_halo_group
       logical, intent(in), optional :: advance_density
       real (kind=RKIND), dimension(:,:), intent(inout), optional :: rho_zz_int
@@ -3969,7 +4973,7 @@ subroutine atm_advance_scalars_mono(field_name, block, tend, state, diag, mesh,
                                    edgesOnCell, edgesOnCell_sign, nEdgesOnCell, fnm, fnp, rdnw, nAdvCellsForEdge, &
                                    advCellsForEdge, adv_coefs, adv_coefs_3rd, scalar_old, scalar_new, s_max, s_min, &
                                    wdtn, scale_arr, flux_arr, flux_upwind_tmp, flux_tmp, &
-                                   bdyMaskCell, bdyMaskEdge, &
+                                   bdyMaskCell, bdyMaskEdge, config_gpu_aware_mpi, &
                                    exchange_halo_group, advance_density, rho_zz_int)
 
       call mpas_deallocate_scratch_field(scale)
@@ -4017,7 +5021,7 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge
                                    edgesOnCell, edgesOnCell_sign, nEdgesOnCell, fnm, fnp, rdnw, nAdvCellsForEdge, &
                                    advCellsForEdge, adv_coefs, adv_coefs_3rd, scalar_old, scalar_new, s_max, s_min, &
                                    wdtn, scale_arr, flux_arr, flux_upwind_tmp, flux_tmp, &
-                                   bdyMaskCell, bdyMaskEdge, &
+                                   bdyMaskCell, bdyMaskEdge, config_gpu_aware_mpi, &
                                    exchange_halo_group, advance_density, rho_zz_int)
 
       use mpas_atm_dimensions, only : nVertLevels
@@ -4033,6 +5037,7 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge
       real (kind=RKIND), intent(in)        :: dt
       integer, intent(in) :: cellStart, cellEnd, edgeStart, edgeEnd
       integer, intent(in) :: cellSolveStart, cellSolveEnd
+      logical, intent(in) :: config_gpu_aware_mpi
       procedure (halo_exchange_routine) :: exchange_halo_group
       logical, intent(in), optional :: advance_density
       real (kind=RKIND), dimension(:,:), intent(inout), optional :: rho_zz_int
@@ -4107,22 +5112,6 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge
       !  The transport will maintain this positive definite solution and optionally, shape preservation (monotonicity).
 
 
-      MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]')
-      !$acc data present(nEdgesOnCell, edgesOnCell, edgesOnCell_sign, &
-      !$acc              invAreaCell, cellsOnCell, cellsOnEdge, nAdvCellsForEdge, &
-      !$acc              advCellsForEdge, adv_coefs, adv_coefs_3rd, dvEdge, bdyMaskCell)
-
-#ifdef DO_PHYSICS
-      !$acc enter data copyin(scalar_tend)
-#else
-      !$acc enter data create(scalar_tend)
-#endif
-      if (local_advance_density) then
-         !$acc enter data copyin(rho_zz_int)
-      end if
-      !$acc enter data copyin(scalars_old, rho_zz_old, rdnw, uhAvg, wwAvg)
-      MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]')
-
       !$acc parallel
 
       !$acc loop gang worker
@@ -4145,19 +5134,17 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge
       !$acc end parallel
 
       MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]')
-      !$acc exit data copyout(scalar_tend)
-
-      !$acc update self(scalars_old)
+      !$acc update self(scalars_old) if (.not. config_gpu_aware_mpi)
       MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]')
 
 !$OMP BARRIER
 !$OMP MASTER
-      call exchange_halo_group(block % domain, 'dynamics:'//trim(field_name)//'_old')
+      call exchange_halo_group(block % domain, 'dynamics:'//trim(field_name)//'_old', config_gpu_aware_mpi)
 !$OMP END MASTER
 !$OMP BARRIER
 
       MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]')
-      !$acc update device(scalars_old)
+      !$acc update device(scalars_old) if (.not. config_gpu_aware_mpi)
       MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]')
 
       !
@@ -4209,13 +5196,7 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge
 
       end if
 
-      MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]')
-      if (.not. local_advance_density) then
-         !$acc enter data copyin(rho_zz_new)
-      end if
-      !$acc enter data copyin(scalars_new, fnm, fnp)
       !$acc enter data create(scale_arr)
-      MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]')
 
       do iScalar = 1, num_scalars
 
@@ -4560,17 +5541,17 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge
          !
 
          MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]')
-         !$acc update self(scale_arr)
+         !$acc update self(scale_arr) if (.not. config_gpu_aware_mpi)
          MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]')
 
 !$OMP BARRIER
 !$OMP MASTER
-         call exchange_halo_group(block % domain, 'dynamics:scale')
+         call exchange_halo_group(block % domain, 'dynamics:scale', config_gpu_aware_mpi)
 !$OMP END MASTER
 !$OMP BARRIER
 
          MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]')
-         !$acc update device(scale_arr)
+         !$acc update device(scale_arr) if (.not. config_gpu_aware_mpi)
          MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]')
 
          !$acc parallel
@@ -4718,23 +5699,12 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge
 
       end do !  loop over scalars
 
-      MPAS_ACC_TIMER_START('atm_advance_scalars_mono [ACC_data_xfer]')
-      if (local_advance_density) then
-         !$acc exit data copyout(rho_zz_int)
-      else
-         !$acc exit data delete(rho_zz_new)
-      end if
-      !$acc exit data copyout(scalars_new)
-      !$acc exit data delete(scalars_old, scale_arr, rho_zz_old, wwAvg, &
-      !$acc                  uhAvg, fnm, fnp, rdnw)
-
-      !$acc end data
-      MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]')
+      !$acc exit data delete(scale_arr)
 
    end subroutine atm_advance_scalars_mono_work
 
 
-   subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs, nVertLevels, rk_step, dt, &
+   subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, diag_physics, configs, nVertLevels, rk_step, dynamics_substep, dt, &
                                    cellStart, cellEnd, vertexStart, vertexEnd, edgeStart, edgeEnd, &
                                    cellSolveStart, cellSolveEnd, vertexSolveStart, vertexSolveEnd, edgeSolveStart, edgeSolveEnd)
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
@@ -4750,6 +5720,8 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
    !                
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 
+      use mpas_atm_dissipation_models, only : les_model_from_string, les_surface_from_string
+
       implicit none
 
       !
@@ -4760,9 +5732,10 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       type (mpas_pool_type), intent(in) :: state
       type (mpas_pool_type), intent(in) :: diag
       type (mpas_pool_type), intent(in) :: mesh
+      type (mpas_pool_type), intent(in) :: diag_physics
       type (mpas_pool_type), intent(in) :: configs
       integer, intent(in) :: nVertLevels              ! for allocating stack variables
-      integer, intent(in) :: rk_step
+      integer, intent(in) :: rk_step, dynamics_substep
       real (kind=RKIND), intent(in) :: dt
       integer, intent(in) :: cellStart, cellEnd, vertexStart, vertexEnd, edgeStart, edgeEnd
       integer, intent(in) :: cellSolveStart, cellSolveEnd, vertexSolveStart, vertexSolveEnd, edgeSolveStart, edgeSolveEnd
@@ -4778,7 +5751,9 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       real (kind=RKIND), dimension(:,:), pointer :: weightsOnEdge, zgrid, rho_edge, rho_zz, ru, u, v, tend_u, &
                                                     divergence, vorticity, ke, pv_edge, theta_m, rw, tend_rho, &
                                                     rt_diabatic_tend, tend_theta, tend_w, w, cqw, rb, rr, pp, pressure_b, zz, zxu, cqu, & 
-                                                    h_divergence, kdiff, edgesOnCell_sign, edgesOnVertex_sign, rw_save, ru_save
+                                                    h_divergence, bn2, edgesOnCell_sign, edgesOnVertex_sign, rw_save, ru_save
+
+      real (kind=RKIND), dimension(:,:), pointer :: eddy_visc_horz, eddy_visc_vert
 
       real (kind=RKIND), dimension(:,:), pointer :: theta_m_save
 
@@ -4788,7 +5763,7 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
 
       real (kind=RKIND), dimension(:,:), pointer :: rthdynten
 
-      real (kind=RKIND), dimension(:,:,:), pointer :: scalars
+      real (kind=RKIND), dimension(:,:,:), pointer :: scalars, tend_scalars
 
       real (kind=RKIND), dimension(:,:), pointer :: tend_u_euler, tend_w_euler, tend_theta_euler
 
@@ -4803,6 +5778,7 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
 
       real (kind=RKIND), dimension(:), pointer :: rdzu, rdzw, fzm, fzp, qv_init
       real (kind=RKIND), dimension(:,:), pointer :: t_init 
+      real (kind=RKIND), dimension(:), pointer:: ustm, hfx, qfx
 
       real (kind=RKIND), pointer :: cf1, cf2, cf3
 
@@ -4810,12 +5786,23 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       real (kind=RKIND), dimension(:,:), pointer :: ur_cell, vr_cell
 
       real (kind=RKIND), dimension(:,:), pointer :: defc_a, defc_b
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_c2, deformation_coef_s2, deformation_coef_cs
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_c, deformation_coef_s
+      real (kind=RKIND), dimension(:,:), pointer :: prandtl_3d_inv
+
 
       real(kind=RKIND), dimension(:,:), pointer :: tend_w_pgf, tend_w_buoy
 
       real (kind=RKIND), pointer :: coef_3rd_order, c_s
       logical, pointer :: config_mix_full
+      logical, pointer :: config_mix_scalars
       character (len=StrKIND), pointer :: config_horiz_mixing
+      character (len=StrKIND), pointer :: config_les_model
+      character (len=StrKIND), pointer :: config_les_surface
+      integer :: les_model_opt, les_surface_opt
+      real (kind=RKIND), pointer :: config_surface_heat_flux
+      real (kind=RKIND), pointer :: config_surface_moisture_flux
+      real (kind=RKIND), pointer :: config_surface_drag_coefficient
       real (kind=RKIND), pointer :: config_del4u_div_factor
       real (kind=RKIND), pointer :: config_h_theta_eddy_visc4
       real (kind=RKIND), pointer :: config_h_mom_eddy_visc4
@@ -4828,12 +5815,22 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       logical, pointer :: config_rayleigh_damp_u
       real (kind=RKIND), pointer :: config_rayleigh_damp_u_timescale_days
       integer, pointer :: config_number_rayleigh_damp_u_levels, config_number_cam_damping_levels
+      integer, pointer :: index_qv, index_qc, index_tke
+
+      logical :: inactive_rthdynten
+      logical :: nopbl
 
 
       call mpas_pool_get_config(mesh, 'sphere_radius', r_earth)
       call mpas_pool_get_config(configs, 'config_coef_3rd_order', coef_3rd_order)
       call mpas_pool_get_config(configs, 'config_mix_full', config_mix_full)
+      call mpas_pool_get_config(configs, 'config_mix_scalars', config_mix_scalars)
       call mpas_pool_get_config(configs, 'config_horiz_mixing', config_horiz_mixing)
+      call mpas_pool_get_config(configs, 'config_les_model', config_les_model)
+      call mpas_pool_get_config(configs, 'config_les_surface', config_les_surface)
+      call mpas_pool_get_config(configs, 'config_surface_heat_flux', config_surface_heat_flux)
+      call mpas_pool_get_config(configs, 'config_surface_moisture_flux', config_surface_moisture_flux)
+      call mpas_pool_get_config(configs, 'config_surface_drag_coefficient', config_surface_drag_coefficient)
       call mpas_pool_get_config(configs, 'config_del4u_div_factor', config_del4u_div_factor)
       call mpas_pool_get_config(configs, 'config_h_theta_eddy_visc4', config_h_theta_eddy_visc4)
       call mpas_pool_get_config(configs, 'config_h_mom_eddy_visc4', config_h_mom_eddy_visc4)
@@ -4864,7 +5861,9 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       call mpas_pool_get_array(diag, 'rho_p', rr)
       call mpas_pool_get_array(diag, 'rho_p_save', rr_save)
       call mpas_pool_get_array(diag, 'v', v)
-      call mpas_pool_get_array(diag, 'kdiff', kdiff)
+      call mpas_pool_get_array(diag, 'eddy_visc_horz', eddy_visc_horz)
+      call mpas_pool_get_array(diag, 'eddy_visc_vert', eddy_visc_vert)
+      call mpas_pool_get_array(diag, 'bn2', bn2)
       call mpas_pool_get_array(diag, 'ru', ru)
       call mpas_pool_get_array(diag, 'ru_save', ru_save)
       call mpas_pool_get_array(diag, 'rw', rw)
@@ -4877,9 +5876,29 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       call mpas_pool_get_array(diag, 'pressure_base', pressure_b)
       call mpas_pool_get_array(diag, 'h_divergence', h_divergence)
       call mpas_pool_get_array(diag, 'exner', exner)
+      call mpas_pool_get_array(diag, 'prandtl_3d_inv', prandtl_3d_inv)
+
 
       call mpas_pool_get_array(tend_physics, 'rthdynten', rthdynten)
 
+      nullify(ustm)
+      call mpas_pool_get_array(diag_physics,'ustm',ustm)
+      nullify(hfx)
+      call mpas_pool_get_array(diag_physics,'hfx',hfx)
+      nullify(qfx)
+      call mpas_pool_get_array(diag_physics,'qfx',qfx)
+
+      nopbl = .false.
+      if (.not. associated(ustm) &
+          .or. .not. associated(hfx) &
+          .or. .not. associated(qfx)) then
+
+          allocate(ustm(1))
+          allocate(hfx(1))
+          allocate(qfx(1))
+          nopbl = .true.
+      end if
+
       call mpas_pool_get_array(mesh, 'weightsOnEdge', weightsOnEdge)
       call mpas_pool_get_array(mesh, 'cellsOnEdge', cellsOnEdge)
       call mpas_pool_get_array(mesh, 'cellsOnCell', cellsOnCell)
@@ -4905,6 +5924,11 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       call mpas_pool_get_array(mesh, 'angleEdge', angleEdge)
       call mpas_pool_get_array(mesh, 'defc_a', defc_a)
       call mpas_pool_get_array(mesh, 'defc_b', defc_b)
+      call mpas_pool_get_array(mesh, 'deformation_coef_c2', deformation_coef_c2)
+      call mpas_pool_get_array(mesh, 'deformation_coef_s2', deformation_coef_s2)
+      call mpas_pool_get_array(mesh, 'deformation_coef_cs', deformation_coef_cs)
+      call mpas_pool_get_array(mesh, 'deformation_coef_c', deformation_coef_c)
+      call mpas_pool_get_array(mesh, 'deformation_coef_s', deformation_coef_s)
       call mpas_pool_get_array(mesh, 'meshScalingDel2', meshScalingDel2)
       call mpas_pool_get_array(mesh, 'meshScalingDel4', meshScalingDel4)
       call mpas_pool_get_array(mesh, 'u_init', u_init)
@@ -4928,6 +5952,8 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       call mpas_pool_get_array(tend, 'w_euler', tend_w_euler)
       call mpas_pool_get_array(tend, 'w_pgf', tend_w_pgf)
       call mpas_pool_get_array(tend, 'w_buoy', tend_w_buoy)
+      call mpas_pool_get_array(tend, 'scalars_tend', tend_scalars)
+
 
       call mpas_pool_get_array(diag, 'cqw', cqw)
       call mpas_pool_get_array(diag, 'cqu', cqu)
@@ -4944,6 +5970,9 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       call mpas_pool_get_dimension(state, 'num_scalars', num_scalars)
       call mpas_pool_get_dimension(state, 'moist_start', moist_start)
       call mpas_pool_get_dimension(state, 'moist_end', moist_end)
+      call mpas_pool_get_dimension(state, 'index_qv', index_qv)
+      call mpas_pool_get_dimension(state, 'index_qc', index_qc)
+      call mpas_pool_get_dimension(state, 'index_tke', index_tke)
 
       call mpas_pool_get_array(mesh, 'nEdgesOnCell', nEdgesOnCell)
       call mpas_pool_get_array(mesh, 'nAdvCellsForEdge', nAdvCellsForEdge)
@@ -4955,53 +5984,73 @@ subroutine atm_compute_dyn_tend(tend, tend_physics, state, diag, mesh, configs,
       call mpas_pool_get_array(mesh, 'cf2', cf2)
       call mpas_pool_get_array(mesh, 'cf3', cf3)
 
+      les_model_opt = les_model_from_string(config_les_model)
+      les_surface_opt = les_surface_from_string(config_les_surface)
+
       call atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels, &
-         nCellsSolve, nEdgesSolve, vertexDegree, maxEdges, maxEdges2, num_scalars, moist_start, moist_end, &
+         nCellsSolve, nEdgesSolve, vertexDegree, maxEdges, maxEdges2, num_scalars, index_qv, index_qc, moist_start, moist_end, &
+         tend_scalars, &
          fEdge, dvEdge, dcEdge, invDcEdge, invDvEdge, invAreaCell, invAreaTriangle, meshScalingDel2, meshScalingDel4, &
          weightsOnEdge, zgrid, rho_edge, rho_zz, ru, u, v, tend_u, &
          divergence, vorticity, ke, pv_edge, theta_m, rw, tend_rho, &
          rt_diabatic_tend, tend_theta, tend_w, w, cqw, rb, rr, pp, pressure_b, zz, zxu, cqu, & 
-         h_divergence, kdiff, edgesOnCell_sign, edgesOnVertex_sign, rw_save, ru_save, &
+         h_divergence, bn2, eddy_visc_horz, eddy_visc_vert, index_tke, &
+         edgesOnCell_sign, edgesOnVertex_sign, rw_save, ru_save, &
          theta_m_save, exner, rr_save, scalars, tend_u_euler, tend_w_euler, tend_theta_euler, deriv_two, &
          cellsOnEdge, verticesOnEdge, edgesOnCell, edgesOnEdge, cellsOnCell, edgesOnVertex, nEdgesOnCell, nEdgesOnEdge, &
          latCell, latEdge, angleEdge, u_init, v_init, advCellsForEdge, nAdvCellsForEdge, adv_coefs, adv_coefs_3rd, &
          rdzu, rdzw, fzm, fzp, qv_init, t_init, cf1, cf2, cf3, r_earth, ur_cell, vr_cell, defc_a, defc_b, &
-         tend_w_pgf, tend_w_buoy, coef_3rd_order, c_s, config_mix_full, config_horiz_mixing, config_del4u_div_factor, &
+         deformation_coef_c2,deformation_coef_s2,deformation_coef_cs,deformation_coef_c,deformation_coef_s, &
+         tend_w_pgf, tend_w_buoy, coef_3rd_order, c_s, config_mix_full, config_mix_scalars, config_horiz_mixing, les_model_opt, &
+         les_surface_opt, prandtl_3d_inv, config_del4u_div_factor, &
          config_h_mom_eddy_visc2, config_v_mom_eddy_visc2, config_h_theta_eddy_visc2, config_v_theta_eddy_visc2, &
-         config_h_theta_eddy_visc4, config_h_mom_eddy_visc4, config_visc4_2dsmag, config_len_disp, rk_step, dt, &
+         config_h_theta_eddy_visc4, config_h_mom_eddy_visc4, config_visc4_2dsmag, config_len_disp, rk_step, dynamics_substep, dt, &
          config_mpas_cam_coef, &
          config_rayleigh_damp_u, config_rayleigh_damp_u_timescale_days, & 
          config_number_rayleigh_damp_u_levels, config_number_cam_damping_levels, &
-         rthdynten, &
+         config_surface_heat_flux, config_surface_moisture_flux, config_surface_drag_coefficient, &
+         rthdynten, ustm, hfx, qfx, &
          cellStart, cellEnd, vertexStart, vertexEnd, edgeStart, edgeEnd, &
          cellSolveStart, cellSolveEnd, vertexSolveStart, vertexSolveEnd, edgeSolveStart, edgeSolveEnd)
 
+         if (nopbl) then
+             deallocate(ustm)
+             deallocate(hfx)
+             deallocate(qfx)
+         end if
+
    end subroutine atm_compute_dyn_tend
 
 
    subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dummy, &
-      nCellsSolve, nEdgesSolve, vertexDegree, maxEdges_dummy, maxEdges2_dummy, num_scalars_dummy, moist_start, moist_end, &
+      nCellsSolve, nEdgesSolve, vertexDegree, maxEdges_dummy, maxEdges2_dummy, num_scalars_dummy, index_qv, index_qc, moist_start, moist_end, &
+      tend_scalars, &
       fEdge, dvEdge, dcEdge, invDcEdge, invDvEdge, invAreaCell, invAreaTriangle, meshScalingDel2, meshScalingDel4, &
       weightsOnEdge, zgrid, rho_edge, rho_zz, ru, u, v, tend_u, &
       divergence, vorticity, ke, pv_edge, theta_m, rw, tend_rho, &
       rt_diabatic_tend, tend_theta, tend_w, w, cqw, rb, rr, pp, pressure_b, zz, zxu, cqu, & 
-      h_divergence, kdiff, edgesOnCell_sign, edgesOnVertex_sign, rw_save, ru_save, &
+      h_divergence, bn2, eddy_visc_horz, eddy_visc_vert, index_tke, &
+      edgesOnCell_sign, edgesOnVertex_sign, rw_save, ru_save, &
       theta_m_save, exner, rr_save, scalars, tend_u_euler, tend_w_euler, tend_theta_euler, deriv_two, &
       cellsOnEdge, verticesOnEdge, edgesOnCell, edgesOnEdge, cellsOnCell, edgesOnVertex, nEdgesOnCell, nEdgesOnEdge, &
       latCell, latEdge, angleEdge, u_init, v_init, advCellsForEdge, nAdvCellsForEdge, adv_coefs, adv_coefs_3rd, &
       rdzu, rdzw, fzm, fzp, qv_init, t_init, cf1, cf2, cf3, r_earth, ur_cell, vr_cell, defc_a, defc_b, &
-      tend_w_pgf, tend_w_buoy, coef_3rd_order, c_s, config_mix_full, config_horiz_mixing, config_del4u_div_factor, &
+      deformation_coef_c2,deformation_coef_s2,deformation_coef_cs,deformation_coef_c,deformation_coef_s, &
+      tend_w_pgf, tend_w_buoy, coef_3rd_order, c_s, config_mix_full, config_mix_scalars, config_horiz_mixing, les_model_opt, &
+      les_surface_opt, prandtl_3d_inv, config_del4u_div_factor, &
       config_h_mom_eddy_visc2, config_v_mom_eddy_visc2, config_h_theta_eddy_visc2, config_v_theta_eddy_visc2, &
-      config_h_theta_eddy_visc4, config_h_mom_eddy_visc4, config_visc4_2dsmag, config_len_disp, rk_step, dt, &
+      config_h_theta_eddy_visc4, config_h_mom_eddy_visc4, config_visc4_2dsmag, config_len_disp, rk_step, dynamics_substep, dt, &
       config_mpas_cam_coef, &
       config_rayleigh_damp_u, config_rayleigh_damp_u_timescale_days, &
       config_number_rayleigh_damp_u_levels, config_number_cam_damping_levels, &
-      rthdynten, &
+      config_surface_heat_flux, config_surface_moisture_flux, config_surface_drag_coefficient, &
+      rthdynten, ustm, hfx, qfx, &
       cellStart, cellEnd, vertexStart, vertexEnd, edgeStart, edgeEnd, &
       cellSolveStart, cellSolveEnd, vertexSolveStart, vertexSolveEnd, edgeSolveStart, edgeSolveEnd)
 
 
       use mpas_atm_dimensions
+      use mpas_atm_dissipation_models, only : LES_MODEL_NONE
 
 
       implicit none
@@ -5011,7 +6060,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       ! Dummy arguments
       !
       integer :: nCells, nEdges, nVertices, nVertLevels_dummy, nCellsSolve, nEdgesSolve, vertexDegree, &
-                 maxEdges_dummy, maxEdges2_dummy, num_scalars_dummy, moist_start, moist_end
+                 maxEdges_dummy, maxEdges2_dummy, num_scalars_dummy, index_qv, index_qc, moist_start, moist_end, index_tke
 
       real (kind=RKIND), dimension(nEdges+1) :: fEdge
       real (kind=RKIND), dimension(nEdges+1) :: dvEdge
@@ -5050,7 +6099,11 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       real (kind=RKIND), dimension(nVertLevels,nEdges+1) :: zxu
       real (kind=RKIND), dimension(nVertLevels,nEdges+1) :: cqu
       real (kind=RKIND), dimension(nVertLevels,nCells+1) :: h_divergence
-      real (kind=RKIND), dimension(nVertLevels,nCells+1) :: kdiff
+      real (kind=RKIND), dimension(nVertLevels,nCells+1) :: eddy_visc_horz
+      real (kind=RKIND), dimension(nVertLevels,nCells+1) :: eddy_visc_vert
+      real (kind=RKIND), dimension(nVertLevels,nCells+1) :: bn2
+      real (kind=RKIND), dimension(:) :: ustm, hfx, qfx
+      real (kind=RKIND), dimension(nVertLevels,nCells+1) :: prandtl_3d_inv
       real (kind=RKIND), dimension(maxEdges,nCells+1) :: edgesOnCell_sign
       real (kind=RKIND), dimension(vertexDegree,nVertices+1) :: edgesOnVertex_sign
       real (kind=RKIND), dimension(nVertLevels+1,nCells+1) :: rw_save
@@ -5060,6 +6113,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       real (kind=RKIND), dimension(nVertLevels,nCells+1) :: exner
       real (kind=RKIND), dimension(nVertLevels,nCells+1) :: rr_save
       real (kind=RKIND), dimension(num_scalars,nVertLevels,nCells+1) :: scalars
+      real (kind=RKIND), dimension(num_scalars,nVertLevels,nCells+1) :: tend_scalars
       real (kind=RKIND), dimension(nVertLevels,nEdges+1) :: tend_u_euler
       real (kind=RKIND), dimension(nVertLevels+1,nCells+1) :: tend_w_euler
       real (kind=RKIND), dimension(nVertLevels,nCells+1) :: tend_theta_euler
@@ -5098,13 +6152,20 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
 
       real (kind=RKIND), dimension(maxEdges,nCells+1) :: defc_a
       real (kind=RKIND), dimension(maxEdges,nCells+1) :: defc_b
+      real (kind=RKIND), dimension(maxEdges,nCells+1) :: deformation_coef_c2, deformation_coef_s2, deformation_coef_cs
+      real (kind=RKIND), dimension(maxEdges,nCells+1) :: deformation_coef_c, deformation_coef_s
 
       real (kind=RKIND), dimension(nVertLevels+1,nCells+1) :: tend_w_pgf
       real (kind=RKIND), dimension(nVertLevels+1,nCells+1) :: tend_w_buoy
 
       real (kind=RKIND) :: coef_3rd_order, c_s
-      logical :: config_mix_full
+      logical :: config_mix_full, config_mix_scalars
       character (len=StrKIND) :: config_horiz_mixing
+      integer, intent(in) :: les_model_opt
+      integer, intent(in) :: les_surface_opt
+      real (kind=RKIND) :: config_surface_heat_flux
+      real (kind=RKIND) :: config_surface_moisture_flux
+      real (kind=RKIND) :: config_surface_drag_coefficient
       real (kind=RKIND) :: config_del4u_div_factor
       real (kind=RKIND) :: config_h_theta_eddy_visc4
       real (kind=RKIND) :: config_h_mom_eddy_visc4
@@ -5112,7 +6173,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       real (kind=RKIND) :: config_len_disp
       real (kind=RKIND) :: config_h_mom_eddy_visc2, config_v_mom_eddy_visc2, config_h_theta_eddy_visc2, config_v_theta_eddy_visc2
 
-      integer, intent(in) :: rk_step
+      integer, intent(in) :: rk_step, dynamics_substep
       real (kind=RKIND), intent(in) :: dt
 
       real (kind=RKIND) :: config_mpas_cam_coef
@@ -5136,6 +6197,10 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       real (kind=RKIND), dimension( nVertLevels+1 ) :: d_diag, d_off_diag, flux_arr
       real (kind=RKIND), dimension( nVertLevels + 1 ) :: wduz, wdwz, wdtz, dpzx
       real (kind=RKIND), dimension( nVertLevels ) :: ru_edge_w, q, u_mix
+
+!      real (kind=RKIND), dimension( nVertLevels+1 ) :: d_11, d_22, d_12
+!      real (kind=RKIND), dimension( nVertLevels+1 ) :: dudx, dudy, dvdx, dvdy
+
       real (kind=RKIND) :: theta_turb_flux, w_turb_flux, r
       real (kind=RKIND) :: scalar_weight
       real (kind=RKIND) :: inv_r_earth
@@ -5153,6 +6218,9 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       real (kind=RKIND) :: flux3, flux4
       real (kind=RKIND) :: q_im2, q_im1, q_i, q_ip1, ua, coef3
 
+      logical, parameter :: perturbation_coriolis = .true.
+      real (kind=RKIND) :: reference_u
+
       flux4(q_im2, q_im1, q_i, q_ip1, ua) =                     &
                 ua*( 7.*(q_i + q_im1) - (q_ip1 + q_im2) )/12.0
 
@@ -5162,43 +6230,14 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
 
 
       MPAS_ACC_TIMER_START('atm_compute_dyn_tend_work [ACC_data_xfer]')
-      if (rk_step == 1) then
-         !$acc enter data create(tend_w_euler)
-         !$acc enter data create(tend_u_euler)
-         !$acc enter data create(tend_theta_euler)
-         !$acc enter data create(tend_rho)
-
-         !$acc enter data create(kdiff)
-         !$acc enter data copyin(tend_rho_physics)
-         !$acc enter data copyin(rb, rr_save)
-         !$acc enter data copyin(divergence, vorticity)
-         !$acc enter data copyin(v)
-         !$acc enter data copyin(u_init, v_init)
-      else
-         !$acc enter data copyin(tend_w_euler)
-         !$acc enter data copyin(tend_u_euler)
-         !$acc enter data copyin(tend_theta_euler)
-         !$acc enter data copyin(tend_rho)
+      if (les_model_opt /= LES_MODEL_NONE) then
+         !$acc enter data copyin(bn2)
       end if
-      !$acc enter data create(tend_u)
-      !$acc enter data copyin(cqu, pp, u, w, pv_edge, rho_edge, ke)
-      !$acc enter data create(h_divergence)
-      !$acc enter data copyin(ru, rw)
+      !$acc enter data copyin(ustm, hfx, qfx)
       !$acc enter data create(rayleigh_damp_coef)
-      !$acc enter data copyin(tend_ru_physics)
-      !$acc enter data create(tend_w)
-      !$acc enter data copyin(rho_zz)
-      !$acc enter data create(tend_theta)
-      !$acc enter data copyin(theta_m)
-      !$acc enter data copyin(ru_save, theta_m_save)
-      !$acc enter data copyin(cqw)
-      !$acc enter data copyin(tend_rtheta_physics)
-      !$acc enter data copyin(rw_save, rt_diabatic_tend)
-      !$acc enter data create(rthdynten)
-      !$acc enter data copyin(t_init)
-#ifdef CURVATURE
-      !$acc enter data copyin(ur_cell, vr_cell)
-#endif
+      !$acc enter data create(eddy_visc_horz)
+      !$acc enter data create(eddy_visc_vert)
+      !$acc enter data create(prandtl_3d_inv)
       MPAS_ACC_TIMER_STOP('atm_compute_dyn_tend_work [ACC_data_xfer]')
 
       prandtl_inv = 1.0_RKIND / prandtl
@@ -5221,57 +6260,51 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
          !$acc end parallel
 
          ! Smagorinsky eddy viscosity, based on horizontal deformation (in this case on model coordinate surfaces).
-         ! The integration coefficients were precomputed and stored in defc_a and defc_b
+         ! The integration coefficients were precomputed and stored in deformation_coef_*
 
-         if(config_horiz_mixing == "2d_smagorinsky") then
+         if(les_model_opt == LES_MODEL_NONE) then
 
-            !$acc parallel default(present)
-            !$acc loop gang worker private(d_diag, d_off_diag)
-            do iCell = cellStart,cellEnd
+            if(config_horiz_mixing == "2d_smagorinsky") then
 
-               !$acc loop vector
-               do k = 1, nVertLevels
-                  d_diag(k) = 0.0_RKIND
-                  d_off_diag(k) = 0.0_RKIND
-               end do
+               call smagorinsky_2d( eddy_visc_horz, u, v, c_s, config_len_disp,                        &
+                                    deformation_coef_c2, deformation_coef_s2, deformation_coef_cs,     &
+                                    invDt, h_mom_eddy_visc4, config_visc4_2dsmag, h_theta_eddy_visc4,  &
+                                    cellStart, cellEnd, nEdgesOnCell, edgesOnCell,                     &
+                                    nCells, nEdges                                                    )
 
-               !$acc loop seq
-               do iEdge=1,nEdgesOnCell(iCell)
+            else if(config_horiz_mixing == "2d_fixed") then
+
+               !$acc parallel default(present)
+               !$acc loop gang worker
+               do iCell = cellStart, cellEnd
                   !$acc loop vector
-                  do k=1,nVertLevels
-                     d_diag(k)     = d_diag(k)     + defc_a(iEdge,iCell)*u(k,EdgesOnCell(iEdge,iCell))  &
-                                                   - defc_b(iEdge,iCell)*v(k,EdgesOnCell(iEdge,iCell))
-                     d_off_diag(k) = d_off_diag(k) + defc_b(iEdge,iCell)*u(k,EdgesOnCell(iEdge,iCell))  &
-                                                   + defc_a(iEdge,iCell)*v(k,EdgesOnCell(iEdge,iCell))
+                  do k = 1, nVertLevels
+                      eddy_visc_horz(k,iCell) = config_h_theta_eddy_visc2
                   end do
                end do
-!DIR$ IVDEP
-               !$acc loop vector
-               do k=1, nVertLevels
-                  ! here is the Smagorinsky formulation, 
-                  ! followed by imposition of an upper bound on the eddy viscosity
-                  kdiff(k,iCell) = min((c_s * config_len_disp)**2 * sqrt(d_diag(k)**2 + d_off_diag(k)**2),(0.01*config_len_disp**2) * invDt)
-               end do
-            end do
-            !$acc end parallel
+               !$acc end parallel
 
-            h_mom_eddy_visc4   = config_visc4_2dsmag * config_len_disp**3
-            h_theta_eddy_visc4 = h_mom_eddy_visc4
+               h_mom_eddy_visc4 = config_h_mom_eddy_visc4
+               h_theta_eddy_visc4 = config_h_theta_eddy_visc4
 
-         else if(config_horiz_mixing == "2d_fixed") then
+            end if
 
-            !$acc parallel default(present)
-            !$acc loop gang worker
-            do iCell = cellStart, cellEnd
-               !$acc loop vector
-               do k = 1, nVertLevels
-                  kdiff(k,iCell) = config_h_theta_eddy_visc2
-               end do
-            end do
-            !$acc end parallel
+         else if (les_model_opt /= LES_MODEL_NONE) then
 
-            h_mom_eddy_visc4 = config_h_mom_eddy_visc4
-            h_theta_eddy_visc4 = config_h_theta_eddy_visc4
+            ! call mpas_log_write(' BV call, index qv, qc, tke $i $i $i ', intArgs=(/index_qv, index_qc, index_tke/))
+
+            call calculate_n2( bn2, theta_m, exner, pressure_b, pp, zgrid, scalars, index_qv, index_qc, qtot, &
+                               cellStart, cellEnd, nCells)
+      
+            call les_models( les_model_opt, les_surface_opt, dynamics_substep, eddy_visc_horz, eddy_visc_vert, &
+                             u, v, ur_cell, vr_cell,                                            &
+                             w, c_s, bn2, zgrid, config_len_disp,                               &
+                             deformation_coef_c2, deformation_coef_s2, deformation_coef_cs,     &
+                             deformation_coef_c, deformation_coef_s, prandtl_3d_inv,            &
+                             invDt, h_mom_eddy_visc4, config_visc4_2dsmag, h_theta_eddy_visc4,  &
+                             scalars, tend_scalars, index_tke, rho_zz, meshScalingDel2,         &
+                             cellStart, cellEnd, nEdgesOnCell, edgesOnCell, cellsOnEdge,        &
+                             nCells, nEdges, nVertLevels, maxEdges, num_scalars                )
 
          end if
 
@@ -5288,7 +6321,7 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
                do k = nVertLevels-config_number_cam_damping_levels + 1, nVertLevels
                   visc2cam = 4.0*2.0833*config_len_disp*config_mpas_cam_coef
                   visc2cam = visc2cam*(1.0-real(nVertLevels-k)/real(config_number_cam_damping_levels))
-                  kdiff(k  ,iCell) = max(kdiff(k  ,iCell),visc2cam)
+                  eddy_visc_horz(k,iCell) = max(eddy_visc_horz(k,iCell),visc2cam)
                end do
             end do
             !$acc end parallel
@@ -5298,8 +6331,6 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       end if
 
       ! tendency for density.
-      ! accumulate total water here for later use in w tendency calculation.
-
       ! accumulate horizontal mass-flux
 
       !$acc parallel default(present)
@@ -5381,281 +6412,112 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
             !$acc loop vector
             do k=1,nVertLevels
                tend_u_euler(k,iEdge) =  - cqu(k,iEdge)*( (pp(k,cell2)-pp(k,cell1))*invDcEdge(iEdge)/(.5*(zz(k,cell2)+zz(k,cell1))) &
-                                              -0.5*zxu(k,iEdge)*(dpdz(k,cell1)+dpdz(k,cell2)) )
-            end do
-
-         end if
-
-         ! vertical transport of u
-
-         wduz(1) = 0.
-
-         k = 2
-         wduz(k) =  0.5*( rw(k,cell1)+rw(k,cell2))*(fzm(k)*u(k,iEdge)+fzp(k)*u(k-1,iEdge))
-         !$acc loop vector
-         do k=3,nVertLevels-1
-            wduz(k) = flux3( u(k-2,iEdge),u(k-1,iEdge),u(k,iEdge),u(k+1,iEdge),0.5*(rw(k,cell1)+rw(k,cell2)), 1.0_RKIND )
-         end do
-         k = nVertLevels
-         wduz(k) =  0.5*( rw(k,cell1)+rw(k,cell2))*(fzm(k)*u(k,iEdge)+fzp(k)*u(k-1,iEdge))
-
-         wduz(nVertLevels+1) = 0.
-
-!DIR$ IVDEP
-         !$acc loop vector
-         do k=1,nVertLevels
-            tend_u(k,iEdge) = - rdzw(k)*(wduz(k+1)-wduz(k)) !  first use of tend_u
-         end do
-
-         ! Next, nonlinear Coriolis term (q) following Ringler et al JCP 2009
-
-         !$acc loop vector
-         do k=1,nVertLevels
-            q(k) = 0.0_RKIND
-         end do
-
-         !$acc loop seq
-         do j = 1,nEdgesOnEdge(iEdge)
-            eoe = edgesOnEdge(j,iEdge)
-
-            !$acc loop vector
-            do k=1,nVertLevels
-               workpv = 0.5 * (pv_edge(k,iEdge) + pv_edge(k,eoe))
-!  the original definition of pv_edge had a factor of 1/density.  We have removed that factor
-!  given that it was not integral to any conservation property of the system
-               q(k) = q(k) + weightsOnEdge(j,iEdge) * u(k,eoe) * workpv
-            end do
-         end do
-
-!DIR$ IVDEP
-         !$acc loop vector
-         do k=1,nVertLevels
-
-            ! horizontal ke gradient and vorticity terms in the vector invariant formulation
-            ! of the horizontal momentum equation
-            tend_u(k,iEdge) = tend_u(k,iEdge) + rho_edge(k,iEdge)* (q(k) - (ke(k,cell2) - ke(k,cell1))       &
-                                                                 * invDcEdge(iEdge))                            &
-                                             - u(k,iEdge)*0.5*(h_divergence(k,cell1)+h_divergence(k,cell2)) 
-#ifdef CURVATURE
-            ! curvature terms for the sphere
-            tend_u(k,iEdge) = tend_u(k,iEdge) &
-                             - 2.*omega*cos(angleEdge(iEdge))*cos(latEdge(iEdge))  &
-                               *rho_edge(k,iEdge)*.25*(w(k,cell1)+w(k+1,cell1)+w(k,cell2)+w(k+1,cell2))          & 
-                             - u(k,iEdge)*.25*(w(k+1,cell1)+w(k,cell1)+w(k,cell2)+w(k+1,cell2))                  &
-                               *rho_edge(k,iEdge) * inv_r_earth
-#endif
-         end do
-
-      end do
-      !$acc end parallel
-
-      !
-      !  horizontal mixing for u
-      !  mixing terms are integrated using forward-Euler, so this tendency is only computed in the
-      !  first Runge-Kutta substep and saved for use in later RK substeps 2 and 3.
-      !
-
-      if (rk_step == 1) then
-
-!$OMP BARRIER
-
-         ! del^4 horizontal filter.  We compute this as del^2 ( del^2 (u) ).
-         ! First, storage to hold the result from the first del^2 computation.
-
-         !$acc parallel default(present)
-         !$acc loop gang worker
-         do iEdge = edgeStart, edgeEnd
-            !$acc loop vector
-            do k = 1, nVertLevels
-               delsq_u(k,iEdge) = 0.0_RKIND
-            end do
-         end do
-         !$acc end parallel
-
-         !$acc parallel default(present)
-         !$acc loop gang worker
-         do iEdge=edgeStart,edgeEnd
-            cell1 = cellsOnEdge(1,iEdge)
-            cell2 = cellsOnEdge(2,iEdge)
-            vertex1 = verticesOnEdge(1,iEdge)
-            vertex2 = verticesOnEdge(2,iEdge)
-            r_dc = invDcEdge(iEdge)
-            r_dv = min(invDvEdge(iEdge), 4*invDcEdge(iEdge))
-
-!DIR$ IVDEP
-            !$acc loop vector
-            do k=1,nVertLevels
-
-               ! Compute diffusion, computed as \nabla divergence - k \times \nabla vorticity
-               !                    only valid for h_mom_eddy_visc4 == constant
-              u_diffusion =   ( divergence(k,cell2)  - divergence(k,cell1) ) * r_dc  &
-                              -( vorticity(k,vertex2) - vorticity(k,vertex1) ) * r_dv
-
-               delsq_u(k,iEdge) = delsq_u(k,iEdge) + u_diffusion
-
-               kdiffu = 0.5*(kdiff(k,cell1)+kdiff(k,cell2))
-
-               ! include 2nd-orer diffusion here 
-               tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) &
-                                       + rho_edge(k,iEdge)* kdiffu * u_diffusion * meshScalingDel2(iEdge)
-
-            end do
-         end do
-         !$acc end parallel
-
-         if (h_mom_eddy_visc4 > 0.0) then  ! 4th order mixing is active
-
-!$OMP BARRIER
-
-            !$acc parallel default(present)
-            !$acc loop gang worker
-            do iVertex=vertexStart,vertexEnd
-
-               !$acc loop vector
-               do k=1,nVertLevels
-                  delsq_vorticity(k,iVertex) = 0.0_RKIND
-               end do
-
-               !$acc loop seq
-               do i=1,vertexDegree
-                  iEdge = edgesOnVertex(i,iVertex)
-                  edge_sign = invAreaTriangle(iVertex) * dcEdge(iEdge) * edgesOnVertex_sign(i,iVertex)
-
-                  !$acc loop vector
-                  do k=1,nVertLevels
-                     delsq_vorticity(k,iVertex) = delsq_vorticity(k,iVertex) + edge_sign * delsq_u(k,iEdge)
-                  end do
-               end do
-            end do
-
-            !$acc loop gang worker
-            do iCell=cellStart,cellEnd
-
-               !$acc loop vector
-               do k=1,nVertLevels
-                  delsq_divergence(k,iCell) = 0.0_RKIND
-               end do
-
-               r = invAreaCell(iCell)
-
-               !$acc loop seq
-               do i=1,nEdgesOnCell(iCell)
-                  iEdge = edgesOnCell(i,iCell)
-                  edge_sign = r * dvEdge(iEdge) * edgesOnCell_sign(i,iCell)
-
-                  !$acc loop vector
-                  do k=1,nVertLevels
-                     delsq_divergence(k,iCell) = delsq_divergence(k,iCell) + edge_sign * delsq_u(k,iEdge)
-                  end do
-               end do
-            end do
-            !$acc end parallel
-         
-!$OMP BARRIER
-
-            !$acc parallel default(present)
-            !$acc loop gang worker
-            do iEdge=edgeSolveStart,edgeSolveEnd
-               cell1 = cellsOnEdge(1,iEdge)
-               cell2 = cellsOnEdge(2,iEdge)
-               vertex1 = verticesOnEdge(1,iEdge)
-               vertex2 = verticesOnEdge(2,iEdge)
-
-               u_mix_scale = meshScalingDel4(iEdge)*h_mom_eddy_visc4
-               r_dc = u_mix_scale * config_del4u_div_factor * invDcEdge(iEdge)
-               r_dv = u_mix_scale * min(invDvEdge(iEdge), 4*invDcEdge(iEdge))
+                                              -0.5*zxu(k,iEdge)*(dpdz(k,cell1)+dpdz(k,cell2)) )
+            end do
 
-!DIR$ IVDEP
-               !$acc loop vector
-               do k=1,nVertLevels
+         end if
 
-                  ! Compute diffusion, computed as \nabla divergence - k \times \nabla vorticity
-                  !                    only valid for h_mom_eddy_visc4 == constant
-                  !
-                  ! Here, we scale the diffusion on the divergence part a factor of config_del4u_div_factor 
-                  !    relative to the rotational part.  The stability constraint on the divergence component is much less
-                  !    stringent than the rotational part, and this flexibility may be useful.
-                  !
-                  u_diffusion =  rho_edge(k,iEdge) *  ( ( delsq_divergence(k,cell2)  - delsq_divergence(k,cell1) ) * r_dc  &
-                                                       -( delsq_vorticity(k,vertex2) - delsq_vorticity(k,vertex1) ) * r_dv )
-                  tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) - u_diffusion
-                  
-               end do
-            end do
-            !$acc end parallel
-         
-         end if ! 4th order mixing is active 
+         ! vertical transport of u
 
-      !
-      !  vertical mixing for u - 2nd order filter in physical (z) space
-      !
-         if ( v_mom_eddy_visc2 > 0.0 ) then
+         wduz(1) = 0.
 
-            if (config_mix_full) then  ! mix full state
+         k = 2
+         wduz(k) =  0.5*( rw(k,cell1)+rw(k,cell2))*(fzm(k)*u(k,iEdge)+fzp(k)*u(k-1,iEdge))
+         !$acc loop vector
+         do k=3,nVertLevels-1
+            wduz(k) = flux3( u(k-2,iEdge),u(k-1,iEdge),u(k,iEdge),u(k+1,iEdge),0.5*(rw(k,cell1)+rw(k,cell2)), 1.0_RKIND )
+         end do
+         k = nVertLevels
+         wduz(k) =  0.5*( rw(k,cell1)+rw(k,cell2))*(fzm(k)*u(k,iEdge)+fzp(k)*u(k-1,iEdge))
 
-               !$acc parallel default(present)
-               !$acc loop gang worker
-               do iEdge=edgeSolveStart,edgeSolveEnd
+         wduz(nVertLevels+1) = 0.
 
-                  cell1 = cellsOnEdge(1,iEdge)
-                  cell2 = cellsOnEdge(2,iEdge)
+!DIR$ IVDEP
+         !$acc loop vector
+         do k=1,nVertLevels
+            tend_u(k,iEdge) = - rdzw(k)*(wduz(k+1)-wduz(k)) !  first use of tend_u
+         end do
 
-                  !$acc loop vector
-                  do k=2,nVertLevels-1
+         ! Next, nonlinear Coriolis term (q) following Ringler et al JCP 2009
 
-                     z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2))
-                     z2 = 0.5*(zgrid(k  ,cell1)+zgrid(k  ,cell2))
-                     z3 = 0.5*(zgrid(k+1,cell1)+zgrid(k+1,cell2))
-                     z4 = 0.5*(zgrid(k+2,cell1)+zgrid(k+2,cell2))
+         !$acc loop vector
+         do k=1,nVertLevels
+            q(k) = 0.0_RKIND
+         end do
 
-                     zm = 0.5*(z1+z2)
-                     z0 = 0.5*(z2+z3)
-                     zp = 0.5*(z3+z4)
+         !$acc loop seq
+         do j = 1,nEdgesOnEdge(iEdge)
+            eoe = edgesOnEdge(j,iEdge)
 
-                     tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &
-                                        (u(k+1,iEdge)-u(k  ,iEdge))/(zp-z0)                      &
-                                       -(u(k  ,iEdge)-u(k-1,iEdge))/(z0-zm) )/(0.5*(zp-zm))
-                  end do
-               end do
-               !$acc end parallel
+            !$acc loop vector
+            do k=1,nVertLevels
+               workpv = 0.5 * (pv_edge(k,iEdge) + pv_edge(k,eoe))
+!  the original definition of pv_edge had a factor of 1/density.  We have removed that factor
+!  given that it was not integral to any conservation property of the system
+               q(k) = q(k) + weightsOnEdge(j,iEdge) * u(k,eoe) * workpv
+            end do
+         end do
 
-            else  ! idealized cases where we mix on the perturbation from the initial 1-D state
+         if (perturbation_coriolis) then  !  this is correct only for constant f
+            !$acc loop seq
+            do j = 1,nEdgesOnEdge(iEdge)
+               eoe = edgesOnEdge(j,iEdge)
 
-               !$acc parallel default(present)
-               !$acc loop gang worker private(u_mix)
-               do iEdge=edgeSolveStart,edgeSolveEnd
+               !$acc loop vector
+               do k=1,nVertLevels
+                  reference_u = u_init(k) * cos(angleEdge(eoe)) -  v_init(k) * sin(angleEdge(eoe))
+                  q(k) = q(k) - weightsOnEdge(j,iEdge) * reference_u * fEdge(iEdge)
+!                 q(k) = q(k) - weightsOnEdge(j,iEdge) * reference_u * 0.729210E-04
+               end do
+            end do
+         end if
 
-                  cell1 = cellsOnEdge(1,iEdge)
-                  cell2 = cellsOnEdge(2,iEdge)
+!DIR$ IVDEP
+         !$acc loop vector
+         do k=1,nVertLevels
 
-                  !$acc loop vector
-                  do k=1,nVertLevels
-                     u_mix(k) = u(k,iEdge) - u_init(k) * cos( angleEdge(iEdge) ) &
-                                           - v_init(k) * sin( angleEdge(iEdge) )
-                  end do
+            ! horizontal ke gradient and vorticity terms in the vector invariant formulation
+            ! of the horizontal momentum equation
+            tend_u(k,iEdge) = tend_u(k,iEdge) + rho_edge(k,iEdge)* (q(k) - (ke(k,cell2) - ke(k,cell1))       &
+                                                                 * invDcEdge(iEdge))                            &
+                                             - u(k,iEdge)*0.5*(h_divergence(k,cell1)+h_divergence(k,cell2)) 
+#ifdef CURVATURE
+            ! curvature terms for the sphere
+            tend_u(k,iEdge) = tend_u(k,iEdge) &
+                             - 2.*omega*cos(angleEdge(iEdge))*cos(latEdge(iEdge))  &
+                               *rho_edge(k,iEdge)*.25*(w(k,cell1)+w(k+1,cell1)+w(k,cell2)+w(k+1,cell2))          & 
+                             - u(k,iEdge)*.25*(w(k+1,cell1)+w(k,cell1)+w(k,cell2)+w(k+1,cell2))                  &
+                               *rho_edge(k,iEdge) * inv_r_earth
+#endif
+         end do
 
-                  !$acc loop vector
-                  do k=2,nVertLevels-1
+      end do
+      !$acc end parallel
 
-                     z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2))
-                     z2 = 0.5*(zgrid(k  ,cell1)+zgrid(k  ,cell2))
-                     z3 = 0.5*(zgrid(k+1,cell1)+zgrid(k+1,cell2))
-                     z4 = 0.5*(zgrid(k+2,cell1)+zgrid(k+2,cell2))
+      !
+      !  horizontal mixing for u
+      !  mixing terms are integrated using forward-Euler, so this tendency is only computed in the
+      !  first Runge-Kutta substep and saved for use in later RK substeps 2 and 3.
+      !
 
-                     zm = 0.5*(z1+z2)
-                     z0 = 0.5*(z2+z3)
-                     zp = 0.5*(z3+z4)
+      if (rk_step == 1) then
 
-                     tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &
-                                        (u_mix(k+1)-u_mix(k  ))/(zp-z0)                      &
-                                       -(u_mix(k  )-u_mix(k-1))/(z0-zm) )/(0.5*(zp-zm))
-                  end do
-               end do
-               !$acc end parallel
+!$OMP BARRIER
 
-            end if  ! mix perturbation state
+         call  u_dissipation_3d( edgeStart, edgeEnd, edgeSolveStart, edgeSolveEnd, vertexStart, vertexEnd,    &
+                                 cellStart, cellEnd, nCells, nEdges, nVertices, vertexDegree,                 &
+                                 cellsOnEdge, verticesOnEdge, edgesOnCell, edgesOnVertex,                     &
+                                 nEdgesOnCell, edgesOnCell_sign, edgesOnVertex_sign,                          &
+                                 invAreaCell, invAreaTriangle, invDvEdge, invDcEdge,                          &
+                                 angleEdge, dcEdge, dvEdge, meshScalingDel2, meshScalingDel4,                 &
+                                 config_mix_full, h_mom_eddy_visc4, v_mom_eddy_visc2,                         &
+                                 config_del4u_div_factor, zgrid,                                              &
+                                 eddy_visc_horz, eddy_visc_vert, zz, rdzu, rdzw,                              &
+                                 fzm, fzp, les_model_opt, les_surface_opt,                                    &
+                                 config_surface_drag_coefficient,                                             &
+                                 delsq_u, delsq_vorticity, delsq_divergence,                                  &
+                                 u, v, divergence, vorticity, rho_edge, rho_zz, u_init, v_init, ustm, tend_u_euler )
 
-         end if  ! vertical mixing of horizontal momentum
 
       end if ! (rk_step 1 test for computing mixing terms)
 
@@ -5783,82 +6645,20 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
 
       if (rk_step == 1) then
 
-!  !OMP BARRIER  why is this openmp barrier here???
-
-         ! del^4 horizontal filter.  We compute this as del^2 ( del^2 (u) ).
-         !
-         ! First, storage to hold the result from the first del^2 computation.
-         !  we copied code from the theta mixing, hence the theta* names.
-
-         !$acc parallel default(present)
-         !$acc loop gang worker
-         do iCell=cellStart,cellEnd
-
-            !$acc loop vector
-            do k=1,nVertLevels
-               delsq_w(k,iCell) = 0.0_RKIND
-            end do
-
-            !$acc loop vector
-            do k=1,nVertLevels+1
-               tend_w_euler(k,iCell) = 0.0_RKIND
-            end do
-
-            r_areaCell = invAreaCell(iCell)
-
-            !$acc loop seq
-            do i=1,nEdgesOnCell(iCell)
-               iEdge = edgesOnCell(i,iCell)
-
-               edge_sign = 0.5 * r_areaCell*edgesOnCell_sign(i,iCell) * dvEdge(iEdge) * invDcEdge(iEdge)
-
-               cell1 = cellsOnEdge(1,iEdge)
-               cell2 = cellsOnEdge(2,iEdge)
-
-!DIR$ IVDEP
-              !$acc loop vector
-              do k=2,nVertLevels
-
-                  w_turb_flux =  edge_sign*(rho_edge(k,iEdge)+rho_edge(k-1,iEdge))*(w(k,cell2) - w(k,cell1))
-                  delsq_w(k,iCell) = delsq_w(k,iCell) + w_turb_flux
-                  w_turb_flux = w_turb_flux * meshScalingDel2(iEdge) * 0.25 * &
-                                  (kdiff(k,cell1)+kdiff(k,cell2)+kdiff(k-1,cell1)+kdiff(k-1,cell2))
-                  tend_w_euler(k,iCell) = tend_w_euler(k,iCell) + w_turb_flux
-               end do
-            end do
-         end do
-         !$acc end parallel
-
-!$OMP BARRIER
-
-         if (h_mom_eddy_visc4 > 0.0) then  ! 4th order mixing is active
-
-            !$acc parallel default(present)
-            !$acc loop gang worker
-            do iCell=cellSolveStart,cellSolveEnd    ! Technically updating fewer cells than before...
-
-               r_areaCell = h_mom_eddy_visc4 * invAreaCell(iCell)
-
-               !$acc loop seq
-               do i=1,nEdgesOnCell(iCell)
-                  iEdge = edgesOnCell(i,iCell)
-                  cell1 = cellsOnEdge(1,iEdge)
-                  cell2 = cellsOnEdge(2,iEdge)
-
-                  edge_sign = meshScalingDel4(iEdge)*r_areaCell*dvEdge(iEdge)*edgesOnCell_sign(i,iCell) * invDcEdge(iEdge)
-
-                  !$acc loop vector
-                  do k=2,nVertLevels
-                     tend_w_euler(k,iCell) = tend_w_euler(k,iCell) - edge_sign * (delsq_w(k,cell2) - delsq_w(k,cell1))
-                  end do
-           
-               end do
-            end do
-            !$acc end parallel
-
-         end if ! 4th order mixing is active 
-
-      end if ! horizontal mixing for w computed in first rk_step
+         call  w_dissipation_3d( cellStart, cellEnd, cellSolveStart, cellSolveEnd,         &
+                                 nCells, nEdges,                                           &
+                                 nEdgesOnCell, edgesOnCell, cellsOnEdge, edgesOnCell_sign, &
+                                 invAreaCell, invDcEdge, dvEdge,                           &
+                                 meshScalingDel2, meshScalingDel4,                         &
+                                 rdzw, rdzu,                                               &
+                                 v_mom_eddy_visc2, h_mom_eddy_visc4,                       &
+                                 delsq_w,                                                  &
+                                 w, rho_edge, rho_zz, divergence, zz,                      &
+                                 eddy_visc_horz, eddy_visc_vert,                           &
+                                 les_model_opt, les_surface_opt,                           &
+                                 tend_w_euler                                             )
+
+      end if ! mixing for w computed in first rk_step
 
 ! Note for OpenMP parallelization: We could avoid allocating the delsq_w scratch
 !   array, and just use the delsq_theta array as was previously done; however,
@@ -5911,27 +6711,6 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       end do
       !$acc end parallel
 
-      if (rk_step == 1) then
-
-         if ( v_mom_eddy_visc2 > 0.0 ) then
-
-            !$acc parallel default(present)
-            !$acc loop gang worker
-            do iCell=cellSolveStart,cellSolveEnd
-!DIR$ IVDEP
-               !$acc loop vector
-               do k=2,nVertLevels
-                  tend_w_euler(k,iCell) = tend_w_euler(k,iCell) + v_mom_eddy_visc2*0.5*(rho_zz(k,iCell)+rho_zz(k-1,iCell))*(  &
-                                           (w(k+1,iCell)-w(k  ,iCell))*rdzw(k)                              &
-                                          -(w(k  ,iCell)-w(k-1,iCell))*rdzw(k-1) )*rdzu(k)
-               end do
-            end do
-            !$acc end parallel
-
-         end if
-
-      end if ! mixing term computed first rk_step
-
       ! add in mixing terms for w
 
       !$acc parallel default(present)
@@ -6022,69 +6801,25 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
 
       if (rk_step == 1) then
 
-         !$acc parallel default(present)
-         !$acc loop gang worker
-         do iCell=cellStart,cellEnd
-
-            !$acc loop vector
-            do k=1,nVertLevels
-               delsq_theta(k,iCell) = 0.0_RKIND
-               tend_theta_euler(k,iCell) = 0.0_RKIND
-            end do
-
-            r_areaCell = invAreaCell(iCell)
-
-            !$acc loop seq
-            do i=1,nEdgesOnCell(iCell)
-               iEdge = edgesOnCell(i,iCell)
-               edge_sign = r_areaCell*edgesOnCell_sign(i,iCell) * dvEdge(iEdge) * invDcEdge(iEdge)
-               pr_scale = prandtl_inv * meshScalingDel2(iEdge)
-               cell1 = cellsOnEdge(1,iEdge)
-               cell2 = cellsOnEdge(2,iEdge)
-!DIR$ IVDEP
-               !$acc loop vector
-               do k=1,nVertLevels
-
-!  we are computing the Smagorinsky filter at more points than needed here so as to pick up the delsq_theta for 4th order filter below
-
-                  theta_turb_flux = edge_sign*(theta_m(k,cell2) - theta_m(k,cell1))*rho_edge(k,iEdge)
-                  delsq_theta(k,iCell) = delsq_theta(k,iCell) + theta_turb_flux
-                  theta_turb_flux = theta_turb_flux*0.5*(kdiff(k,cell1)+kdiff(k,cell2)) * pr_scale
-                  tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + theta_turb_flux
-
-               end do
-            end do
-          end do
-          !$acc end parallel
-
-!$OMP BARRIER
-            
-         if (h_theta_eddy_visc4 > 0.0) then  ! 4th order mixing is active
-
-            !$acc parallel default(present)
-            !$acc loop gang worker
-            do iCell=cellSolveStart,cellSolveEnd    ! Technically updating fewer cells than before...
-
-               r_areaCell = h_theta_eddy_visc4 * prandtl_inv * invAreaCell(iCell)
-
-               !$acc loop seq
-               do i=1,nEdgesOnCell(iCell)
-
-                  iEdge = edgesOnCell(i,iCell)
-                  edge_sign = meshScalingDel4(iEdge)*r_areaCell*dvEdge(iEdge)*edgesOnCell_sign(i,iCell)*invDcEdge(iEdge)
-
-                  cell1 = cellsOnEdge(1,iEdge)
-                  cell2 = cellsOnEdge(2,iEdge)
-
-                  !$acc loop vector
-                  do k=1,nVertLevels
-                     tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) - edge_sign*(delsq_theta(k,cell2) - delsq_theta(k,cell1))
-                  end do
-               end do
-            end do
-            !$acc end parallel
-
-         end if ! 4th order mixing is active 
+         call scalar_dissipation_3d_les( cellStart, cellEnd, cellSolveStart, cellSolveEnd,         &
+                                         nCells, nEdges,                                           &
+                                         nEdgesOnCell, edgesOnCell, cellsOnEdge, edgesOnCell_sign, &
+                                         invAreaCell, invDcEdge, dvEdge,                           &
+                                         meshScalingDel2, meshScalingDel4,                         &
+                                         config_mix_full, t_init, zgrid,                           &
+                                         rdzw, rdzu, fzm, fzp,                                     &
+                                         v_theta_eddy_visc2, h_theta_eddy_visc4, prandtl_inv,      &
+                                         prandtl_3d_inv,                                           &
+                                         delsq_theta,                                              &
+                                         theta_m, rho_edge, rho_zz, zz,                            &
+                                         eddy_visc_horz, eddy_visc_vert,                           &
+                                         bn2, config_len_disp, scalars, tend_scalars,              &
+                                         index_tke, index_qv, num_scalars, config_mix_scalars,     &
+                                         les_model_opt, les_surface_opt, clock, dt,                &
+                                         config_surface_heat_flux, config_surface_moisture_flux,   &
+                                         ur_cell, vr_cell,                                         &
+                                         hfx, qfx,                                                 &
+                                         tend_theta_euler, dynamics_substep                       )
 
       end if ! theta mixing calculated first rk_step
 
@@ -6125,116 +6860,26 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       end do
       !$acc end parallel
 
-      !
-      !  vertical mixing for theta - 2nd order 
-      !
-
-      if (rk_step == 1) then
-
-         if ( v_theta_eddy_visc2 > 0.0 ) then  ! vertical mixing for theta_m
-
-            if (config_mix_full) then
-
-               !$acc parallel default(present)
-               !$acc loop gang worker
-               do iCell = cellSolveStart,cellSolveEnd
-
-                  !$acc loop vector
-                  do k=2,nVertLevels-1
-                     z1 = zgrid(k-1,iCell)
-                     z2 = zgrid(k  ,iCell)
-                     z3 = zgrid(k+1,iCell)
-                     z4 = zgrid(k+2,iCell)
-
-                     zm = 0.5*(z1+z2)
-                     z0 = 0.5*(z2+z3)
-                     zp = 0.5*(z3+z4)
-
-                     tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&
-                                              (theta_m(k+1,iCell)-theta_m(k  ,iCell))/(zp-z0)                 &
-                                             -(theta_m(k  ,iCell)-theta_m(k-1,iCell))/(z0-zm) )/(0.5*(zp-zm))
-                  end do
-               end do
-               !$acc end parallel
-
-            else  ! idealized cases where we mix on the perturbation from the initial 1-D state
-
-               !$acc parallel default(present)
-               !$acc loop gang worker
-               do iCell = cellSolveStart,cellSolveEnd
-                  do k=2,nVertLevels-1
-                     z1 = zgrid(k-1,iCell)
-                     z2 = zgrid(k  ,iCell)
-                     z3 = zgrid(k+1,iCell)
-                     z4 = zgrid(k+2,iCell)
-
-                     zm = 0.5*(z1+z2)
-                     z0 = 0.5*(z2+z3)
-                     zp = 0.5*(z3+z4)
-
-                     tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&
-                                              ((theta_m(k+1,iCell)-t_init(k+1,iCell))-(theta_m(k  ,iCell)-t_init(k,iCell)))/(zp-z0)      &
-                                             -((theta_m(k  ,iCell)-t_init(k,iCell))-(theta_m(k-1,iCell)-t_init(k-1,iCell)))/(z0-zm) )/(0.5*(zp-zm))
-                  end do
-               end do
-               !$acc end parallel
-
-            end if
-
-         end if
-
-      end if ! compute vertical theta mixing on first rk_step
-
       !$acc parallel default(present)
       !$acc loop gang worker
       do iCell = cellSolveStart,cellSolveEnd
 !DIR$ IVDEP
          !$acc loop vector
          do k=1,nVertLevels
-!            tend_theta(k,iCell) = tend_theta(k,iCell) + tend_theta_euler(k,iCell)
             tend_theta(k,iCell) = tend_theta(k,iCell) + tend_theta_euler(k,iCell) + tend_rtheta_physics(k,iCell)
          end do
       end do
       !$acc end parallel
 
       MPAS_ACC_TIMER_START('atm_compute_dyn_tend_work [ACC_data_xfer]')
-      if (rk_step == 1) then
-         !$acc exit data copyout(tend_w_euler)
-         !$acc exit data copyout(tend_u_euler)
-         !$acc exit data copyout(tend_theta_euler)
-         !$acc exit data copyout(tend_rho)
-
-         !$acc exit data delete(kdiff)
-         !$acc exit data delete(tend_rho_physics)
-         !$acc exit data delete(rb, rr_save)
-         !$acc exit data delete(divergence, vorticity)
-         !$acc exit data delete(v)
-         !$acc exit data delete(u_init, v_init)
-      else
-         !$acc exit data delete(tend_w_euler)
-         !$acc exit data delete(tend_u_euler)
-         !$acc exit data delete(tend_theta_euler)
-         !$acc exit data delete(tend_rho)
+      if (les_model_opt /= LES_MODEL_NONE) then
+         !$acc exit data copyout(bn2)
       end if
-      !$acc exit data copyout(tend_u)
-      !$acc exit data delete(cqu, pp, u, w, pv_edge, rho_edge, ke)
-      !$acc exit data copyout(h_divergence)
-      !$acc exit data delete(ru, rw)
+      !$acc exit data delete(ustm, hfx, qfx)
       !$acc exit data delete(rayleigh_damp_coef)
-      !$acc exit data delete(tend_ru_physics)
-      !$acc exit data copyout(tend_w)
-      !$acc exit data delete(rho_zz)
-      !$acc exit data copyout(tend_theta)
-      !$acc exit data delete(theta_m)
-      !$acc exit data delete(ru_save, theta_m_save)
-      !$acc exit data delete(cqw)
-      !$acc exit data delete(tend_rtheta_physics)
-      !$acc exit data delete(rw_save, rt_diabatic_tend)
-      !$acc exit data copyout(rthdynten)
-      !$acc exit data delete(t_init)
-#ifdef CURVATURE
-      !$acc exit data delete(ur_cell, vr_cell)
-#endif
+      !$acc exit data delete(eddy_visc_horz)
+      !$acc exit data delete(eddy_visc_vert)
+      !$acc exit data delete(prandtl_3d_inv)
       MPAS_ACC_TIMER_STOP('atm_compute_dyn_tend_work [ACC_data_xfer]')
 
    end subroutine atm_compute_dyn_tend_work
@@ -6403,26 +7048,10 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
       logical :: reconstruct_v
 
 
-      MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-      !$acc enter data copyin(cellsOnEdge,dcEdge,dvEdge, &
-      !$acc                   edgesOnVertex,edgesOnVertex_sign,invAreaTriangle, &
-      !$acc                   nEdgesOnCell,edgesOnCell, &
-      !$acc                   edgesOnCell_sign,invAreaCell, &
-      !$acc                   invAreaTriangle,edgesOnVertex, &
-      !$acc                   verticesOnCell,kiteForCell,kiteAreasOnVertex, &
-      !$acc                   nEdgesOnEdge,edgesOnEdge,weightsOnEdge, &
-      !$acc                   fVertex, &
-      !$acc                   verticesOnEdge, &
-      !$acc                   invDvEdge,invDcEdge)
-      !$acc enter data copyin(u,h)
-      MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
 
       !
       ! Compute height on cell edges at velocity locations
       !
-      MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-      !$acc enter data create(h_edge,vorticity,divergence)
-      MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
       !$acc parallel default(present)
       !$acc loop gang
       do iEdge=edgeStart,edgeEnd
@@ -6507,9 +7136,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
       !
       ! Replace 2.0 with 2 in exponentiation to avoid outside chance that
       ! compiler will actually allow "float raised to float" operation
-      MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-      !$acc enter data create(ke)
-      MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
       !$acc parallel default(present)
       !$acc loop gang
       do iCell=cellStart,cellEnd
@@ -6604,14 +7230,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
         if(rk_step /= 3) reconstruct_v = .false.
       end if
 
-      MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-      if (reconstruct_v) then
-         !$acc enter data create(v)
-      else
-         !$acc enter data copyin(v)
-      end if
-      MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
-
       if (reconstruct_v) then
         !$acc parallel default(present)
         !$acc loop gang
@@ -6639,9 +7257,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
       !
       ! Avoid dividing h_vertex by areaTriangle and move areaTriangle into
       ! numerator for the pv_vertex calculation
-      MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-      !$acc enter data create(pv_vertex)
-      MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
       !$acc parallel default(present)
       !$acc loop collapse(2)
       do iVertex = vertexStart,vertexEnd
@@ -6665,9 +7280,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
       ! Compute pv at the edges
       !   ( this computes pv_edge at all edges bounding real cells )
       !
-      MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-      !$acc enter data create(pv_edge)
-      MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
       !$acc parallel default(present)
       !$acc loop collapse(2)
       do iEdge = edgeStart,edgeEnd
@@ -6685,9 +7297,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
          !    ( this computes pv_cell for all real cells )
          !  only needed for APVM upwinding
          !
-         MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-         !$acc enter data create(pv_cell)
-         MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
          !$acc parallel default(present)
          !$acc loop gang
          do iCell=cellStart,cellEnd
@@ -6726,9 +7335,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
          ! Merged loops for calculating gradPVt, gradPVn and pv_edge
          ! Also precomputed inverses of dvEdge and dcEdge to avoid repeated divisions
          !
-         MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-         !$acc enter data create(gradPVt,gradPVn)
-         MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
          r = config_apvm_upwinding * dt
          !$acc parallel default(present)
          !$acc loop gang
@@ -6745,31 +7351,10 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
          end do
          !$acc end parallel
 
-         MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-         !$acc exit data delete(pv_cell,gradPVt,gradPVn)
-         MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
 
       end if  ! apvm upwinding
 
 
-      MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-      !$acc exit data delete(cellsOnEdge,dcEdge,dvEdge, &
-      !$acc                  edgesOnVertex,edgesOnVertex_sign,invAreaTriangle, &
-      !$acc                  nEdgesOnCell,edgesOnCell, &
-      !$acc                  edgesOnCell_sign,invAreaCell, &
-      !$acc                  invAreaTriangle,edgesOnVertex, &
-      !$acc                  verticesOnCell,kiteForCell,kiteAreasOnVertex, &
-      !$acc                  nEdgesOnEdge,edgesOnEdge,weightsOnEdge, &
-      !$acc                  verticesOnEdge, &
-      !$acc                  fVertex,invDvEdge,invDcEdge)
-      !$acc exit data delete(u,h)
-      !$acc exit data copyout(h_edge,vorticity,divergence, &
-      !$acc                   ke, &
-      !$acc                   v, &
-      !$acc                   pv_vertex, &
-      !$acc                   pv_edge)
-      MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
-
    end subroutine atm_compute_solve_diagnostics_work
 
 
@@ -6858,17 +7443,13 @@ subroutine atm_init_coupled_diagnostics(state, time_lev, diag, mesh, configs, &
       call mpas_pool_get_array(mesh, 'zb3_cell', zb3_cell)
 
       MPAS_ACC_TIMER_START('atm_init_coupled_diagnostics [ACC_data_xfer]')
-      ! copyin invariant fields
-      !$acc enter data copyin(cellsOnEdge,nEdgesOnCell,edgesOnCell, &
-      !$acc                   edgesOnCell_sign,zz,fzm,fzp,zb,zb3, &
-      !$acc                   zb_cell,zb3_cell)
 
       ! copyin the data that is only on the right-hand side
-      !$acc enter data copyin(scalars(index_qv,:,:),u,w,rho,theta, &
+      !$acc enter data copyin(scalars(index_qv,:,:),w,rho,theta, &
       !$acc                           rho_base,theta_base)
 
       ! copyin the data that will be modified in this routine
-      !$acc enter data create(theta_m,rho_zz,ru,rw,rho_p,rtheta_base, &
+      !$acc enter data create(theta_m,ru,rw,rho_p,rtheta_base, &
       !$acc                   rtheta_p,exner,exner_base,pressure_p, &
       !$acc                   pressure_base)
       MPAS_ACC_TIMER_STOP('atm_init_coupled_diagnostics [ACC_data_xfer]')
@@ -6992,17 +7573,12 @@ subroutine atm_init_coupled_diagnostics(state, time_lev, diag, mesh, configs, &
       !$acc end parallel
 
       MPAS_ACC_TIMER_START('atm_init_coupled_diagnostics [ACC_data_xfer]')
-      ! delete invariant fields
-      !$acc exit data delete(cellsOnEdge,nEdgesOnCell,edgesOnCell, &
-      !$acc                  edgesOnCell_sign,zz,fzm,fzp,zb,zb3, &
-      !$acc                  zb_cell,zb3_cell)
-
       ! delete the data that is only on the right-hand side
-      !$acc exit data delete(scalars(index_qv,:,:),u,w,rho,theta, &
+      !$acc exit data delete(scalars(index_qv,:,:),w,rho,theta, &
       !$acc                           rho_base,theta_base)
 
       ! copyout the data that will be modified in this routine
-      !$acc exit data copyout(theta_m,rho_zz,ru,rw,rho_p,rtheta_base, &
+      !$acc exit data copyout(theta_m,ru,rw,rho_p,rtheta_base, &
       !$acc                   rtheta_p,exner,exner_base,pressure_p, &
       !$acc                   pressure_base)
       MPAS_ACC_TIMER_STOP('atm_init_coupled_diagnostics [ACC_data_xfer]')
@@ -7069,13 +7645,6 @@ subroutine atm_rk_dynamics_substep_finish( state, diag, nVertLevels, dynamics_su
       call mpas_pool_get_array(state, 'rho_zz', rho_zz_1, 1)
       call mpas_pool_get_array(state, 'rho_zz', rho_zz_2, 2)
 
-      MPAS_ACC_TIMER_START('atm_rk_dynamics_substep_finish [ACC_data_xfer]')
-      !$acc enter data create(ru_save, u_1, rtheta_p_save, theta_m_1, rho_p_save, rw_save, &
-      !$acc                   w_1, rho_zz_1) &
-      !$acc            copyin(ru, u_2, rtheta_p, rho_p, theta_m_2, rho_zz_2, rw, &
-      !$acc                   w_2, ruAvg, wwAvg, ruAvg_split, wwAvg_split, rho_zz_old_split)
-      MPAS_ACC_TIMER_STOP('atm_rk_dynamics_substep_finish [ACC_data_xfer]')
-
       ! Interim fix for the atm_compute_dyn_tend_work subroutine accessing uninitialized values
       ! in garbage cells of theta_m
       !$acc kernels
@@ -7180,13 +7749,6 @@ subroutine atm_rk_dynamics_substep_finish( state, diag, nVertLevels, dynamics_su
          !$acc end parallel
       end if
 
-      MPAS_ACC_TIMER_START('atm_rk_dynamics_substep_finish [ACC_data_xfer]')
-      !$acc exit data copyout(ru_save, u_1, rtheta_p_save, rho_p_save, rw_save, &
-      !$acc                   w_1, theta_m_1, rho_zz_1, ruAvg, wwAvg, ruAvg_split, &
-      !$acc                   wwAvg_split) &
-      !$acc            delete(ru, u_2, rtheta_p, rho_p, theta_m_2, rho_zz_2, rw, &
-      !$acc                  w_2, rho_zz_old_split)
-      MPAS_ACC_TIMER_STOP('atm_rk_dynamics_substep_finish [ACC_data_xfer]')
 
    end subroutine atm_rk_dynamics_substep_finish
 
@@ -7241,9 +7803,6 @@ subroutine atm_zero_gradient_w_bdy_work( w, bdyMaskCell, nearestRelaxationCell,
 
       integer :: iCell, k
 
-      MPAS_ACC_TIMER_START('atm_zero_gradient_w_bdy_work [ACC_data_xfer]')
-      !$acc enter data copyin(w)
-      MPAS_ACC_TIMER_STOP('atm_zero_gradient_w_bdy_work [ACC_data_xfer]')
 
       !$acc parallel default(present)
       !$acc loop gang worker
@@ -7259,9 +7818,6 @@ subroutine atm_zero_gradient_w_bdy_work( w, bdyMaskCell, nearestRelaxationCell,
       end do
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_zero_gradient_w_bdy_work [ACC_data_xfer]')
-      !$acc exit data copyout(w)
-      MPAS_ACC_TIMER_STOP('atm_zero_gradient_w_bdy_work [ACC_data_xfer]')
 
 
    end subroutine atm_zero_gradient_w_bdy_work
@@ -7302,11 +7858,6 @@ subroutine atm_bdy_adjust_dynamics_speczone_tend( tend, mesh, config, nVertLevel
       call mpas_pool_get_array(mesh, 'bdyMaskEdge', bdyMaskEdge)
       call mpas_pool_get_array(tend, 'rt_diabatic_tend', rt_diabatic_tend)
 
-      MPAS_ACC_TIMER_START('atm_bdy_adjust_dynamics_speczone_tend [ACC_data_xfer]')
-      !$acc enter data copyin(tend_ru,tend_rho,tend_rt,tend_rw, &
-      !$acc                   rt_diabatic_tend)
-      MPAS_ACC_TIMER_STOP('atm_bdy_adjust_dynamics_speczone_tend [ACC_data_xfer]')
-
       !$acc parallel default(present)
       !$acc loop gang worker
       do iCell = cellSolveStart, cellSolveEnd
@@ -7333,11 +7884,6 @@ subroutine atm_bdy_adjust_dynamics_speczone_tend( tend, mesh, config, nVertLevel
          end if
       end do
       !$acc end parallel
-
-      MPAS_ACC_TIMER_START('atm_bdy_adjust_dynamics_speczone_tend [ACC_data_xfer]')
-      !$acc exit data copyout(tend_ru,tend_rho,tend_rt, &
-      !$acc                   tend_rw,rt_diabatic_tend)
-      MPAS_ACC_TIMER_STOP('atm_bdy_adjust_dynamics_speczone_tend [ACC_data_xfer]')
       
     end subroutine atm_bdy_adjust_dynamics_speczone_tend
 
@@ -7423,10 +7969,7 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me
       divdamp_coef = divdamp_coef_ptr
       vertexDegree = vertexDegree_ptr
 
-      MPAS_ACC_TIMER_START('atm_bdy_adjust_dynamics_relaxzone_tend [ACC_data_xfer]')
-      !$acc enter data copyin(tend_rho, tend_rt, rho_zz, theta_m, tend_ru, ru)
       !$acc enter data create(divergence1, divergence2, vorticity1, vorticity2)
-      MPAS_ACC_TIMER_STOP('atm_bdy_adjust_dynamics_relaxzone_tend [ACC_data_xfer]')
       
       !  First, Rayleigh damping terms for ru, rtheta_m and rho_zz
       !$acc parallel default(present)
@@ -7571,11 +8114,7 @@ subroutine atm_bdy_adjust_dynamics_relaxzone_tend( config, tend, state, diag, me
        end do  !  end of loop over edges
        !$acc end parallel
 
-       MPAS_ACC_TIMER_START('atm_bdy_adjust_dynamics_relaxzone_tend [ACC_data_xfer]')
-      !$acc exit data copyout(tend_rho, tend_rt, tend_ru)
-      !$acc exit data delete(rho_zz, theta_m, ru, &
-      !$acc            divergence1, divergence2, vorticity1, vorticity2)
-      MPAS_ACC_TIMER_STOP('atm_bdy_adjust_dynamics_relaxzone_tend [ACC_data_xfer]')
+      !$acc exit data delete(divergence1, divergence2, vorticity1, vorticity2)
        
    end subroutine atm_bdy_adjust_dynamics_relaxzone_tend
       
@@ -7609,10 +8148,6 @@ subroutine atm_bdy_reset_speczone_values( state, diag, mesh, nVertLevels, &
       call mpas_pool_get_array(state, 'theta_m', theta_m, 2)
       call mpas_pool_get_array(diag, 'rtheta_p', rtheta_p)
       call mpas_pool_get_array(diag, 'rtheta_base', rtheta_base)
-
-      MPAS_ACC_TIMER_START('atm_bdy_reset_speczone_values [ACC_data_xfer]')
-      !$acc enter data copyin(rtheta_base, theta_m, rtheta_p)
-      MPAS_ACC_TIMER_STOP('atm_bdy_reset_speczone_values [ACC_data_xfer]')
       
       !$acc parallel default(present)
       !$acc loop gang worker
@@ -7627,11 +8162,6 @@ subroutine atm_bdy_reset_speczone_values( state, diag, mesh, nVertLevels, &
       end do
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_bdy_reset_speczone_values [ACC_data_xfer]')
-      !$acc exit data copyout(theta_m, rtheta_p) &
-      !$acc            delete(rtheta_base)
-      MPAS_ACC_TIMER_STOP('atm_bdy_reset_speczone_values [ACC_data_xfer]')
-
    end subroutine atm_bdy_reset_speczone_values
 
 !-------------------------------------------------------------------------
@@ -7720,10 +8250,7 @@ subroutine atm_bdy_adjust_scalars_work( scalars_new, scalars_driving, dt, dt_rk,
       integer :: iCell, iEdge, iScalar, i, k, cell1, cell2
 
       !---
-      MPAS_ACC_TIMER_START('atm_bdy_adjust_scalars [ACC_data_xfer]')
-      !$acc enter data create(scalars_tmp) &
-      !$acc            copyin(scalars_new)
-      MPAS_ACC_TIMER_STOP('atm_bdy_adjust_scalars [ACC_data_xfer]')
+      !$acc enter data create(scalars_tmp)
 
       !$acc parallel default(present)
       !$acc loop gang worker
@@ -7805,10 +8332,7 @@ subroutine atm_bdy_adjust_scalars_work( scalars_new, scalars_driving, dt, dt_rk,
       end do
       !$acc end parallel
 
-      MPAS_ACC_TIMER_START('atm_bdy_adjust_scalars [ACC_data_xfer]')
-      !$acc exit data delete(scalars_tmp) &
-      !$acc           copyout(scalars_new)
-      MPAS_ACC_TIMER_STOP('atm_bdy_adjust_scalars [ACC_data_xfer]')
+      !$acc exit data delete(scalars_tmp)
 
    end subroutine atm_bdy_adjust_scalars_work
 
@@ -7878,10 +8402,6 @@ subroutine atm_bdy_set_scalars_work( scalars_driving, scalars_new, &
 
       !---
 
-      MPAS_ACC_TIMER_START('atm_bdy_set_scalars_work [ACC_data_xfer]')
-      !$acc enter data copyin(scalars_new)
-      MPAS_ACC_TIMER_STOP('atm_bdy_set_scalars_work [ACC_data_xfer]')
-
       !$acc parallel default(present)
       !$acc loop gang worker
       do iCell = cellSolveStart, cellSolveEnd ! threaded over cells
@@ -7902,10 +8422,6 @@ subroutine atm_bdy_set_scalars_work( scalars_driving, scalars_new, &
 
       end do  ! updates now in temp storage
       !$acc end parallel
-
-      MPAS_ACC_TIMER_START('atm_bdy_set_scalars_work [ACC_data_xfer]')
-      !$acc exit data copyout(scalars_new)
-      MPAS_ACC_TIMER_STOP('atm_bdy_set_scalars_work [ACC_data_xfer]')
             
    end subroutine atm_bdy_set_scalars_work
 
@@ -7975,16 +8491,6 @@ subroutine summarize_timestep(domain)
       nVertLevels = nVertLevels_ptr
       num_scalars = num_scalars_ptr
 
-      MPAS_ACC_TIMER_START('summarize_timestep [ACC_data_xfer]')
-      if (config_print_detailed_minmax_vel) then
-         !$acc enter data copyin(w,u,v)
-      else if (config_print_global_minmax_vel) then
-         !$acc enter data copyin(w,u)
-      end if
-      if (config_print_global_minmax_sca) then
-         !$acc enter data copyin(scalars)
-      end if
-      MPAS_ACC_TIMER_STOP('summarize_timestep [ACC_data_xfer]')
 
       if (config_print_detailed_minmax_vel) then
          call mpas_log_write('')
@@ -8343,17 +8849,6 @@ subroutine summarize_timestep(domain)
 
       end if
 
-      MPAS_ACC_TIMER_START('summarize_timestep [ACC_data_xfer]')
-      if (config_print_detailed_minmax_vel) then
-         !$acc exit data delete(w,u,v)
-      else if (config_print_global_minmax_vel) then
-         !$acc exit data delete(w,u)
-      end if
-      if (config_print_global_minmax_sca) then
-         !$acc exit data delete(scalars)
-      end if
-      MPAS_ACC_TIMER_STOP('summarize_timestep [ACC_data_xfer]')
-
    end subroutine summarize_timestep
 
 end module atm_time_integration
diff --git a/src/core_atmosphere/mpas_atm_core.F b/src/core_atmosphere/mpas_atm_core.F
index f7d04a1f0c..bae45193d5 100644
--- a/src/core_atmosphere/mpas_atm_core.F
+++ b/src/core_atmosphere/mpas_atm_core.F
@@ -18,12 +18,13 @@ module atm_core
    ! in a named group
    !
    abstract interface
-      subroutine halo_exchange_routine(domain, halo_group, ierr)
+      subroutine halo_exchange_routine(domain, halo_group, withGPUAwareMPI, ierr)
 
          use mpas_derived_types, only : domain_type
 
          type (domain_type), intent(inout) :: domain
          character(len=*), intent(in) :: halo_group
+         logical, intent(in), optional :: withGPUAwareMPI
          integer, intent(out), optional :: ierr
 
       end subroutine halo_exchange_routine
@@ -43,7 +44,8 @@ function atm_core_init(domain, startTimeStamp) result(ierr)
       use mpas_atm_dimensions, only : mpas_atm_set_dims
       use mpas_atm_diagnostics_manager, only : mpas_atm_diag_setup
       use mpas_atm_threading, only : mpas_atm_threading_init
-      use atm_time_integration, only : mpas_atm_dynamics_init
+      use atm_time_integration, only : mpas_atm_dynamics_init, &
+                              mpas_atm_pre_dynamics_h2d, mpas_atm_post_dynamics_d2h
       use mpas_timer, only : mpas_timer_start, mpas_timer_stop
       use mpas_attlist, only : mpas_modify_att
       use mpas_string_utils, only : mpas_string_replace
@@ -509,6 +511,7 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt)
       call mpas_pool_get_dimension(block % dimensions, 'edgeSolveThreadStart', edgeSolveThreadStart)
       call mpas_pool_get_dimension(block % dimensions, 'edgeSolveThreadEnd', edgeSolveThreadEnd)
 
+      call mpas_atm_pre_computesolvediag_h2d(block)
 !$OMP PARALLEL DO
       do thread=1,nThreads
          if (.not. config_do_restart .or. (config_do_restart .and. config_do_DAcycling)) then
@@ -527,6 +530,7 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt)
                                             edgeThreadStart(thread), edgeThreadEnd(thread))
       end do
 !$OMP END PARALLEL DO
+      call mpas_atm_post_computesolvediag_d2h(block)
 
       deallocate(ke_vertex)
       deallocate(ke_edge)
@@ -540,13 +544,18 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt)
       call mpas_pool_get_array(diag, 'uReconstructZ', uReconstructZ)
       call mpas_pool_get_array(diag, 'uReconstructZonal', uReconstructZonal)
       call mpas_pool_get_array(diag, 'uReconstructMeridional', uReconstructMeridional)
+      call mpas_reconstruct_2d_h2d(mesh, u, uReconstructX, uReconstructY, uReconstructZ, &
+                                   uReconstructZonal, uReconstructMeridional)
       call mpas_reconstruct(mesh, u,                   &
                             uReconstructX,             &
                             uReconstructY,             &
                             uReconstructZ,             &
                             uReconstructZonal,         &
-                            uReconstructMeridional     &
+                            uReconstructMeridional,    &
+                            lACC = .true.              &
                            )
+      call mpas_reconstruct_2d_d2h(mesh, u, uReconstructX, uReconstructY, uReconstructZ, &
+               uReconstructZonal, uReconstructMeridional)
    
 #ifdef DO_PHYSICS
       !proceed with initialization of physics parameterization if moist_physics is set to true:
diff --git a/src/core_atmosphere/mpas_atm_halos.F b/src/core_atmosphere/mpas_atm_halos.F
index df02ee30a2..19c4a5be46 100644
--- a/src/core_atmosphere/mpas_atm_halos.F
+++ b/src/core_atmosphere/mpas_atm_halos.F
@@ -15,12 +15,13 @@ module mpas_atm_halos
    ! in a named group
    !
    abstract interface
-      subroutine halo_exchange_routine(domain, halo_group, ierr)
+      subroutine halo_exchange_routine(domain, halo_group, withGPUAwareMPI, ierr)
 
          use mpas_derived_types, only : domain_type
 
          type (domain_type), intent(inout) :: domain
          character(len=*), intent(in) :: halo_group
+         logical, intent(in), optional :: withGPUAwareMPI
          integer, intent(out), optional :: ierr
 
       end subroutine halo_exchange_routine
@@ -61,18 +62,23 @@ subroutine atm_build_halo_groups(domain, ierr)
 
       ! Local variables
       character(len=StrKIND), pointer :: config_halo_exch_method
+      logical, pointer :: config_gpu_aware_mpi
 
 
       !
       ! Determine from the namelist option config_halo_exch_method which halo exchange method to employ
       !
       call mpas_pool_get_config(domain % blocklist % configs, 'config_halo_exch_method', config_halo_exch_method)
+      call mpas_pool_get_config(domain % blocklist % configs, 'config_gpu_aware_mpi', config_gpu_aware_mpi)
 
       if (trim(config_halo_exch_method) == 'mpas_dmpar') then
          call mpas_log_write('')
          call mpas_log_write('*** Using ''mpas_dmpar'' routines for exchanging halos')
          call mpas_log_write('')
 
+         if (config_gpu_aware_mpi) then
+            call mpas_log_write('GPU-aware MPI is not presently supported with config_halo_exch_method = mpas_dmpar',MPAS_LOG_CRIT)
+         end if
          !
          ! Set up halo exchange groups used during atmosphere core initialization
          !
diff --git a/src/core_atmosphere/physics/mpas_atmphys_interface.F b/src/core_atmosphere/physics/mpas_atmphys_interface.F
index 71e46dfcd2..76a7e4fb6d 100644
--- a/src/core_atmosphere/physics/mpas_atmphys_interface.F
+++ b/src/core_atmosphere/physics/mpas_atmphys_interface.F
@@ -6,17 +6,29 @@
 ! distributed with this code, or at http://mpas-dev.github.com/license.html
 !
 !=================================================================================================================
+
+#ifdef MPAS_OPENACC
+#define MPAS_ACC_TIMER_START(X) call mpas_timer_start(X)
+#define MPAS_ACC_TIMER_STOP(X) call mpas_timer_stop(X)
+#else
+#define MPAS_ACC_TIMER_START(X)
+#define MPAS_ACC_TIMER_STOP(X)
+#endif
+
  module mpas_atmphys_interface
  use mpas_kind_types
  use mpas_pool_routines
 
  use mpas_atmphys_constants
  use mpas_atmphys_vars
+ use mpas_timer
 
  implicit none
  private
  public:: allocate_forall_physics,   &
           deallocate_forall_physics, &
+          update_d2h_pre_microphysics, &
+          update_h2d_post_microphysics, &
           MPAS_to_physics,           &
           microphysics_from_MPAS,    &
           microphysics_to_MPAS
@@ -545,6 +557,40 @@ subroutine MPAS_to_physics(configs,mesh,state,time_lev,diag,diag_physics,its,ite
 
  end subroutine MPAS_to_physics
 
+!=================================================================================================================
+ subroutine update_d2h_pre_microphysics(configs,state,diag,time_lev)
+!=================================================================================================================
+
+!input variables:
+ type(mpas_pool_type),intent(in):: configs
+ type(mpas_pool_type),intent(in):: state
+ type(mpas_pool_type),intent(in):: diag
+
+ integer:: time_lev
+
+!local pointers:
+ real(kind=RKIND),dimension(:,:),pointer  :: exner,pressure_b,w
+ real(kind=RKIND),dimension(:,:),pointer  :: rho_zz,theta_m,pressure_p
+ real(kind=RKIND),dimension(:,:,:),pointer:: scalars
+
+
+ MPAS_ACC_TIMER_START('update_d2h_pre_microphysics [ACC_data_xfer]')
+ call mpas_pool_get_array(diag,'exner'        ,exner     )
+ call mpas_pool_get_array(diag,'pressure_base',pressure_b)
+ call mpas_pool_get_array(diag,'pressure_p'   ,pressure_p)
+
+ call mpas_pool_get_array(state,'rho_zz' ,rho_zz ,time_lev)
+ call mpas_pool_get_array(state,'theta_m',theta_m,time_lev)
+ call mpas_pool_get_array(state,'w'      ,w      ,time_lev)
+ !$acc update host(exner, pressure_b, pressure_p, rho_zz, theta_m, w)
+
+ call mpas_pool_get_array(state,'scalars',scalars,time_lev)
+ !$acc update host(scalars)
+
+ MPAS_ACC_TIMER_STOP('update_d2h_pre_microphysics [ACC_data_xfer]')
+
+end subroutine update_d2h_pre_microphysics
+
 !=================================================================================================================
  subroutine microphysics_from_MPAS(configs,mesh,state,time_lev,diag,diag_physics,tend_physics,its,ite)
 !=================================================================================================================
@@ -599,7 +645,7 @@ subroutine microphysics_from_MPAS(configs,mesh,state,time_lev,diag,diag_physics,
  call mpas_pool_get_dimension(state,'index_qv',index_qv)
  call mpas_pool_get_dimension(state,'index_qc',index_qc)
  call mpas_pool_get_dimension(state,'index_qr',index_qr)
- call mpas_pool_get_array(state,'scalars',scalars,time_lev)
+ call mpas_pool_get_array(state,'scalars',scalars,time_lev) 
  qv => scalars(index_qv,:,:)
  qc => scalars(index_qc,:,:)
  qr => scalars(index_qr,:,:)
@@ -1042,6 +1088,48 @@ subroutine microphysics_to_MPAS(configs,mesh,state,time_lev,diag,diag_physics,te
 
  end subroutine microphysics_to_MPAS
 
+ !=================================================================================================================
+ subroutine update_h2d_post_microphysics(configs,state,diag,tend,time_lev)
+!=================================================================================================================
+
+!input variables:
+ type(mpas_pool_type),intent(in):: configs
+ type(mpas_pool_type),intent(in):: state
+ type(mpas_pool_type),intent(in):: diag
+ type(mpas_pool_type),intent(inout):: tend
+
+
+ integer:: time_lev
+
+!local pointers:
+ real(kind=RKIND),dimension(:,:),pointer  :: exner,exner_b,pressure_b,rtheta_p,rtheta_b
+ real(kind=RKIND),dimension(:,:),pointer  :: rho_zz,theta_m,pressure_p
+ real(kind=RKIND),dimension(:,:,:),pointer:: scalars
+ real(kind=RKIND),dimension(:,:),pointer  :: rt_diabatic_tend
+
+ call mpas_pool_get_array(diag,'exner'        ,exner     )
+ call mpas_pool_get_array(diag,'exner_base'   ,exner_b   )
+ call mpas_pool_get_array(diag,'pressure_base',pressure_b)
+ call mpas_pool_get_array(diag,'pressure_p'   ,pressure_p)
+ call mpas_pool_get_array(diag,'rtheta_base'  ,rtheta_b  )
+ call mpas_pool_get_array(diag,'rtheta_p'     ,rtheta_p  )
+
+ call mpas_pool_get_array(state,'rho_zz' ,rho_zz ,time_lev)
+ call mpas_pool_get_array(state,'theta_m',theta_m,time_lev)
+
+ call mpas_pool_get_array(state,'scalars',scalars,time_lev)
+
+ call mpas_pool_get_array(tend,'rt_diabatic_tend',rt_diabatic_tend)
+
+
+ MPAS_ACC_TIMER_START('update_h2d_post_microphysics [ACC_data_xfer]')
+ !$acc update device(exner, exner_b, pressure_b, pressure_p, rtheta_b)
+ !$acc update device(rtheta_p, rho_zz, theta_m, scalars)
+ !$acc update device(rt_diabatic_tend)
+ MPAS_ACC_TIMER_STOP('update_h2d_post_microphysics [ACC_data_xfer]')
+
+end subroutine update_h2d_post_microphysics
+
 !=================================================================================================================
  end module mpas_atmphys_interface
 !=================================================================================================================
diff --git a/src/core_atmosphere/physics/mpas_atmphys_todynamics.F b/src/core_atmosphere/physics/mpas_atmphys_todynamics.F
index 284b072851..71f37eb550 100644
--- a/src/core_atmosphere/physics/mpas_atmphys_todynamics.F
+++ b/src/core_atmosphere/physics/mpas_atmphys_todynamics.F
@@ -6,6 +6,15 @@
 ! distributed with this code, or at http://mpas-dev.github.com/license.html
 !
 !=================================================================================================================
+
+#ifdef MPAS_OPENACC
+#define MPAS_ACC_TIMER_START(X) call mpas_timer_start(X)
+#define MPAS_ACC_TIMER_STOP(X) call mpas_timer_stop(X)
+#else
+#define MPAS_ACC_TIMER_START(X)
+#define MPAS_ACC_TIMER_STOP(X)
+#endif
+
  module mpas_atmphys_todynamics
  use mpas_kind_types
  use mpas_pool_routines
@@ -13,10 +22,11 @@ module mpas_atmphys_todynamics
  use mpas_atm_dimensions
 
  use mpas_atmphys_constants, only: R_d,R_v,degrad
+ use mpas_timer
 
  implicit none
  private
- public:: physics_get_tend
+ public:: physics_get_tend, update_d2h_pre_physics_get_tend, update_h2d_post_physics_get_tend
 
 
 !Interface between the physics parameterizations and the non-hydrostatic dynamical core.
@@ -46,12 +56,13 @@ module mpas_atmphys_todynamics
 ! in a named group
 !
  abstract interface
-    subroutine halo_exchange_routine(domain, halo_group, ierr)
+    subroutine halo_exchange_routine(domain, halo_group, withGPUAwareMPI, ierr)
 
        use mpas_derived_types, only : domain_type
 
        type (domain_type), intent(inout) :: domain
        character(len=*), intent(in) :: halo_group
+       logical, intent(in), optional :: withGPUAwareMPI
        integer, intent(out), optional :: ierr
 
     end subroutine halo_exchange_routine
@@ -60,6 +71,40 @@ end subroutine halo_exchange_routine
 
  contains
 
+
+!=================================================================================================================
+ subroutine update_d2h_pre_physics_get_tend(configs,state,diag,tend)
+!=================================================================================================================
+
+!input variables:
+ type(mpas_pool_type),intent(in):: configs
+ type(mpas_pool_type),intent(in):: state
+ type(mpas_pool_type),intent(in):: diag
+ type(mpas_pool_type),intent(in):: tend
+
+!local variables:
+ real(kind=RKIND),dimension(:,:),pointer:: mass          ! time level 2 rho_zz
+ real(kind=RKIND),dimension(:,:),pointer:: mass_edge     ! diag rho_edge
+ real(kind=RKIND),dimension(:,:),pointer:: theta_m       ! time level 1
+ real(kind=RKIND),dimension(:,:,:),pointer:: scalars
+ 
+ real(kind=RKIND),dimension(:,:),pointer:: tend_u_phys
+ real(kind=RKIND),dimension(:,:,:),pointer:: tend_scalars
+
+ MPAS_ACC_TIMER_START('atm_srk3: physics ACC_data_xfer')
+ call mpas_pool_get_array(state,'theta_m' ,theta_m,1)
+ call mpas_pool_get_array(state,'scalars' ,scalars,1)
+ call mpas_pool_get_array(state,'rho_zz'  ,mass,2   )
+ call mpas_pool_get_array(diag ,'rho_edge',mass_edge)
+ call mpas_pool_get_array(diag ,'tend_u_phys',tend_u_phys)
+
+ !$acc update self(theta_m, scalars, mass, mass_edge)
+ 
+ call mpas_pool_get_array(tend,'scalars_tend',tend_scalars)
+ !$acc update self(tend_scalars) ! Probably not needed
+ MPAS_ACC_TIMER_STOP('atm_srk3: physics ACC_data_xfer')
+
+ end subroutine update_d2h_pre_physics_get_tend
  
 !=================================================================================================================
  subroutine physics_get_tend(block,mesh,state,diag,tend,tend_physics,configs,rk_step,dynamics_substep, &
@@ -245,6 +290,26 @@ subroutine physics_get_tend(block,mesh,state,diag,tend,tend_physics,configs,rk_s
 
  end subroutine physics_get_tend
 
+ !=================================================================================================================
+ subroutine update_h2d_post_physics_get_tend(configs,state,diag,tend)
+!=================================================================================================================
+
+!input variables:
+ type(mpas_pool_type),intent(in):: configs
+ type(mpas_pool_type),intent(in):: state
+ type(mpas_pool_type),intent(in):: diag
+ type(mpas_pool_type),intent(in):: tend
+
+!local variables:
+ real(kind=RKIND),dimension(:,:,:),pointer:: tend_scalars
+
+ MPAS_ACC_TIMER_START('atm_srk3: physics ACC_data_xfer')
+ call mpas_pool_get_array(tend,'scalars_tend',tend_scalars)
+ !$acc update device(tend_scalars)
+ MPAS_ACC_TIMER_STOP('atm_srk3: physics ACC_data_xfer')
+
+ end subroutine update_h2d_post_physics_get_tend
+ 
 !=================================================================================================================
  subroutine physics_get_tend_work( &
                     block,mesh,nCells,nEdges,nCellsSolve,nEdgesSolve,rk_step,dynamics_substep, &
diff --git a/src/core_init_atmosphere/Registry.xml b/src/core_init_atmosphere/Registry.xml
index cf4934a81b..b4f4622246 100644
--- a/src/core_init_atmosphere/Registry.xml
+++ b/src/core_init_atmosphere/Registry.xml
@@ -468,6 +468,11 @@
                         <var name="defc_b" packages="gwd_stage_in;vertical_stage_in;met_stage_in"/>
                         <var name="cell_gradient_coef_x" packages="gwd_stage_in;vertical_stage_in;met_stage_in"/>
                         <var name="cell_gradient_coef_y" packages="gwd_stage_in;vertical_stage_in;met_stage_in"/>
+                        <var name="deformation_coef_c2" packages="gwd_stage_in;vertical_stage_in;met_stage_in"/>
+                        <var name="deformation_coef_s2" packages="gwd_stage_in;vertical_stage_in;met_stage_in"/>
+                        <var name="deformation_coef_cs" packages="gwd_stage_in;vertical_stage_in;met_stage_in"/>
+                        <var name="deformation_coef_c" packages="gwd_stage_in;vertical_stage_in;met_stage_in"/>
+                        <var name="deformation_coef_s" packages="gwd_stage_in;vertical_stage_in;met_stage_in"/>
                         <var name="coeffs_reconstruct" packages="gwd_stage_in;vertical_stage_in;met_stage_in"/>
                         <var name="cf1" packages="met_stage_in"/>
                         <var name="cf2" packages="met_stage_in"/>
@@ -573,6 +578,11 @@
                         <var name="defc_b"/>
                         <var name="cell_gradient_coef_x"/>
                         <var name="cell_gradient_coef_y"/>
+                        <var name="deformation_coef_c2"/>
+                        <var name="deformation_coef_s2"/>
+                        <var name="deformation_coef_cs"/>
+                        <var name="deformation_coef_c"/>
+                        <var name="deformation_coef_s"/>
                         <var name="coeffs_reconstruct"/>
                         <var name="cf1" packages="vertical_stage_out;met_stage_out"/>
                         <var name="cf2" packages="vertical_stage_out;met_stage_out"/>
@@ -1114,6 +1124,21 @@
                 <var name="cell_gradient_coef_y" type="real" dimensions="maxEdges nCells" units="m^-1"
                      description="Coefficients for computing the y (meridional) derivative of a cell-centered variable"/>
 
+                <var name="deformation_coef_c2" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the cos-squared terms of the 3d deformation"/>
+ 
+                <var name="deformation_coef_s2" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the sine-squared terms of the 3d deformation"/>
+
+                <var name="deformation_coef_cs" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the cos-sine terms of the 3d deformation"/>
+
+                <var name="deformation_coef_c" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the cos-only terms of the 3d deformation"/>
+
+                <var name="deformation_coef_s" type="real" dimensions="maxEdges nCells" units="unitless"
+                     description="Coefficients for computing the sine-only terms of the 3d deformation"/>
+
                 <!-- arrays required for reconstruction of velocity field -->
                 <var name="coeffs_reconstruct" type="real" dimensions="R3 maxEdges nCells" units="unitless"
                      description="Coefficients to reconstruct velocity vectors at cell centers"/>
@@ -1171,6 +1196,8 @@
                         <var name="nwfa" array_group="number" units="nb kg^{-1}"
                              description="Gocart water-friendly aerosol number concentration"
                              packages="mp_thompson_aers_in"/>
+                        <var name="tke" array_group="turbulence" units="m^2 s^{-2}"
+                             description="Turbulent kinetic energy for the prognostic tke LES scheme"/>
                 </var_array>
 
                 <!-- Climatology for ocean mixed-layer depth -->
diff --git a/src/core_init_atmosphere/mpas_atm_advection.F b/src/core_init_atmosphere/mpas_atm_advection.F
index f4d44c984e..4852b17117 100644
--- a/src/core_init_atmosphere/mpas_atm_advection.F
+++ b/src/core_init_atmosphere/mpas_atm_advection.F
@@ -11,6 +11,7 @@ module atm_advection
    use mpas_derived_types
    use mpas_pool_routines
    use mpas_constants
+   use mpas_vector_operations
    use mpas_abort, only : mpas_dmpar_global_abort
    use mpas_log, only : mpas_log_write
 
@@ -758,10 +759,14 @@ subroutine atm_initialize_deformation_weights( mesh, nCells, on_a_sphere, sphere
 
       real (kind=RKIND), dimension(:,:), pointer :: defc_a, defc_b
       real (kind=RKIND), dimension(:,:), pointer :: cell_gradient_coef_x, cell_gradient_coef_y
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_c2, deformation_coef_s2, deformation_coef_cs
+      real (kind=RKIND), dimension(:,:), pointer :: deformation_coef_c, deformation_coef_s
       integer, dimension(:,:), pointer :: cellsOnEdge, edgesOnCell, cellsOnCell, verticesOnCell
       integer, dimension(:), pointer :: nEdgesOnCell
       real (kind=RKIND), dimension(:), pointer :: xCell, yCell, zCell
       real (kind=RKIND), dimension(:), pointer :: xVertex, yVertex, zVertex
+      real (kind=RKIND), dimension(:), pointer :: xEdge, yEdge, zEdge, angleEdge
+      real (kind=RKIND), dimension(:), pointer :: areaCell
 
       real (kind=RKIND), dimension(nCells) :: theta_abs
 
@@ -772,19 +777,32 @@ subroutine atm_initialize_deformation_weights( mesh, nCells, on_a_sphere, sphere
       integer :: iCell
       real (kind=RKIND) :: pii
       real (kind=RKIND), dimension(25) :: xp, yp
+      real (kind=RKIND) :: xe, ye
       
       real (kind=RKIND) :: length_scale
       integer, dimension(25) :: cell_list
 
-      integer :: iv
+      integer :: iv, ie
       logical :: do_the_cell
       real (kind=RKIND) :: area_cell, sint2, cost2, sint_cost, dx, dy
 
+      logical, pointer :: is_periodic
+      real(kind=RKIND), pointer :: x_period, y_period
+
+
+      call mpas_pool_get_config(mesh, 'is_periodic', is_periodic)
+      call mpas_pool_get_config(mesh, 'x_period', x_period)
+      call mpas_pool_get_config(mesh, 'y_period', y_period)
 
       call mpas_pool_get_array(mesh, 'defc_a', defc_a)
       call mpas_pool_get_array(mesh, 'defc_b', defc_b)
       call mpas_pool_get_array(mesh, 'cell_gradient_coef_x', cell_gradient_coef_x)
       call mpas_pool_get_array(mesh, 'cell_gradient_coef_y', cell_gradient_coef_y)
+      call mpas_pool_get_array(mesh, 'deformation_coef_c2', deformation_coef_c2)
+      call mpas_pool_get_array(mesh, 'deformation_coef_s2', deformation_coef_s2)
+      call mpas_pool_get_array(mesh, 'deformation_coef_cs', deformation_coef_cs)
+      call mpas_pool_get_array(mesh, 'deformation_coef_c', deformation_coef_c)
+      call mpas_pool_get_array(mesh, 'deformation_coef_s', deformation_coef_s)
       call mpas_pool_get_array(mesh, 'nEdgesOnCell', nEdgesOnCell)
       call mpas_pool_get_array(mesh, 'cellsOnEdge', cellsOnEdge)
       call mpas_pool_get_array(mesh, 'edgesOnCell', edgesOnCell)
@@ -796,9 +814,19 @@ subroutine atm_initialize_deformation_weights( mesh, nCells, on_a_sphere, sphere
       call mpas_pool_get_array(mesh, 'xVertex', xVertex)
       call mpas_pool_get_array(mesh, 'yVertex', yVertex)
       call mpas_pool_get_array(mesh, 'zVertex', zVertex)
+      call mpas_pool_get_array(mesh, 'xEdge', xEdge)
+      call mpas_pool_get_array(mesh, 'yEdge', yEdge)
+      call mpas_pool_get_array(mesh, 'zEdge', zEdge)
+      call mpas_pool_get_array(mesh, 'angleEdge', angleEdge)
+      call mpas_pool_get_array(mesh, 'areaCell', areaCell)
 
       defc_a(:,:) = 0.
       defc_b(:,:) = 0.
+      deformation_coef_c2(:,:) = 0.
+      deformation_coef_s2(:,:) = 0.
+      deformation_coef_cs(:,:) = 0.
+      deformation_coef_c(:,:) = 0.
+      deformation_coef_s(:,:) = 0.
 
       cell_gradient_coef_x(:,:) = 0.
       cell_gradient_coef_y(:,:) = 0.
@@ -888,21 +916,58 @@ subroutine atm_initialize_deformation_weights( mesh, nCells, on_a_sphere, sphere
 
          else     ! On an x-y plane
 
-            theta_abs(iCell) = 0.0
+            do i=1,n-1
+               iv = verticesOnCell(i,iCell)
+               xp(i) = mpas_fix_periodicity(xVertex(iv),xCell(iCell),x_period) - xCell(iCell)
+               yp(i) = mpas_fix_periodicity(yVertex(iv),yCell(iCell),y_period) - yCell(iCell)
+            end do
 
-            xp(1) = xCell(iCell)
-            yp(1) = yCell(iCell)
+            ! if(iCell.lt.11) then
+            !  call mpas_log_write(' setting defc coefs, cell $i', intArgs=(/iCell/))
+            !  do i=1,n-1
+            !    iv = verticesOnCell(i,iCell)
+            !    call mpas_log_write(' xp,yp,xvc,yvc, $r $r $r $r', realArgs=(/xp(i),yp(i),xVertex(iv)-xCell(iCell),yVertex(iv)-yCell(iCell)/))
+            !  end do
+            ! end if
 
-            do i=2,n
-               iv = verticesOnCell(i-1,iCell)
-               xp(i) = xVertex(iv)
-               yp(i) = yVertex(iv)
+            do i=1,n-1
+              ie = edgesOnCell(i,iCell)
+              xe = mpas_fix_periodicity(xEdge(ie),xCell(iCell),x_period) - xCell(iCell)
+              ye = mpas_fix_periodicity(yEdge(ie),yCell(iCell),y_period) - yCell(iCell)
+              thetat(i) = atan2(ye,xe)
             end do
 
+            ! if(iCell .lt. 11) then
+            !   call mpas_log_write(' edge angles, plane calc, cell $i', intArgs=(/iCell/))
+            !   do i=1,n-1
+            !     call mpas_log_write(' edge angle $r', realArgs=(/thetat(i)*180./3.1415926/))
+            !   end do
+            ! end if
+
+            theta_abs(iCell) = thetat(1)
+
          end if
 
          ! (1) compute cell area on the tangent plane used in the integrals
          ! (2) compute angle of cell edge normal vector.  here we are repurposing thetat
+         thetat(1) = theta_abs(iCell)
+
+         do i=2,n-1
+            ip1 = i+1
+            if (ip1 == n) ip1 = 1
+            thetat(i) = plane_angle( 0.0_RKIND, 0.0_RKIND, 0.0_RKIND,  &
+                                     xp(i)-xp(i-1), yp(i)-yp(i-1), 0.0_RKIND,  &
+                                     xp(ip1)-xp(i), yp(ip1)-yp(i), 0.0_RKIND,  &
+                                     0.0_RKIND, 0.0_RKIND, 1.0_RKIND)
+            thetat(i) = thetat(i) + thetat(i-1)
+         end do
+
+         !   if(iCell .lt. 11) then
+         !     call mpas_log_write(' edge angles, generic calc, cell $i', intArgs=(/iCell/))
+         !     do i=1,n-1
+         !       call mpas_log_write(' edge angle $r', realArgs=(/thetat(i)*180./3.1415926/))
+         !     end do
+         !   end if
 
          area_cell = 0.
          do i=1,n-1
@@ -927,15 +992,241 @@ subroutine atm_initialize_deformation_weights( mesh, nCells, on_a_sphere, sphere
             defc_b(i,iCell) = dl*2.*sint_cost/area_cell
             cell_gradient_coef_x(i,iCell) = dl*cos(thetat(i))/area_cell
             cell_gradient_coef_y(i,iCell) = dl*sin(thetat(i))/area_cell
+            deformation_coef_c2(i,iCell) = dl*cost2/area_cell
+            deformation_coef_s2(i,iCell) = dl*sint2/area_cell
+            deformation_coef_cs(i,iCell) = dl*sint_cost/area_cell
+            deformation_coef_c(i,iCell) = dl*cos(thetat(i))/area_cell
+            deformation_coef_s(i,iCell) = dl*sin(thetat(i))/area_cell
             if (cellsOnEdge(1,EdgesOnCell(i,iCell)) /= iCell) then
                defc_a(i,iCell) = - defc_a(i,iCell)
                defc_b(i,iCell) = - defc_b(i,iCell)
+               deformation_coef_c2(i,iCell) = - deformation_coef_c2(i,iCell)
+               deformation_coef_s2(i,iCell) = - deformation_coef_s2(i,iCell)
+               deformation_coef_cs(i,iCell) = - deformation_coef_cs(i,iCell)
+!               deformation_coef_c(i,iCell) = - deformation_coef_c(i,iCell)
+!               deformation_coef_s(i,iCell) = - deformation_coef_s(i,iCell)
             end if
  
          end do
 
       end do
 
+      call atm_init_test_coefs( deformation_coef_c2, deformation_coef_s2, &
+                                deformation_coef_cs, deformation_coef_c,  &
+                                deformation_coef_s,                       &
+                                is_periodic, on_a_sphere,                 &
+                                x_period, y_period,                       &
+                                xEdge, yEdge, zEdge,                      &
+                                xCell, yCell, zCell, nCells,              &
+                                angleEdge, nEdgesOnCell, edgesOnCell )
+
+
    end subroutine atm_initialize_deformation_weights
 
- end module atm_advection
+
+   subroutine atm_init_test_coefs( deformation_coef_c2, deformation_coef_s2, &
+                                   deformation_coef_cs, deformation_coef_c,  &
+                                   deformation_coef_s,                       &
+                                   is_periodic, on_a_sphere,                 &
+                                   x_period, y_period,                       &
+                                   xEdge, yEdge, zEdge,                      &
+                                   xCell, yCell, zCell, nCells,              &
+                                   angleEdge, nEdgesOnCell, edgesOnCell )
+
+      implicit none
+
+      logical :: is_periodic, on_a_sphere
+      integer :: nCells
+      integer, dimension(:) :: nEdgesOnCell
+      real (kind=RKIND) :: x_period, y_period
+      real (kind=RKIND), dimension(:,:) :: deformation_coef_c2, deformation_coef_s2
+      real (kind=RKIND), dimension(:,:) :: deformation_coef_cs
+      real (kind=RKIND), dimension(:,:) :: deformation_coef_c, deformation_coef_s
+      integer, dimension(:,:) :: edgesOnCell
+      real (kind=RKIND), dimension(:) :: angleEdge, xEdge, yEdge, zEdge
+      real (kind=RKIND), dimension(:) :: xCell, yCell, zCell
+
+      ! local variables
+
+      integer :: iCell, iEdge, ie
+      real (kind=RKIND) :: cos_edge, sin_edge, ux, uy, vx, vy, wx, wy
+      real (kind=RKIND) :: xc, yc, xe, ye
+      real (kind=RKIND) :: angle_e, ue, ve, we, e_int
+      real (kind=RKIND) :: dudx, dudy, dvdx, dvdy, dwdx, dwdy
+      real (kind=RKIND) :: dudx_c, dudy_c, dvdx_c, dvdy_c, dwdx_c, dwdy_c
+
+      real (kind=RKIND) :: dudx_err_max, dudy_err_max, dvdx_err_max, dvdy_err_max, dwdx_err_max, dwdy_err_max
+      real (kind=RKIND) :: dudx_err_tot, dudy_err_tot, dvdx_err_tot, dvdy_err_tot, dwdx_err_tot, dwdy_err_tot
+      real (kind=RKIND) :: dudx_max, dudy_max, dvdx_max, dvdy_max, dwdx_max, dwdy_max
+
+      real (kind=RKIND) :: ang
+      real (kind=RKIND), parameter :: x_vel= 1.0, y_vel=1.0, w_vel=1.0
+      real (kind=RKIND) :: u_edge, v_edge, w_edge, x, y, angle, xl, yl
+      real (kind=RKIND) :: dudx_cell, dudy_cell, dvdx_cell, dvdy_cell, dwdx_cell, dwdy_cell
+
+      !  Test tunction definitions
+      !
+      !  here are the velocity field functions and their derivatives.
+      !  First a simple test: U = x_vel*(-x+y), V = y_vel * (-x+y), W = w_vel*(-x+y)
+
+      u_edge(x,y,ang,xl,yl) =  (x_vel*(x+y)) * cos(ang) + (y_vel * (x+y) * sin(ang))
+      v_edge(x,y,ang,xl,yl) = -(x_vel*(x+y)) * sin(ang) + (y_vel * (x+y) * cos(ang))
+      w_edge(x,y,xl,yl) = w_vel * (x+y)
+
+      dudx_cell(x,y,xl,yl) = x_vel
+      dudy_cell(x,y,xl,yl) = x_vel
+      dvdx_cell(x,y,xl,yl) = y_vel
+      dvdy_cell(x,y,xl,yl) = y_vel
+      dwdx_cell(x,y,xl,yl) = w_vel
+      dwdy_cell(x,y,xl,yl) = w_vel
+
+      ! -----------------
+
+      if ( (.not. on_a_sphere) .and. (is_periodic) ) then  ! test is for doubly-periodic Cartesian plane only
+
+         dudx_err_max = 0.
+         dudy_err_max = 0.
+         dvdx_err_max = 0.
+         dvdy_err_max = 0.
+         dwdx_err_max = 0.
+         dwdy_err_max = 0.
+
+         dudx_err_tot = 0.
+         dudy_err_tot = 0.
+         dvdx_err_tot = 0.
+         dvdy_err_tot = 0.
+         dwdx_err_tot = 0.
+         dwdy_err_tot = 0.
+
+         dudx_max = 0.
+         dudy_max = 0.
+         dvdx_max = 0.
+         dvdy_max = 0.
+         dwdx_max = 0.
+         dwdy_max = 0.
+
+         do iCell = 1, nCells
+
+            dudx = 0.
+            dudy = 0.
+            dvdx = 0.
+            dvdy = 0.
+            dwdx = 0.
+            dwdy = 0.
+
+            xc = xCell(iCell)
+            yc = yCell(iCell)
+
+            dudx_c = dudx_cell(xc,yc,x_period,y_period)
+            dudy_c = dudy_cell(xc,yc,x_period,y_period)
+            dvdx_c = dvdx_cell(xc,yc,x_period,y_period)
+            dvdy_c = dvdy_cell(xc,yc,x_period,y_period)
+            dwdx_c = dwdx_cell(xc,yc,x_period,y_period)
+            dwdy_c = dwdy_cell(xc,yc,x_period,y_period)
+
+            do iEdge = 1, nEdgesOnCell(iCell)
+
+               ie = edgesOnCell(iEdge,iCell)
+               angle_e = angleEdge(ie)
+               xe = xEdge(ie)
+               ye = yEdge(ie)
+
+               xe = mpas_fix_periodicity(xe,xc,x_period)
+               ye = mpas_fix_periodicity(ye,yc,y_period)
+
+               ue = u_edge(xe,ye,angle_e,x_period,y_period)
+               ve = v_edge(xe,ye,angle_e,x_period,y_period)
+               we = w_edge(xe,ye,x_period,y_period)
+
+               dudx = dudx + deformation_coef_c2(iEdge,iCell)*ue &
+                           - deformation_coef_cs(iEdge,iCell)*ve
+               dudy = dudy + deformation_coef_cs(iEdge,iCell)*ue &
+                           - deformation_coef_s2(iEdge,iCell)*ve
+               dvdx = dvdx + deformation_coef_cs(iEdge,iCell)*ue &
+                           + deformation_coef_c2(iEdge,iCell)*ve
+               dvdy = dvdy + deformation_coef_s2(iEdge,iCell)*ue &
+                           + deformation_coef_cs(iEdge,iCell)*ve
+
+               dwdx = dwdx + deformation_coef_c(iEdge,iCell)*we
+               dwdy = dwdy + deformation_coef_s(iEdge,iCell)*we
+
+            end do
+
+            ! call mpas_log_write(' u_x, u_y, $r, $r ', realArgs=(/dudx, dudy/))
+            ! call mpas_log_write(' v_x, v_y, $r, $r ', realArgs=(/dvdx, dvdy/))
+            ! call mpas_log_write(' w_x, w_y, $r, $r ', realArgs=(/dwdx, dwdy/))
+
+            ! check result for cell
+
+            e_int = abs(dudx_c - dudx)
+            dudx_err_tot = dudx_err_tot + e_int
+            dudx_err_max = max(dudx_err_max, e_int)
+
+            e_int = abs(dudy_c - dudy)
+            dudy_err_tot = dudy_err_tot + e_int
+            dudy_err_max = max(dudy_err_max, e_int)
+
+            e_int = abs(dvdx_c - dvdx)
+            dvdx_err_tot = dvdx_err_tot + e_int
+            dvdx_err_max = max(dvdx_err_max, e_int)
+
+            e_int = abs(dvdy_c - dvdy)
+            dvdy_err_tot = dvdy_err_tot + e_int
+            dvdy_err_max = max(dvdy_err_max, e_int)
+
+            e_int = abs(dwdx_c - dwdx)
+            dwdx_err_tot = dwdx_err_tot + e_int
+            dwdx_err_max = max(dwdx_err_max, e_int)
+
+            e_int = abs(dwdy_c - dwdy)
+            dwdy_err_tot = dwdy_err_tot + e_int
+            dwdy_err_max = max(dwdy_err_max, e_int)
+
+            dudx_max = max(dudx_max, abs(dudx_c))
+            dudy_max = max(dudy_max, abs(dudy_c))
+            dvdx_max = max(dvdx_max, abs(dvdx_c))
+            dvdy_max = max(dvdy_max, abs(dvdy_c))
+            dwdx_max = max(dwdx_max, abs(dwdx_c))
+            dwdy_max = max(dwdy_max, abs(dwdy_c))
+
+         end do
+
+         ! scale errors
+
+         dudx_err_max = dudx_err_max/dudx_max
+         dudy_err_max = dudy_err_max/dudy_max
+         dvdx_err_max = dvdx_err_max/dvdx_max
+         dvdy_err_max = dvdy_err_max/dvdy_max
+         dwdx_err_max = dwdx_err_max/dwdx_max
+         dwdy_err_max = dwdy_err_max/dwdy_max
+
+         dudx_err_tot = dudx_err_tot/dudx_max/real(nCells)
+         dudy_err_tot = dudy_err_tot/dudy_max/real(nCells)
+         dvdx_err_tot = dvdx_err_tot/dvdx_max/real(nCells)
+         dvdy_err_tot = dvdy_err_tot/dvdy_max/real(nCells)
+         dwdx_err_tot = dwdx_err_tot/dwdx_max/real(nCells)
+         dwdy_err_tot = dwdy_err_tot/dwdy_max/real(nCells)
+
+         ! output
+
+         call mpas_log_write('  ')
+         call mpas_log_write(' deformation coefficients check ')
+         call mpas_log_write(' dudx check, max abs(dudx), max and avg error  $r, $r, $r', &
+                             realArgs=(/dudx_max, dudx_err_max, dudx_err_tot/))
+         call mpas_log_write(' dudy check, max abs(dudy), max and avg error  $r, $r, $r', &
+                             realArgs=(/dudy_max, dudy_err_max, dudy_err_tot/))
+         call mpas_log_write(' dvdx check, max abs(dvdx), max and avg error  $r, $r, $r', &
+                             realArgs=(/dvdx_max, dvdx_err_max, dvdx_err_tot/))
+         call mpas_log_write(' dvdy check, max abs(dvdy), max and avg error  $r, $r, $r', &
+                             realArgs=(/dvdy_max, dvdy_err_max, dvdy_err_tot/))
+         call mpas_log_write(' dwdx check, max abs(dwdx), max and avg error  $r, $r, $r', &
+                             realArgs=(/dwdx_max, dwdx_err_max, dwdx_err_tot/))
+         call mpas_log_write(' dwdy check, max abs(dwdy), max and avg error  $r, $r, $r', &
+                             realArgs=(/dwdy_max, dwdy_err_max, dwdy_err_tot/))
+         call mpas_log_write('  ')
+
+      end if
+
+   end subroutine atm_init_test_coefs
+
+end module atm_advection
diff --git a/src/core_init_atmosphere/mpas_init_atm_cases.F b/src/core_init_atmosphere/mpas_init_atm_cases.F
index 673ebfc525..9c372cb163 100644
--- a/src/core_init_atmosphere/mpas_init_atm_cases.F
+++ b/src/core_init_atmosphere/mpas_init_atm_cases.F
@@ -367,6 +367,27 @@ subroutine init_atm_setup_case(domain, stream_manager)
          !
          call mpas_stream_mgr_reset_alarms(stream_manager, streamID='lbc', direction=MPAS_STREAM_OUTPUT, ierr=ierr)
 
+      else if (config_init_case == 10) then
+
+         call mpas_log_write(' les test case ')
+         block_ptr => domain % blocklist
+         do while (associated(block_ptr))
+
+            call mpas_pool_get_dimension(block_ptr % dimensions, 'nCells', nCells)
+            call mpas_pool_get_dimension(block_ptr % dimensions, 'nVertLevels', nVertLevels)
+
+            call mpas_pool_get_subpool(block_ptr % structs, 'mesh', mesh)
+            call mpas_pool_get_subpool(block_ptr % structs, 'fg', fg)
+            call mpas_pool_get_subpool(block_ptr % structs, 'state', state)
+            call mpas_pool_get_subpool(block_ptr % structs, 'diag', diag)
+
+            call mpas_log_write(' calling test case setup ')
+            call init_atm_case_les(domain % dminfo, mesh, fg, nCells, nVertLevels, state, diag, config_init_case, block_ptr % configs)
+            call decouple_variables(mesh, nCells, nVertLevels, state, diag)
+            call mpas_log_write(' returned from test case setup ')
+            block_ptr => block_ptr % next
+         end do
+
       else if (config_init_case == 13 ) then
 
          call mpas_log_write(' CAM-MPAS grid ')
@@ -399,7 +420,7 @@ subroutine init_atm_setup_case(domain, stream_manager)
       else
 
          call mpas_log_write(' ***********************************************************', messageType=MPAS_LOG_ERR)
-         call mpas_log_write(' Only test cases 1 through 9 and 13 are currently supported.', messageType=MPAS_LOG_ERR)
+         call mpas_log_write(' Only test cases 1 through 10 and 13 are currently supported.', messageType=MPAS_LOG_ERR)
          call mpas_log_write(' ***********************************************************', messageType=MPAS_LOG_CRIT)
 
       end if
@@ -1408,10 +1429,11 @@ subroutine init_atm_case_squall_line(dminfo, mesh, nCells, nVertLevels, state, d
       integer, dimension(:), pointer :: nEdgesOnEdge 
       integer, dimension(:,:), pointer :: edgesOnEdge, cellsOnEdge
       real (kind=RKIND), dimension(:,:), pointer :: weightsOnEdge
-
+      real (kind=RKIND), pointer :: x_period, y_period
       integer :: iCell, iCell1, iCell2 , iEdge, ivtx, i, k, nz, nz1, itr, cell1, cell2
       integer, pointer :: nEdges, nVertices, maxEdges, nCellsSolve
       integer, pointer :: index_qv
+      integer, pointer :: index_tke
 
       real (kind=RKIND), dimension(nVertLevels + 1 ) :: zc, zw, ah
       real (kind=RKIND), dimension(nVertLevels ) :: zu, dzw, rdzwp, rdzwm
@@ -1438,6 +1460,7 @@ subroutine init_atm_case_squall_line(dminfo, mesh, nCells, nVertLevels, state, d
       real (kind=RKIND), pointer :: nominalMinDc
       logical, pointer :: on_a_sphere
       real (kind=RKIND), pointer :: sphere_radius
+      real (kind=RKIND), pointer :: config_ztop
 
       real (kind=RKIND), dimension(:,:), pointer :: t_init, w, rw, v, rho, theta
       real (kind=RKIND), dimension(:), pointer :: u_init, qv_init, angleEdge, fEdge, fVertex
@@ -1459,10 +1482,13 @@ subroutine init_atm_case_squall_line(dminfo, mesh, nCells, nVertLevels, state, d
       call mpas_pool_get_array(mesh, 'areaTriangle', areaTriangle)
       call mpas_pool_get_array(mesh, 'kiteAreasOnVertex', kiteAreasOnVertex)
       call mpas_pool_get_array(mesh, 'nominalMinDc', nominalMinDc)
-
+     
+      call mpas_pool_get_config(mesh, 'x_period', x_period)
+      call mpas_pool_get_config(mesh, 'y_period', y_period)
       call mpas_pool_get_config(mesh, 'on_a_sphere', on_a_sphere)
       call mpas_pool_get_config(mesh, 'sphere_radius', sphere_radius)
       call mpas_pool_get_config(configs, 'config_interface_projection', config_interface_projection)
+      call mpas_pool_get_config(configs, 'config_ztop', config_ztop)
 
       !
       ! Scale all distances
@@ -1485,6 +1511,8 @@ subroutine init_atm_case_squall_line(dminfo, mesh, nCells, nVertLevels, state, d
       areaTriangle(:) = areaTriangle(:) * a_scale**2.0
       kiteAreasOnVertex(:,:) = kiteAreasOnVertex(:,:) * a_scale**2.0
       nominalMinDc = nominalMinDc * a_scale
+      x_period = x_period * a_scale
+      y_period = y_period * a_scale
 
       call mpas_pool_get_array(mesh, 'weightsOnEdge', weightsOnEdge)
       call mpas_pool_get_array(mesh, 'nEdgesOnEdge', nEdgesOnEdge)
@@ -1545,6 +1573,7 @@ subroutine init_atm_case_squall_line(dminfo, mesh, nCells, nVertLevels, state, d
       call mpas_pool_get_array(state, 'scalars', scalars)
 
       call mpas_pool_get_dimension(state, 'index_qv', index_qv)
+      call mpas_pool_get_dimension(state, 'index_tke', index_tke)
 
       scalars(:,:,:) = 0.
 
@@ -1572,7 +1601,8 @@ subroutine init_atm_case_squall_line(dminfo, mesh, nCells, nVertLevels, state, d
       !     metrics for hybrid coordinate and vertical stretching
 
       str = 1.0
-      zt = 20000.
+!      zt = 20000.
+      zt = config_ztop
       dz = zt/float(nz1)
 
 !      write(0,*) ' dz = ',dz
@@ -1859,6 +1889,18 @@ subroutine init_atm_case_squall_line(dminfo, mesh, nCells, nVertLevels, state, d
            t (k,i) = thi(k,i)*(1.+1.61*scalars(index_qv,k,i))
         end do
       end do
+!
+!  initial seed for tke
+!
+!     call mpas_log_write('index_qv $i ', intArgs=(/index_qv/))
+!     call mpas_log_write('index_tke $i ', intArgs=(/index_tke/))
+      scalars(index_tke,:,:) = 0.
+
+      do k = 1,nz1  
+          do i=1,nCells
+             scalars(index_tke,k,i) = 0.1
+          end do
+      end do
 
       do itr=1,30
 
@@ -2013,6 +2055,7 @@ subroutine init_atm_case_mtn_wave(dminfo, mesh, nCells, nVertLevels, state, diag
       integer, dimension(:), pointer :: nEdgesOnEdge, nEdgesOnCell
       integer, dimension(:,:), pointer :: edgesOnEdge, cellsOnEdge, cellsOnCell
       real (kind=RKIND), dimension(:,:), pointer :: weightsOnEdge
+      real (kind=RKIND), pointer :: x_period, y_period
 
       integer :: iCell, iCell1, iCell2 , iEdge, ivtx, i, k, nz, itr, cell1, cell2, nz1
       integer, pointer :: nEdges, maxEdges, nCellsSolve, nVertices
@@ -2071,6 +2114,8 @@ subroutine init_atm_case_mtn_wave(dminfo, mesh, nCells, nVertLevels, state, diag
       call mpas_pool_get_array(mesh, 'kiteAreasOnVertex', kiteAreasOnVertex)
       call mpas_pool_get_array(mesh, 'nominalMinDc', nominalMinDc)
 
+      call mpas_pool_get_config(mesh, 'x_period', x_period)
+      call mpas_pool_get_config(mesh, 'y_period', y_period)
       call mpas_pool_get_config(mesh, 'on_a_sphere', on_a_sphere)
       call mpas_pool_get_config(mesh, 'sphere_radius', sphere_radius)
 
@@ -2112,6 +2157,8 @@ subroutine init_atm_case_mtn_wave(dminfo, mesh, nCells, nVertLevels, state, diag
       areaTriangle(:) = areaTriangle(:) * a_scale**2.0
       kiteAreasOnVertex(:,:) = kiteAreasOnVertex(:,:) * a_scale**2.0
       nominalMinDc = nominalMinDc * a_scale
+      x_period = x_period * a_scale
+      y_period = y_period * a_scale
 
       
       call mpas_pool_get_dimension(mesh, 'nEdges', nEdges)
@@ -6184,6 +6231,692 @@ subroutine init_atm_case_lbc(timestamp, block, mesh, nCells, nEdges, nVertLevels
 
    end subroutine init_atm_case_lbc
 
+!---------------------
+
+   subroutine init_atm_case_les(dminfo, mesh, fg, nCells, nVertLevels, state, diag, test_case, configs)
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Large Eddy Simulation (les) test case setup
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+      implicit none
+
+      type (dm_info), intent(in) :: dminfo
+      type (mpas_pool_type), intent(inout) :: mesh
+      type (mpas_pool_type), intent(inout) :: fg
+      integer, intent(in) :: nCells
+      integer, intent(in) :: nVertLevels
+      type (mpas_pool_type), intent(inout) :: state
+      type (mpas_pool_type), intent(inout) :: diag
+      type (mpas_pool_type), intent(inout):: configs
+
+      integer, intent(in) :: test_case
+
+      real (kind=RKIND), dimension(:), pointer :: rdzw, dzu, rdzu, fzm, fzp
+      real (kind=RKIND), dimension(:,:), pointer :: zgrid, zxu, zz, hx, cqw
+      real (kind=RKIND), dimension(:,:), pointer :: ppb, pb, rho_zz, rb, rr, tb, rtb, p, pp, dss, t, rt, u, ru
+      real (kind=RKIND), dimension(:,:,:), pointer :: scalars
+      real (kind=RKIND), dimension(:,:,:), pointer :: zb, zb3
+
+      real (kind=RKIND), dimension(:), pointer :: xland
+      integer, dimension(:), pointer :: landmask, lu_index
+
+      !This is temporary variable here. It just need when calculate tangential velocity v.
+      integer :: eoe
+      integer, dimension(:), pointer :: nEdgesOnEdge 
+      integer, dimension(:,:), pointer :: edgesOnEdge, cellsOnEdge
+      real (kind=RKIND), dimension(:,:), pointer :: weightsOnEdge
+      real (kind=RKIND), pointer :: x_period, y_period
+
+      integer :: iCell, iCell1, iCell2 , iEdge, ivtx, i, k, nz, nz1, itr, cell1, cell2
+      integer, pointer :: nEdges, nVertices, maxEdges, nCellsSolve
+      integer, pointer :: index_qv
+      integer, pointer :: index_tke
+
+      real (kind=RKIND), dimension(nVertLevels + 1 ) :: zc, zw, ah
+      real (kind=RKIND), dimension(nVertLevels ) :: zu, dzw, rdzwp, rdzwm
+
+      real (kind=RKIND), dimension(nVertLevels, nCells) :: thi, tbi, cqwb
+
+      real (kind=RKIND) ::  xnutr
+      real (kind=RKIND) ::  ztemp, zd, zt, dz, str
+
+      real (kind=RKIND), dimension(nVertLevels ) :: qvb, qvp, zg
+      real (kind=RKIND), dimension(nVertLevels ) :: t_init_1d
+
+      real (kind=RKIND) :: cof1, cof2
+      real (kind=RKIND), pointer :: cf1, cf2, cf3
+      real (kind=RKIND) :: pitop, pibtop, ptopb, ptop, rcp, rcv, p0
+      real (kind=RKIND) :: a_scale
+
+      real (kind=RKIND), dimension(:), pointer :: xCell, yCell, zCell
+      real (kind=RKIND), dimension(:), pointer :: xEdge, yEdge, zEdge
+      real (kind=RKIND), dimension(:), pointer :: xVertex, yVertex, zVertex
+      real (kind=RKIND), dimension(:), pointer :: dcEdge, dvEdge, areaCell, areaTriangle
+      real (kind=RKIND), dimension(:,:), pointer :: kiteAreasOnVertex
+      logical, pointer :: on_a_sphere
+      real (kind=RKIND), pointer :: sphere_radius
+      real (kind=RKIND), pointer :: config_ztop
+
+      real (kind=RKIND), dimension(:,:), pointer :: t_init, w, rw, v, rho, theta
+      real (kind=RKIND), dimension(:), pointer :: u_init, v_init, qv_init, angleEdge, fEdge, fVertex
+      real (kind=RKIND) :: u_vel, v_vel, randx
+
+      call mpas_pool_get_array(mesh, 'xCell', xCell)
+      call mpas_pool_get_array(mesh, 'yCell', yCell)
+      call mpas_pool_get_array(mesh, 'zCell', zCell)
+      call mpas_pool_get_array(mesh, 'xEdge', xEdge)
+      call mpas_pool_get_array(mesh, 'yEdge', yEdge)
+      call mpas_pool_get_array(mesh, 'zEdge', zEdge)
+      call mpas_pool_get_array(mesh, 'xVertex', xVertex)
+      call mpas_pool_get_array(mesh, 'yVertex', yVertex)
+      call mpas_pool_get_array(mesh, 'zVertex', zVertex)
+      call mpas_pool_get_array(mesh, 'dvEdge', dvEdge)
+      call mpas_pool_get_array(mesh, 'dcEdge', dcEdge)
+      call mpas_pool_get_array(mesh, 'areaCell', areaCell)
+      call mpas_pool_get_array(mesh, 'areaTriangle', areaTriangle)
+      call mpas_pool_get_array(mesh, 'kiteAreasOnVertex', kiteAreasOnVertex)
+
+      call mpas_pool_get_config(mesh, 'x_period', x_period)
+      call mpas_pool_get_config(mesh, 'y_period', y_period)
+      call mpas_pool_get_config(mesh, 'on_a_sphere', on_a_sphere)
+      call mpas_pool_get_config(mesh, 'sphere_radius', sphere_radius)
+      call mpas_pool_get_config(configs, 'config_ztop', config_ztop)
+
+      !
+      ! Scale all distances
+      !
+
+      a_scale = 1.0
+
+      xCell(:) = xCell(:) * a_scale
+      yCell(:) = yCell(:) * a_scale
+      zCell(:) = zCell(:) * a_scale
+      xVertex(:) = xVertex(:) * a_scale
+      yVertex(:) = yVertex(:) * a_scale
+      zVertex(:) = zVertex(:) * a_scale
+      xEdge(:) = xEdge(:) * a_scale
+      yEdge(:) = yEdge(:) * a_scale
+      zEdge(:) = zEdge(:) * a_scale
+      dvEdge(:) = dvEdge(:) * a_scale
+      dcEdge(:) = dcEdge(:) * a_scale
+      areaCell(:) = areaCell(:) * a_scale**2.0
+      areaTriangle(:) = areaTriangle(:) * a_scale**2.0
+      kiteAreasOnVertex(:,:) = kiteAreasOnVertex(:,:) * a_scale**2.0
+      x_period = x_period * a_scale
+      y_period = y_period * a_scale
+
+      call mpas_pool_get_array(mesh, 'weightsOnEdge', weightsOnEdge)
+      call mpas_pool_get_array(mesh, 'nEdgesOnEdge', nEdgesOnEdge)
+      call mpas_pool_get_array(mesh, 'edgesOnEdge', edgesOnEdge)
+      call mpas_pool_get_array(mesh, 'cellsOnEdge', cellsOnEdge)
+
+      call mpas_pool_get_dimension(mesh, 'nEdges', nEdges)
+      call mpas_pool_get_dimension(mesh, 'nVertices', nVertices)
+      call mpas_pool_get_dimension(mesh, 'nCellsSolve', nCellsSolve)
+      call mpas_pool_get_dimension(mesh, 'maxEdges', maxEdges)
+      nz1 = nVertLevels
+      nz = nz1 + 1
+
+      call mpas_pool_get_array(mesh, 'zgrid', zgrid)
+      call mpas_pool_get_array(mesh, 'zb', zb)
+      call mpas_pool_get_array(mesh, 'zb3', zb3)
+      call mpas_pool_get_array(mesh, 'rdzw', rdzw)
+      call mpas_pool_get_array(mesh, 'dzu', dzu)
+      call mpas_pool_get_array(mesh, 'rdzu', rdzu)
+      call mpas_pool_get_array(mesh, 'fzm', fzm)
+      call mpas_pool_get_array(mesh, 'fzp', fzp)
+      call mpas_pool_get_array(mesh, 'zxu', zxu)
+      call mpas_pool_get_array(mesh, 'zz', zz)
+      call mpas_pool_get_array(mesh, 'hx', hx)
+      call mpas_pool_get_array(mesh, 'dss', dss)
+      call mpas_pool_get_array(mesh, 't_init', t_init)
+      call mpas_pool_get_array(mesh, 'u_init', u_init)
+      call mpas_pool_get_array(mesh, 'v_init', v_init)
+      call mpas_pool_get_array(mesh, 'qv_init', qv_init)
+      call mpas_pool_get_array(mesh, 'angleEdge', angleEdge)
+      call mpas_pool_get_array(mesh, 'fEdge', fEdge)
+      call mpas_pool_get_array(mesh, 'fVertex', fVertex)
+
+      call mpas_pool_get_array(mesh, 'cf1', cf1)
+      call mpas_pool_get_array(mesh, 'cf2', cf2)
+      call mpas_pool_get_array(mesh, 'cf3', cf3)
+
+      call mpas_pool_get_array(diag, 'pressure_base', ppb)
+      call mpas_pool_get_array(diag, 'exner_base', pb)
+      call mpas_pool_get_array(diag, 'rho_base', rb)
+      call mpas_pool_get_array(diag, 'theta_base', tb)
+      call mpas_pool_get_array(diag, 'rtheta_base', rtb)
+      call mpas_pool_get_array(diag, 'exner', p)
+      call mpas_pool_get_array(diag, 'cqw', cqw)
+
+      call mpas_pool_get_array(diag, 'pressure_p', pp)
+      call mpas_pool_get_array(diag, 'rho_p', rr)
+      call mpas_pool_get_array(diag, 'rtheta_p', rt)
+      call mpas_pool_get_array(diag, 'ru', ru)
+      call mpas_pool_get_array(diag, 'rw', rw)
+      call mpas_pool_get_array(diag, 'v', v)
+      call mpas_pool_get_array(diag, 'rho', rho)
+      call mpas_pool_get_array(diag, 'theta', theta)
+
+      call mpas_pool_get_array(state, 'rho_zz', rho_zz)
+      call mpas_pool_get_array(state, 'theta_m', t)
+      call mpas_pool_get_array(state, 'u', u)
+      call mpas_pool_get_array(state, 'w', w)
+      call mpas_pool_get_array(state, 'scalars', scalars)
+
+      call mpas_pool_get_dimension(state, 'index_qv', index_qv)
+      call mpas_pool_get_dimension(state, 'index_tke', index_tke)
+
+      call mpas_pool_get_array(fg, 'xland', xland)
+      call mpas_pool_get_array(mesh, 'landmask', landmask)
+      call mpas_pool_get_array(mesh, 'lu_index', lu_index)
+ 
+      scalars(:,:,:) = 0.
+
+      call atm_initialize_advection_rk(mesh, nCells, nEdges, maxEdges, on_a_sphere, sphere_radius )
+      call atm_initialize_deformation_weights(mesh, nCells, on_a_sphere, sphere_radius)
+      
+      xnutr = 0.
+      zd = 12000.
+
+      p0 = 1.e+05
+      rcp = rgas/cp
+      rcv = rgas/(cp-rgas)
+
+     call mpas_log_write(' point 1 in test case setup ')
+
+! We may pass in an hx(:,:) that has been precomputed elsewhere.
+! For now it is independent of k
+
+      do iCell=1,nCells
+        do k=1,nz
+          hx(k,iCell) = 0.  ! les on a flat Cartesian plane
+        end do
+      end do
+
+!      write(0,*) ' dz = ',dz
+      call mpas_log_write(' hx computation complete ')
+
+      !     metrics for hybrid coordinate and vertical stretching
+
+      str = 1.0  ! no stretching in les case: constant dz
+      ! zt = 20000.
+      zt = config_ztop
+      dz = zt/float(nz1)
+
+
+      do k=1,nz
+
+            ! zw(k) = zt*(real(k-1)*dz/zt)**str 
+            zw(k) = float(k-1)*dz
+            zc(k) = zw(k)
+!
+!           ah(k) governs the transition between terrain-following 
+!           and pureheight coordinates
+!                ah(k) = 0 is a terrain-following coordinate
+!                ah(k) = 1 is a height coordinate
+ 
+!            ah(k) = 1.-cos(.5*pii*(k-1)*dz/zt)**6
+            ah(k) = 1.
+!            call mpas_log_write(' k, zc, zw, ah = $i $r $r $r', intArgs=(/k/), realArgs=(/zc(k),zw(k),ah(k)/))
+      end do
+      do k=1,nz1
+         dzw (k) = zw(k+1)-zw(k)
+         rdzw(k) = 1./dzw(k)
+         zu(k  ) = .5*(zw(k)+zw(k+1))
+      end do
+      do k=2,nz1
+         dzu (k)  = .5*(dzw(k)+dzw(k-1))
+         rdzu(k)  =  1./dzu(k)
+         fzp (k)  = .5* dzw(k  )/dzu(k)
+         fzm (k)  = .5* dzw(k-1)/dzu(k)
+         rdzwp(k) = dzw(k-1)/(dzw(k  )*(dzw(k)+dzw(k-1)))
+         rdzwm(k) = dzw(k  )/(dzw(k-1)*(dzw(k)+dzw(k-1)))
+      end do
+
+!**********  how are we storing cf1, cf2 and cf3?
+
+      COF1 = (2.*DZU(2)+DZU(3))/(DZU(2)+DZU(3))*DZW(1)/DZU(2) 
+      COF2 =     DZU(2)        /(DZU(2)+DZU(3))*DZW(1)/DZU(3) 
+      CF1  = FZP(2) + COF1
+      CF2  = FZM(2) - COF1 - COF2
+      CF3  = COF2       
+
+      do iCell=1,nCells
+        do k=1,nz
+            zgrid(k,iCell) = ah(k)*(zc(k)*(1.-hx(k,iCell)/zt)+hx(k,iCell)) &
+                           + (1.-ah(k)) * zc(k)
+        end do
+        do k=1,nz1
+          zz (k,iCell) = (zw(k+1)-zw(k))/(zgrid(k+1,iCell)-zgrid(k,iCell))
+        end do
+      end do
+
+      do i=1, nEdges
+        iCell1 = cellsOnEdge(1,i)
+        iCell2 = cellsOnEdge(2,i)
+        do k=1,nz1
+          zxu (k,i) = 0.5 * (zgrid(k,iCell2)-zgrid(k,iCell1) + zgrid(k+1,iCell2)-zgrid(k+1,iCell1)) / dcEdge(i)
+        end do
+      end do
+      do i=1, nCells
+        do k=1,nz1
+          ztemp = .5*(zgrid(k+1,i)+zgrid(k,i))
+          dss(k,i) = 0.
+          ztemp = zgrid(k,i)
+          if(ztemp.gt.zd+.1)  then
+             dss(k,i) = dss(k,i)+xnutr*sin(.5*pii*(ztemp-zd)/(zt-zd))**2
+          end if
+        end do
+      end do
+
+!
+! initialization
+!
+         do i=1,nCells
+            do k=1,nz1
+
+               ztemp = .5*(zgrid(k,i)+zgrid(k+1,i))
+
+               ! if(ztemp .gt. ztr) then
+               !   t (k,i) = thetar*exp(9.8*(ztemp-ztr)/(1003.*ttr))
+               !   relhum(k,i) = 0.25
+               !  else
+               !   t (k,i) = 300.+43.*(ztemp/ztr)**1.25
+               !   relhum(k,i) = (1.-0.75*(ztemp/ztr)**1.25)
+               !   if(t(k,i).lt.thetas) t(k,i) = thetas
+               ! end if
+
+               t(k,i) = atm_get_sounding('theta',ztemp)
+               scalars(index_qv,k,i) = atm_get_sounding('qv',ztemp)
+
+               tb(k,i) = t(k,i)
+               thi(k,i) = t(k,i)
+               tbi(k,i) = t(k,i)
+               cqw(k,i) = 1.
+               cqwb(k,i) = 1.
+            end do
+         end do
+
+!  set the velocity field - we are on a plane here.
+
+         do i=1, nEdges
+            cell1 = cellsOnEdge(1,i)
+            cell2 = cellsOnEdge(2,i)
+            if(cell1 <= nCellsSolve .or. cell2 <= nCellsSolve) then
+               do k=1,nz1
+                  ztemp = .25*( zgrid(k,cell1 )+zgrid(k+1,cell1 )  &
+                               +zgrid(k,cell2)+zgrid(k+1,cell2))
+                  u_vel = atm_get_sounding('u',ztemp)
+                  v_vel = atm_get_sounding('v',ztemp)
+                  u(k,i) = cos(angleEdge(i))*u_vel - sin(angleEdge(i))*v_vel
+                  if(i == 1 ) then
+                    u_init(k) = u_vel
+                    v_init(k) = v_vel
+                  end if
+               end do
+            end if
+         end do
+
+         call mpas_dmpar_bcast_reals(dminfo, nz1, u_init)
+
+!
+!    for reference sounding 
+!
+     do itr=1,30
+
+      pitop = 1.-.5*dzw(1)*gravity*(1.+scalars(index_qv,1,1))/(cp*t(1,1)*zz(1,1))
+      pibtop = 1.-.5*dzw(1)*gravity*(1.+qvb(1))/(cp*tb(1,1)*zz(1,1))
+      do k=2,nz1
+         pitop = pitop-dzu(k)*gravity/(cp*cqw(k,1)*.5*(t(k,1)+t(k-1,1))   &
+                                   *.5*(zz(k,1)+zz(k-1,1)))
+         pibtop = pibtop-dzu(k)*gravity/(cp*cqwb(k,1)*.5*(tb(k,1)+tb(k-1,1))   &
+                                   *.5*(zz(k,1)+zz(k-1,1)))
+
+         !call mpas_log_write('$i $r $r $r $r', intArgs=(/k/), realArgs=(/pitop,tb(k,1),dzu(k),tb(k,1)/))
+      end do
+      pitop = pitop-.5*dzw(nz1)*gravity*(1.+scalars(index_qv,nz1,1))/(cp*t(nz1,1)*zz(nz1,1))
+      pibtop = pibtop-.5*dzw(nz1)*gravity*(1.+qvb(nz1))/(cp*tb(nz1,1)*zz(nz1,1))
+
+      call mpas_dmpar_bcast_real(dminfo, pitop)
+      call mpas_dmpar_bcast_real(dminfo, pibtop)
+
+      ptopb = p0*pibtop**(1./rcp)
+!     call mpas_log_write('ptopb = $r', realArgs=(/0.01_RKIND*ptopb/))
+
+      do i=1, nCells
+         pb(nz1,i) = pibtop+.5*dzw(nz1)*gravity*(1.+qvb(nz1))/(cp*tb(nz1,i)*zz(nz1,i))
+         p (nz1,i) = pitop+.5*dzw(nz1)*gravity*(1.+scalars(index_qv,nz1,i))/(cp*t (nz1,i)*zz(nz1,i))
+         do k=nz1-1,1,-1
+            pb(k,i)  = pb(k+1,i) + dzu(k+1)*gravity/(cp*cqwb(k+1,i)*.5*(tb(k,i)+tb(k+1,i))   &
+                                           *.5*(zz(k,i)+zz(k+1,i)))
+            p (k,i)  = p (k+1,i) + dzu(k+1)*gravity/(cp*cqw(k+1,i)*.5*(t (k,i)+t (k+1,i))   &
+                                           *.5*(zz(k,i)+zz(k+1,i)))
+         end do
+         do k=1,nz1
+            rb (k,i) = pb(k,i)**(1./rcv)/((rgas/p0)*tb(k,i)*zz(k,i))
+            rtb(k,i) = rb(k,i)*tb(k,i)
+            rr (k,i) = p (k,i)**(1./rcv)/((rgas/p0)*t (k,i)*zz(k,i))-rb(k,i)
+            ppb(k,i) = p0*(zz(k,i)*rgas*rtb(k,i)/p0)**(cp/cv)
+         end do
+      end do
+
+     !
+     ! update water vapor mixing ratio from humidity profile
+     !
+     ! do i= 1,nCells
+     !    do k=1,nz1
+     !       temp     = p(k,i)*thi(k,i)
+     !       pres     = p0*p(k,i)**(1./rcp)
+     !       qvs      = 380.*exp(17.27*(temp-273.)/(temp-36.))/pres
+     !       scalars(index_qv,k,i) = min(0.014_RKIND,relhum(k,i)*qvs)
+     !    end do
+     ! end do
+
+      do k=1,nz1
+!*********************************************************************
+!           QVB = QV INCLUDES MOISTURE IN REFERENCE STATE
+!            qvb(k) = scalars(index_qv,k,1)
+!           QVB = 0 PRODUCES DRY REFERENCE STATE
+            qvb(k) = 0.
+            qvp(k) = scalars(index_qv,k,1)*1000.
+            zg(k) = .5*(zgrid(k,1)+zgrid(k+1,1))
+!*********************************************************************
+      end do
+
+      do i= 1,nCells
+         do k=1,nz1
+            t (k,i) = thi(k,i)*(1.+1.61*scalars(index_qv,k,i))
+            tb(k,i) = tbi(k,i)*(1.+1.61*qvb(k))
+         end do
+         do k=2,nz1
+            cqw (k,i) = 1./(1.+.5*(scalars(index_qv,k,i)+scalars(index_qv,k-1,i)))
+            cqwb(k,i) = 1./(1.+.5*(qvb(k)+qvb(k-1)))
+         end do
+      end do
+
+      end do !end of iteration loop
+
+      call mpas_log_write(' sounding ')
+      call mpas_log_write(' k,     zg,     rb,     tb,     rtb,     t,     rr,      p,    qvp')
+      do k=1,nVertLevels
+         call mpas_log_write('$i $r $r $r $r $r $r $r $r', intArgs=(/k/), realArgs=(/zg(k),rb(k,1),tb(k,1),rtb(k,1),t(k,1),rr(k,1),p(k,1),qvp(k)/))
+      end do
+
+!
+!     potential temperature perturbation
+!
+
+!      do i=1,nCells
+!        do k = 1,nz1  ! same as in WRF
+!          call random_number(randx)
+!          thi(k,i) = thi(k,i) + 0.1*(randx-0.5)
+!           t (k,i) = thi(k,i)*(1.+1.61*scalars(index_qv,k,i))
+!        end do
+!      end do
+
+      do k = 1,nz1  ! same as in WRF
+        if(zg(k) .le. 397.0) then  ! the SAS initial PBL height
+          do i=1,nCells
+            call random_number(randx)
+            thi(k,i) = thi(k,i) + 1.0*(randx-0.5)
+             t (k,i) = thi(k,i)*(1.+1.61*scalars(index_qv,k,i))
+          end do
+        end if
+      end do
+!
+!  initial seed for tke
+!
+      scalars(index_tke,:,:) = 0.
+
+      do k = 1,nz1  
+        if( zg(k) .le. 255.) then
+          do i=1,nCells
+            !! scalars(index_tke,k,i) = 0.1
+               scalars(index_tke,k,i) = 0.4*((1.-(zg(k)/255.))**3)
+          end do
+        end if
+      end do
+
+      do itr=1,30
+
+        pitop = 1.-.5*dzw(1)*gravity*(1.+scalars(index_qv,1,1))/(cp*t(1,1)*zz(1,1))
+        do k=2,nz1
+          pitop = pitop-dzu(k)*gravity/(cp*cqw(k,1)*.5*(t (k,1)+t (k-1,1)) &
+                                                  *.5*(zz(k,1)+zz(k-1,1)))
+        end do
+        pitop = pitop - .5*dzw(nz1)*gravity*(1.+scalars(index_qv,nz1,1))/(cp*t(nz1,1)*zz(nz1,1))
+        ptop = p0*pitop**(1./rcp)
+!       call mpas_log_write('ptop  = $r $r', realArgs=(/0.01_RKIND*ptop, 0.01_RKIND*ptopb/))
+
+        call mpas_dmpar_bcast_real(dminfo, ptop)
+
+        do i = 1, nCells
+
+          pp(nz1,i) = ptop-ptopb+.5*dzw(nz1)*gravity*   &
+                       (rr(nz1,i)+(rr(nz1,i)+rb(nz1,i))*scalars(index_qv,nz1,i))
+          do k=nz1-1,1,-1
+!             pp(k,i) = pp(k+1,i)+.5*dzu(k+1)*gravity*                   &
+!                            (rr(k  ,i)+(rr(k  ,i)+rb(k  ,i))*scalars(index_qv,k  ,i)  &
+!                            +rr(k+1,i)+(rr(k+1,i)+rb(k+1,i))*scalars(index_qv,k+1,i))
+               pp(k,i) = pp(k+1,i)+dzu(k+1)*gravity*(    &
+                            fzm(k+1)*(rb(k+1,i)*(scalars(index_qv,k+1,i)-qvb(k+1))    &
+                                     +rr(k+1,i)*(1.+scalars(index_qv,k+1,i)))         &
+                           +fzp(k+1)*(rb(k  ,i)*(scalars(index_qv,k  ,i)-qvb(k))      &
+                                     +rr(k  ,i)*(1.+scalars(index_qv,k  ,i))))
+          end do
+!         if (itr==1.and.i==1) then
+!            do k=1,nz1
+!               call mpas_log_write('pp-check $r', realArgs=(/pp(k,i)/))
+!            end do
+!         end if
+          do k=1,nz1
+             rt(k,i) = (pp(k,i)/(rgas*zz(k,i))                   &
+                     -rtb(k,i)*(p(k,i)-pb(k,i)))/p(k,i)       
+             p (k,i) = (zz(k,i)*(rgas/p0)*(rtb(k,i)+rt(k,i)))**rcv
+             rr(k,i) = (rt(k,i)-rb(k,i)*(t(k,i)-tb(k,i)))/t(k,i)
+          end do
+
+        end do ! loop over cells
+
+      end do !  iteration loop
+!----------------------------------------------------------------------
+!
+      do k=1,nz1
+         qv_init(k) = scalars(index_qv,k,1)
+      end do
+
+      t_init_1d(:) = t(:,1)
+      call mpas_dmpar_bcast_reals(dminfo, nz1, t_init_1d)
+      call mpas_dmpar_bcast_reals(dminfo, nz1, qv_init)
+
+      do i=1,nCells
+         do k=1,nz1
+            t_init(k,i) = t_init_1d(k)
+            rho_zz(k,i) = rb(k,i)+rr(k,i)
+         end do
+      end do
+
+      do i=1,nEdges
+        cell1 = cellsOnEdge(1,i)
+        cell2 = cellsOnEdge(2,i)
+        if(cell1 <= nCellsSolve .or. cell2 <= nCellsSolve) then
+          do k=1,nz1
+            ru (k,i)  = 0.5*(rho_zz(k,cell1)+rho_zz(k,cell2))*u(k,i)    
+          end do
+        end if
+      end do
+
+
+      !
+      !  we are assuming w and rw are zero for this initialization
+      !  i.e., no terrain
+      !
+       rw = 0.0
+       w = 0.0
+
+       zb = 0.0
+       zb3 = 0.0
+
+      !
+      ! Generate rotated Coriolis field - same settings as in WRF
+      !
+      do iEdge=1,nEdges
+         ! fEdge(iEdge) = 1.e-04
+         fEdge(iEdge) = 7.2921e-05
+      end do
+
+      do iVtx=1,nVertices
+         ! fVertex(iVtx) = 1.e-04
+         fVertex(iVtx) = 7.2921e-05
+      end do
+
+      !
+      ! Compute mass fluxes tangential to each edge (i.e., through the faces of dual grid cells)
+      !
+      v(:,:) = 0.0
+      do iEdge = 1, nEdges
+         do i=1,nEdgesOnEdge(iEdge)
+            eoe = edgesOnEdge(i,iEdge)
+            if (eoe > 0) then
+               do k = 1, nVertLevels
+                 v(k,iEdge) = v(k,iEdge) + weightsOnEdge(i,iEdge) * u(k, eoe)
+              end do
+            end if
+         end do
+      end do
+
+     ! call mpas_log_write(' k,u_init, t_init, qv_init ')
+     ! do k=1,nVertLevels
+     !   call mpas_log_write('$i $r $r $r', intArgs=(/k/), realArgs=(/u_init(k),t_init(k,1),qv_init(k)/))
+     ! end do
+
+      ! Compute rho and theta from rho_zz and theta_m
+      do iCell=1,nCells
+         do k=1,nVertLevels
+            rho(k,iCell) = rho_zz(k,iCell) * zz(k,iCell)
+            theta(k,iCell) = t(k,iCell) / (1.0 + 1.61 * scalars(index_qv,k,iCell))
+         end do
+      end do
+
+      do iCell = 1, nCells
+         !land category
+         landmask(iCell) = 1
+         lu_index(iCell) = 1
+         xland(iCell) = 1.0
+         if (iCell == 1) call mpas_log_write(' landmask, lu_index, xland $i $i $r', intArgs=(/landmask(iCell),lu_index(iCell)/), realArgs=(/xland(iCell)/))
+      end do
+
+   end subroutine init_atm_case_les
+
+
+   real (kind=RKIND) function atm_get_sounding( variable, height )
+
+     implicit none
+     real (kind=RKIND), intent(in) :: height
+     character(len=*), intent(in) :: variable
+
+     atm_get_sounding = -999.
+! SAS case sounding
+
+     if(variable == 'u') then 
+        atm_get_sounding = 2.0  ! SAS value
+
+     else if (variable == 'v') then
+        atm_get_sounding = 0.
+
+     else if (variable == 'theta') then
+
+        if(height .le. 352.5) then
+          atm_get_sounding = 296.6
+        else if(height .le. 442.5) then
+          atm_get_sounding = 296.6 + (height-352.5)*1.5/90.
+        else
+          atm_get_sounding = 298.1 + (height-442.5)*0.003
+        end if
+
+     else if (variable == 'qv') then
+
+        if(height .le. 352.5) then
+          atm_get_sounding = 11.8/1000.
+        else if(height .le. 442.5) then
+          atm_get_sounding = 11.8/1000. - (height-352.5)*4.0/90./1000.
+        else
+          atm_get_sounding = max(7.8/1000. - (height-442.5)*0.004/1000.,0.0)
+        end if
+
+      end if 
+
+   end function atm_get_sounding
+
+
+   real (kind=RKIND) function atm_get_sounding_1( variable, height )
+
+     implicit none
+     real (kind=RKIND), intent(in) :: height
+     character(len=*), intent(in) :: variable
+
+     atm_get_sounding_1 = -999.
+
+     if(variable == 'u') then 
+        atm_get_sounding_1 = 15.0
+     else if (variable == 'v') then
+        atm_get_sounding_1 = 0.
+     else if (variable == 'qv') then
+
+        atm_get_sounding_1 = 0. ! dry sounding
+        ! atm_get_sounding_1 = 0.010
+        ! if(height .gt. 500.) atm_get_sounding_1 = 0.004
+
+     else if (variable == 'theta') then
+
+        if(height .le. 500.) then
+          atm_get_sounding_1 = 300.
+        else if(height .le. 600.) then
+          atm_get_sounding_1 = 300. + (height-500.)*3./100.
+        else
+          atm_get_sounding_1 = 303. + (height-600.)*3./1000.
+        end if
+        ! atm_get_sounding_1 = atm_get_sounding_1 - 10.0  ! for water case
+
+      end if 
+
+   end function atm_get_sounding_1
+
+   real (kind=RKIND) function atm_get_sounding_2( variable, height )
+
+     implicit none
+     real (kind=RKIND), intent(in) :: height
+     character(len=*), intent(in) :: variable
+
+     atm_get_sounding_2 = -999.
+
+     if(variable == 'u') then 
+        atm_get_sounding_2 = 10.0
+     else if (variable == 'v') then
+        atm_get_sounding_2 = 0.
+     else if (variable == 'qv') then
+
+        ! atm_get_sounding_2 = 0. ! dry sounding_2
+        atm_get_sounding_2 = 0.010
+        if(height .gt. 1000.) atm_get_sounding_2 = 0.004
+
+     else if (variable == 'theta') then
+
+        if(height .le. 1000.) then
+          atm_get_sounding_2 = 300.
+        else if(height .le. 1150.) then
+          atm_get_sounding_2 = 300. + (height-1000.)*8./150.
+        else
+          atm_get_sounding_2 = 308. + (height-1150.)*3./1000.
+        end if
+        atm_get_sounding_2 = atm_get_sounding_2 - 10.0  ! for water case
+
+      end if 
+
+   end function atm_get_sounding_2
+
+!-----------
 
    !-----------------------------------------------------------------------
    !  routine init_atm_case_cam_mpas
diff --git a/src/framework/Makefile b/src/framework/Makefile
index 2d8e7dc92b..64d3d6faf4 100644
--- a/src/framework/Makefile
+++ b/src/framework/Makefile
@@ -110,7 +110,7 @@ mpas_c_interfacing.o:
 xml_stream_parser.o: xml_stream_parser.c
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(CPPINCLUDES) -I../external/ezxml -c xml_stream_parser.c
 
-mpas_halo.o: mpas_derived_types.o mpas_pool_routines.o mpas_log.o
+mpas_halo.o: mpas_derived_types.o mpas_pool_routines.o mpas_log.o mpas_timer.o
 
 mpas_stream_inquiry.o : mpas_derived_types.o mpas_log.o mpas_c_interfacing.o
 
diff --git a/src/framework/mpas_dmpar.F b/src/framework/mpas_dmpar.F
index 6d68c0c656..5d9b48d53b 100644
--- a/src/framework/mpas_dmpar.F
+++ b/src/framework/mpas_dmpar.F
@@ -7448,19 +7448,28 @@ end subroutine mpas_dmpar_exch_group_end_halo_exch!}}}
 !>  exchange is complete.
 !
 !-----------------------------------------------------------------------
-   subroutine mpas_dmpar_exch_group_full_halo_exch(domain, groupName, iErr)!{{{
+   subroutine mpas_dmpar_exch_group_full_halo_exch(domain, groupName, withGPUAwareMPI, iErr)!{{{
 
       type (domain_type), intent(inout) :: domain
       character (len=*), intent(in) :: groupName
+      logical, optional, intent(in) :: withGPUAwareMPI
       integer, optional, intent(out) :: iErr
 
       type (mpas_exchange_group), pointer :: exchGroupPtr
       integer :: nLen
+      logical :: useGPUAwareMPI
 
       if ( present(iErr) ) then
          iErr = MPAS_DMPAR_NOERR
       end if
 
+      useGPUAwareMPI = .false.
+      if (present(withGPUAwareMPI)) then
+        if (withGPUAwareMPI) then
+           call mpas_log_write(' GPU-aware MPI not implemented in this module', MPAS_LOG_CRIT)
+        end if
+      end if
+
       nLen = len_trim(groupName)
       DMPAR_DEBUG_WRITE(' -- Trying to perform a full exchange for group ' // trim(groupName))
 
diff --git a/src/framework/mpas_halo.F b/src/framework/mpas_halo.F
index 4ab8817c23..8877d6330d 100644
--- a/src/framework/mpas_halo.F
+++ b/src/framework/mpas_halo.F
@@ -280,6 +280,30 @@ subroutine mpas_halo_exch_group_complete(domain, groupName, iErr)
 
         call refactor_lists(domain, groupName, iErr)
 
+        ! Logic to return early if there are no neighbors to send to
+        if ( newGroup% nGroupSendNeighbors <=0 ) then
+            return
+        end if
+
+
+        ! Always copy in the main data member first
+        !$acc enter data copyin(newGroup)
+        ! Then the data in the members of the type
+        !$acc enter data copyin(newGroup % recvBuf(:), newGroup % sendBuf(:))
+        ! !$acc enter data copyin(newGroup % sendBuf(:))
+        !$acc enter data copyin(newGroup % fields(:))
+        do i = 1, newGroup % nFields
+            !$acc enter data copyin(newGroup % fields(i))
+            !$acc enter data copyin(newGroup % fields(i) % nSendLists(:,:))
+            !$acc enter data copyin(newGroup % fields(i) % packOffsets(:))
+            !$acc enter data copyin(newGroup % fields(i) % sendListSrc(:,:,:))
+            !$acc enter data copyin(newGroup % fields(i) % sendListDst(:,:,:))
+            !$acc enter data copyin(newGroup % fields(i) % nRecvLists(:,:))
+            !$acc enter data copyin(newGroup % fields(i) % unpackOffsets(:))
+            !$acc enter data copyin(newGroup % fields(i) % recvListSrc(:,:,:))
+            !$acc enter data copyin(newGroup % fields(i) % recvListDst(:,:,:))
+        end do
+
     end subroutine mpas_halo_exch_group_complete
 
 
@@ -349,15 +373,26 @@ subroutine mpas_halo_exch_group_destroy(domain, groupName, iErr)
             deallocate(cursor % fields(i) % compactHaloInfo)
             deallocate(cursor % fields(i) % compactSendLists)
             deallocate(cursor % fields(i) % compactRecvLists)
+            !$acc exit data delete(cursor % fields(i) % nSendLists(:,:))
             deallocate(cursor % fields(i) % nSendLists)
+            !$acc exit data delete(cursor % fields(i) % sendListSrc(:,:,:))
             deallocate(cursor % fields(i) % sendListSrc)
+            !$acc exit data delete(cursor % fields(i) % sendListDst(:,:,:))
             deallocate(cursor % fields(i) % sendListDst)
+            !$acc exit data delete(cursor % fields(i) % packOffsets(:))
             deallocate(cursor % fields(i) % packOffsets)
+            !$acc exit data delete(cursor % fields(i) % nRecvLists(:,:))
             deallocate(cursor % fields(i) % nRecvLists)
+            !$acc exit data delete(cursor % fields(i) % recvListSrc(:,:,:))
             deallocate(cursor % fields(i) % recvListSrc)
+            !$acc exit data delete(cursor % fields(i) % recvListDst(:,:,:))
             deallocate(cursor % fields(i) % recvListDst)
+            !$acc exit data delete(cursor % fields(i) % unpackOffsets(:))
             deallocate(cursor % fields(i) % unpackOffsets)
+            !$acc exit data delete(cursor % fields(i))
         end do
+        ! Use finalize here in-case the copyins in ..._complete increment the reference counter
+        !$acc exit data finalize delete(cursor % fields(:))
         deallocate(cursor % fields)
         deallocate(cursor % groupPackOffsets)
         deallocate(cursor % groupSendNeighbors)
@@ -368,10 +403,14 @@ subroutine mpas_halo_exch_group_destroy(domain, groupName, iErr)
         deallocate(cursor % groupToFieldRecvIdx)
         deallocate(cursor % groupRecvOffsets)
         deallocate(cursor % groupRecvCounts)
+        !$acc exit data delete(cursor % sendBuf(:))
         deallocate(cursor % sendBuf)
+        !$acc exit data delete(cursor % recvBuf(:))
         deallocate(cursor % recvBuf)
         deallocate(cursor % sendRequests)
         deallocate(cursor % recvRequests)
+        ! Finalize here as well, just in-case
+        !$acc exit data finalize delete(cursor)
         deallocate(cursor)
 
     end subroutine mpas_halo_exch_group_destroy
@@ -495,7 +534,7 @@ end subroutine mpas_halo_exch_group_add_field
     !>  exchange group.
     !
     !-----------------------------------------------------------------------
-    subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
+    subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, withGPUAwareMPI, iErr)
 
 #ifdef MPAS_USE_MPI_F08
         use mpi_f08, only : MPI_Datatype, MPI_Comm
@@ -508,6 +547,7 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
         use mpas_derived_types, only : domain_type, mpas_halo_group, MPAS_HALO_REAL, MPAS_LOG_CRIT
         use mpas_pool_routines, only : mpas_pool_get_array
         use mpas_log, only : mpas_log_write
+        use mpas_timer, only : mpas_timer_start, mpas_timer_stop
 
         ! Parameters
 #ifdef MPAS_USE_MPI_F08
@@ -527,6 +567,7 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
         ! Arguments
         type (domain_type), intent(inout) :: domain
         character (len=*), intent(in) :: groupName
+        logical, optional, intent(in) :: withGPUAwareMPI
         integer, optional, intent(out) :: iErr
 
         ! Local variables
@@ -542,6 +583,7 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
         integer :: comm
 #endif
         integer :: mpi_ierr
+        logical:: useGPUAwareMPI
         type (mpas_halo_group), pointer :: group
         integer, dimension(:), pointer :: compactHaloInfo
         integer, dimension(:), pointer :: compactSendLists
@@ -554,12 +596,17 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
         integer :: maxNRecvList
         integer, dimension(:,:,:), CONTIGUOUS pointer :: recvListSrc, recvListDst
         integer, dimension(:), CONTIGUOUS pointer :: unpackOffsets
-
+        
 
         if (present(iErr)) then
            iErr = 0
         end if
 
+        useGPUAwareMPI = .false.
+        if (present(withGPUAwareMPI)) then
+           useGPUAwareMPI = withGPUAwareMPI
+        end if
+        
         !
         ! Find this halo exhange group in the list of groups
         !
@@ -577,6 +624,12 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                                 messageType=MPAS_LOG_CRIT)
         end if
 
+        ! Logic to return early if there are no neighbors to send to
+        if ( group% nGroupSendNeighbors <= 0 ) then
+            return
+        end if
+
+        call mpas_timer_start('full_halo_exch')
         !
         ! Get the rank of this task and the MPI communicator to use from the first field in
         ! the group; all fields should be using the same communicator, so this should not
@@ -589,6 +642,7 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
 #endif
         rank = group % fields(1) % compactHaloInfo(8)
 
+        !$acc data present(group % recvBuf(:), group % sendBuf(:)) if(useGPUAwareMPI)
 
         !
         ! Initiate non-blocking MPI receives for all neighbors
@@ -598,9 +652,11 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                 bufstart = group % groupRecvOffsets(i)
                 bufend = group % groupRecvOffsets(i) + group % groupRecvCounts(i) - 1
 !TO DO: how do we determine appropriate type here?
+                !$acc host_data use_device(group % recvBuf) if(useGPUAwareMPI)
                 call MPI_Irecv(group % recvBuf(bufstart:bufend), group % groupRecvCounts(i), MPI_REALKIND, &
                                group % groupRecvNeighbors(i), group % groupRecvNeighbors(i), comm, &
                                group % recvRequests(i), mpi_ierr)
+                !$acc end host_data
             else
                 group % recvRequests(i) = MPI_REQUEST_NULL
             end if
@@ -638,12 +694,16 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                 case (1)
                     call mpas_pool_get_array(domain % blocklist % allFields, trim(group % fields(i) % fieldName), &
                                              group % fields(i) % r1arr, timeLevel=group % fields(i) % timeLevel)
-
+                    
                     !
                     ! Pack send buffer for all neighbors for current field
                     !
+                    call mpas_timer_start('packing_halo_exch')
+                    !$acc parallel default(present) attach(group % fields(i) % r1arr) if(useGPUAwareMPI)
+                    !$acc loop gang collapse(2)
                     do iEndp = 1, nSendEndpts
                         do iHalo = 1, nHalos
+                            !$acc loop vector
                             do j = 1, maxNSendList
                                 if (j <= nSendLists(iHalo,iEndp)) then
                                     group % sendBuf(packOffsets(iEndp) + sendListDst(j,iHalo,iEndp)) = &
@@ -652,20 +712,31 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                             end do
                         end do
                     end do
-
+                    !$acc end parallel
+                    call mpas_timer_stop('packing_halo_exch')
+                    
                 !
                 ! Packing code for 2-d real-valued fields
                 !
                 case (2)
                     call mpas_pool_get_array(domain % blocklist % allFields, trim(group % fields(i) % fieldName), &
                                              group % fields(i) % r2arr, timeLevel=group % fields(i) % timeLevel)
-
+                    
                     !
                     ! Pack send buffer for all neighbors for current field
-                    !
+                    !                                             
+                    ! Use data regions for specificity and so the reference or attachment counters are easier to make sense of
+                    ! Present should also cause an attach action. OpenACC Spec2.7 Section 2.7.2 describes 'attach action'
+                    ! !$acc data present(group) present(group % fields(i)) present(group % sendBuf(:), group % fields(i) % sendListSrc(:,:,:))
+                    
+
+                    call mpas_timer_start('packing_halo_exch')
+                    !$acc parallel default(present) attach(group % fields(i) % r2arr) if(useGPUAwareMPI)
+                    !$acc loop gang collapse(3)
                     do iEndp = 1, nSendEndpts
                         do iHalo = 1, nHalos
                             do j = 1, maxNSendList
+                                !$acc loop vector
                                 do i1 = 1, dim1
                                     if (j <= nSendLists(iHalo,iEndp)) then
                                         group % sendBuf(packOffsets(iEndp) + dim1 * (sendListDst(j,iHalo,iEndp) - 1) + i1) = &
@@ -675,21 +746,27 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                             end do
                         end do
                     end do
-
+                    !$acc end parallel
+                    call mpas_timer_stop('packing_halo_exch')
+                    
                 !
                 ! Packing code for 3-d real-valued fields
                 !
                 case (3)
                     call mpas_pool_get_array(domain % blocklist % allFields, trim(group % fields(i) % fieldName), &
-                                             group % fields(i) % r3arr, group % fields(i) % timeLevel)
-
+                                             group % fields(i) % r3arr, group % fields(i) % timeLevel)                    
+                    
                     !
                     ! Pack send buffer for all neighbors for current field
                     !
+                    call mpas_timer_start('packing_halo_exch')
+                    !$acc parallel default(present) attach(group % fields(i) % r3arr) if(useGPUAwareMPI)
+                    !$acc loop gang collapse(4)
                     do iEndp = 1, nSendEndpts
                         do iHalo = 1, nHalos
                             do j = 1, maxNSendList
                                 do i2 = 1, dim2
+                                !$acc loop vector
                                 do i1 = 1, dim1
                                     if (j <= nSendLists(iHalo,iEndp)) then
                                         group % sendBuf(packOffsets(iEndp) + dim1*dim2*(sendListDst(j,iHalo,iEndp) - 1) &
@@ -701,6 +778,8 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                             end do
                         end do
                     end do
+                    !$acc end parallel
+                    call mpas_timer_stop('packing_halo_exch')
 
                 end select
             end if
@@ -714,9 +793,11 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                 bufstart = group % groupSendOffsets(i)
                 bufend = group % groupSendOffsets(i) + group % groupSendCounts(i) - 1
 !TO DO: how do we determine appropriate type here?
+                !$acc host_data use_device(group % sendBuf) if(useGPUAwareMPI)
                 call MPI_Isend(group % sendBuf(bufstart:bufend), group % groupSendCounts(i), MPI_REALKIND, &
                                group % groupSendNeighbors(i), rank, comm, &
                                group % sendRequests(i), mpi_ierr)
+                !$acc end host_data
             else
                 group % sendRequests(i) = MPI_REQUEST_NULL
             end if
@@ -771,7 +852,11 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                         !
                         ! Unpack recv buffer from all neighbors for current field
                         !
+                        call mpas_timer_start('unpacking_halo_exch')
+                        !$acc parallel default(present) attach(group % fields(i) % r1arr) if(useGPUAwareMPI)
+                        !$acc loop gang
                         do iHalo = 1, nHalos
+                            !$acc loop vector
                             do j = 1, maxNRecvList
                                 if (j <= nRecvLists(iHalo,iEndp)) then
                                     group % fields(i) % r1arr(recvListDst(j,iHalo,iEndp)) = &
@@ -779,6 +864,8 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                                 end if
                             end do
                         end do
+                        !$acc end parallel
+                        call mpas_timer_stop('unpacking_halo_exch')
 
                     !
                     ! Unpacking code for 2-d real-valued fields
@@ -787,8 +874,13 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                         !
                         ! Unpack recv buffer from all neighbors for current field
                         !
+                        call mpas_timer_start('unpacking_halo_exch')
+                        !$acc parallel default(present) attach(group % fields(i) % r2arr) if(useGPUAwareMPI)
+                        !$acc loop gang
                         do iHalo = 1, nHalos
+                            !$acc loop worker
                             do j = 1, maxNRecvList
+                                !$acc loop vector
                                 do i1 = 1, dim1
                                     if (j <= nRecvLists(iHalo,iEndp)) then
                                         group % fields(i) % r2arr(i1, recvListDst(j,iHalo,iEndp)) = &
@@ -797,6 +889,8 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                                 end do
                             end do
                         end do
+                        !$acc end parallel
+                        call mpas_timer_stop('unpacking_halo_exch')
 
                     !
                     ! Unpacking code for 3-d real-valued fields
@@ -805,8 +899,12 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                         !
                         ! Unpack recv buffer from all neighbors for current field
                         !
+                        call mpas_timer_start('unpacking_halo_exch')
+                        !$acc parallel default(present) attach(group % fields(i) % r3arr) if(useGPUAwareMPI)
+                        !$acc loop gang collapse(2)
                         do iHalo = 1, nHalos
                             do j = 1, maxNRecvList
+                                !$acc loop vector collapse(2)
                                 do i2 = 1, dim2
                                 do i1 = 1, dim1
                                     if (j <= nRecvLists(iHalo,iEndp)) then
@@ -818,12 +916,16 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
                                 end do
                             end do
                         end do
+                        !$acc end parallel
+                        call mpas_timer_stop('unpacking_halo_exch')
 
                     end select
                 end if
             end do
         end do
-
+        ! For the present(group % recvBuf(:), group % sendBuf(:))
+        !$acc end data
+        
         !
         ! Nullify array pointers - not necessary for correctness, but helpful when debugging
         ! to not leave pointers to what might later be incorrect targets
@@ -843,6 +945,8 @@ subroutine mpas_halo_exch_group_full_halo_exch(domain, groupName, iErr)
         !
         call MPI_Waitall(group % nGroupSendNeighbors, group % sendRequests, MPI_STATUSES_IGNORE, mpi_ierr)
 
+        call mpas_timer_stop('full_halo_exch')
+
     end subroutine mpas_halo_exch_group_full_halo_exch
 
 
diff --git a/src/operators/mpas_vector_reconstruction.F b/src/operators/mpas_vector_reconstruction.F
index 605da9cd6d..88d87474ab 100644
--- a/src/operators/mpas_vector_reconstruction.F
+++ b/src/operators/mpas_vector_reconstruction.F
@@ -202,7 +202,8 @@ end subroutine mpas_init_reconstruct!}}}
 !>  Input: grid meta data and vector component data residing at cell edges
 !>  Output: reconstructed vector field (measured in X,Y,Z) located at cell centers
 !-----------------------------------------------------------------------
-  subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uReconstructZ, uReconstructZonal, uReconstructMeridional, includeHalos)!{{{
+  subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uReconstructZ, &
+                          uReconstructZonal, uReconstructMeridional, includeHalos, lACC)!{{{
 
     implicit none
 
@@ -214,9 +215,11 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon
     real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructZonal !< Output: Zonal Component of velocity reconstructed to cell centers
     real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructMeridional !< Output: Meridional Component of velocity reconstructed to cell centers
     logical, optional, intent(in) :: includeHalos !< Input: Optional logical that allows reconstruction over halo regions
+    logical, optional, intent(in) :: lACC !< Input: Optional logical that controls execution on the GPU with OpenACC
 
     !   temporary arrays needed in the compute procedure
     logical :: includeHalosLocal
+    logical :: lACCLocal
     integer, pointer :: nCells_ptr, nVertLevels_ptr
     integer :: nCells, nVertLevels
     integer, dimension(:,:), pointer :: edgesOnCell
@@ -236,6 +239,12 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon
        includeHalosLocal = .false.
     end if
 
+    if ( present(lACC) ) then
+       lACCLocal = lACC
+    else
+       lACCLocal = .false.
+    end if
+
     ! stored arrays used during compute procedure
     call mpas_pool_get_array(meshPool, 'coeffs_reconstruct', coeffs_reconstruct)
 
@@ -258,19 +267,9 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon
 
     call mpas_pool_get_config(meshPool, 'on_a_sphere', on_a_sphere)
 
-    MPAS_ACC_TIMER_START('mpas_reconstruct_2d [ACC_data_xfer]')
-    ! Only use sections needed, nCells may be all cells or only non-halo cells
-    !$acc enter data copyin(coeffs_reconstruct(:,:,1:nCells),nEdgesOnCell(1:nCells), &
-    !$acc                   edgesOnCell(:,1:nCells),latCell(1:nCells),lonCell(1:nCells))
-    !$acc enter data copyin(u(:,:))
-    !$acc enter data create(uReconstructX(:,1:nCells),uReconstructY(:,1:nCells), &
-    !$acc                   uReconstructZ(:,1:nCells),uReconstructZonal(:,1:nCells), &
-    !$acc                   uReconstructMeridional(:,1:nCells))
-    MPAS_ACC_TIMER_STOP('mpas_reconstruct_2d [ACC_data_xfer]')
-
     ! loop over cell centers
     !$omp do schedule(runtime)
-    !$acc parallel default(present)
+    !$acc parallel default(present) if(lACCLocal)
     !$acc loop gang
     do iCell = 1, nCells
       ! initialize the reconstructed vectors
@@ -305,7 +304,7 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon
 
     if (on_a_sphere) then
       !$omp do schedule(runtime)
-      !$acc parallel default(present)
+      !$acc parallel default(present) if(lACCLocal)
       !$acc loop gang
       do iCell = 1, nCells
         clat = cos(latCell(iCell))
@@ -325,7 +324,7 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon
       !$omp end do
     else
       !$omp do schedule(runtime)
-      !$acc parallel default(present)
+      !$acc parallel default(present) if(lACCLocal)
       !$acc loop gang vector collapse(2)
       do iCell = 1, nCells
         do k = 1, nVertLevels
@@ -337,6 +336,109 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon
       !$omp end do
     end if
 
+  end subroutine mpas_reconstruct_2d!}}}
+
+
+  subroutine mpas_reconstruct_2d_h2d(meshPool, u, uReconstructX, uReconstructY, uReconstructZ, uReconstructZonal, uReconstructMeridional, includeHalos)!{{{
+
+    implicit none
+
+    type (mpas_pool_type), intent(in) :: meshPool !< Input: Mesh information
+    real (kind=RKIND), dimension(:,:), intent(in) :: u !< Input: Velocity field on edges
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructX !< Output: X Component of velocity reconstructed to cell centers
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructY !< Output: Y Component of velocity reconstructed to cell centers
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructZ !< Output: Z Component of velocity reconstructed to cell centers
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructZonal !< Output: Zonal Component of velocity reconstructed to cell centers
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructMeridional !< Output: Meridional Component of velocity reconstructed to cell centers
+    logical, optional, intent(in) :: includeHalos !< Input: Optional logical that allows reconstruction over halo regions
+
+    logical :: includeHalosLocal
+    integer, dimension(:,:), pointer :: edgesOnCell
+    integer, dimension(:), pointer :: nEdgesOnCell
+    integer :: nCells
+    integer, pointer :: nCells_ptr
+    real(kind=RKIND), dimension(:), pointer :: latCell, lonCell
+    real (kind=RKIND), dimension(:,:,:), pointer :: coeffs_reconstruct
+
+    if ( present(includeHalos) ) then
+       includeHalosLocal = includeHalos
+    else
+       includeHalosLocal = .false.
+    end if
+
+    ! stored arrays used during compute procedure
+    call mpas_pool_get_array(meshPool, 'coeffs_reconstruct', coeffs_reconstruct)
+
+    ! temporary variables
+    call mpas_pool_get_array(meshPool, 'nEdgesOnCell', nEdgesOnCell)
+    call mpas_pool_get_array(meshPool, 'edgesOnCell', edgesOnCell)
+    call mpas_pool_get_array(meshPool, 'latCell', latCell)
+    call mpas_pool_get_array(meshPool, 'lonCell', lonCell)
+
+    if ( includeHalosLocal ) then
+       call mpas_pool_get_dimension(meshPool, 'nCells', nCells_ptr)
+    else
+       call mpas_pool_get_dimension(meshPool, 'nCellsSolve', nCells_ptr)
+    end if
+    nCells = nCells_ptr
+
+    MPAS_ACC_TIMER_START('mpas_reconstruct_2d [ACC_data_xfer]')
+    ! Only use sections needed, nCells may be all cells or only non-halo cells
+    !$acc enter data copyin(coeffs_reconstruct(:,:,1:nCells),nEdgesOnCell(1:nCells), &
+    !$acc                   edgesOnCell(:,1:nCells),latCell(1:nCells),lonCell(1:nCells))
+    !$acc enter data copyin(u(:,:))
+    !$acc enter data create(uReconstructX(:,1:nCells),uReconstructY(:,1:nCells), &
+    !$acc                   uReconstructZ(:,1:nCells),uReconstructZonal(:,1:nCells), &
+    !$acc                   uReconstructMeridional(:,1:nCells))
+    MPAS_ACC_TIMER_STOP('mpas_reconstruct_2d [ACC_data_xfer]')
+
+  end subroutine mpas_reconstruct_2d_h2d
+
+
+
+  subroutine mpas_reconstruct_2d_d2h(meshPool, u, uReconstructX, uReconstructY, uReconstructZ, uReconstructZonal, uReconstructMeridional, includeHalos)!{{{
+
+    implicit none
+
+    type (mpas_pool_type), intent(in) :: meshPool !< Input: Mesh information
+    real (kind=RKIND), dimension(:,:), intent(in) :: u !< Input: Velocity field on edges
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructX !< Output: X Component of velocity reconstructed to cell centers
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructY !< Output: Y Component of velocity reconstructed to cell centers
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructZ !< Output: Z Component of velocity reconstructed to cell centers
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructZonal !< Output: Zonal Component of velocity reconstructed to cell centers
+    real (kind=RKIND), dimension(:,:), intent(out) :: uReconstructMeridional !< Output: Meridional Component of velocity reconstructed to cell centers
+    logical, optional, intent(in) :: includeHalos !< Input: Optional logical that allows reconstruction over halo regions
+
+    logical :: includeHalosLocal
+    integer, dimension(:,:), pointer :: edgesOnCell
+    integer, dimension(:), pointer :: nEdgesOnCell
+    integer :: nCells
+    integer, pointer :: nCells_ptr
+    real(kind=RKIND), dimension(:), pointer :: latCell, lonCell
+    real (kind=RKIND), dimension(:,:,:), pointer :: coeffs_reconstruct
+
+    if ( present(includeHalos) ) then
+       includeHalosLocal = includeHalos
+    else
+       includeHalosLocal = .false.
+    end if
+
+    ! stored arrays used during compute procedure
+    call mpas_pool_get_array(meshPool, 'coeffs_reconstruct', coeffs_reconstruct)
+
+    ! temporary variables
+    call mpas_pool_get_array(meshPool, 'nEdgesOnCell', nEdgesOnCell)
+    call mpas_pool_get_array(meshPool, 'edgesOnCell', edgesOnCell)
+    call mpas_pool_get_array(meshPool, 'latCell', latCell)
+    call mpas_pool_get_array(meshPool, 'lonCell', lonCell)
+
+    if ( includeHalosLocal ) then
+       call mpas_pool_get_dimension(meshPool, 'nCells', nCells_ptr)
+    else
+       call mpas_pool_get_dimension(meshPool, 'nCellsSolve', nCells_ptr)
+    end if
+    nCells = nCells_ptr
+
     MPAS_ACC_TIMER_START('mpas_reconstruct_2d [ACC_data_xfer]')
     !$acc exit data delete(coeffs_reconstruct(:,:,1:nCells),nEdgesOnCell(1:nCells), &
     !$acc                   edgesOnCell(:,1:nCells),latCell(1:nCells),lonCell(1:nCells))
@@ -346,7 +448,7 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon
     !$acc                   uReconstructMeridional(:,1:nCells))
     MPAS_ACC_TIMER_STOP('mpas_reconstruct_2d [ACC_data_xfer]')
 
-  end subroutine mpas_reconstruct_2d!}}}
+  end subroutine mpas_reconstruct_2d_d2h
 
 
 !***********************************************************************