From 80fd4710d83c83ede368457aa5aae85e62524658 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 8 Dec 2025 22:59:33 +0800 Subject: [PATCH 01/13] feat(tddft): Add CUDA acceleration for snap_psibeta with constant memory grids - Implement GPU-accelerated snap_psibeta_neighbor_batch_kernel - Use constant memory for Lebedev and Gauss-Legendre integration grids - Add multi-GPU support via set_device_by_rank - Initialize/finalize GPU resources in each calculate_HR call - Remove static global variables for cleaner resource management - CPU fallback when GPU processing fails --- .../module_operator_lcao/td_nonlocal_lcao.cpp | 116 +++-- source/source_lcao/module_rt/CMakeLists.txt | 7 + .../kernels/cuda/snap_psibeta_gpu.cu | 351 +++++++++++++ .../kernels/cuda/snap_psibeta_kernel.cu | 460 ++++++++++++++++++ .../kernels/cuda/snap_psibeta_kernel.cuh | 188 +++++++ .../module_rt/kernels/snap_psibeta_gpu.h | 73 +++ 6 files changed, 1161 insertions(+), 34 deletions(-) create mode 100644 source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu create mode 100644 source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu create mode 100644 source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh create mode 100644 source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp index a5eba57688..cefe006150 100644 --- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp @@ -1,16 +1,21 @@ #include "td_nonlocal_lcao.h" -#include "source_io/module_parameter/parameter.h" #include "source_base/timer.h" #include "source_base/tool_title.h" #include "source_cell/module_neighbor/sltk_grid_driver.h" -#include "source_lcao/module_operator_lcao/operator_lcao.h" +#include "source_io/module_parameter/parameter.h" #include "source_lcao/module_hcontainer/hcontainer_funcs.h" +#include "source_lcao/module_operator_lcao/operator_lcao.h" #include "source_lcao/module_rt/snap_psibeta_half_tddft.h" +#ifdef __CUDA +#include "source_base/module_device/device.h" +#include "source_lcao/module_rt/kernels/snap_psibeta_gpu.h" +#endif + #include "source_pw/module_pwdft/global.h" #ifdef _OPENMP -#include #include +#include #endif template @@ -127,6 +132,16 @@ void hamilt::TDNonlocal>::calculate_HR() ModuleBase::TITLE("TDNonlocal", "calculate_HR"); ModuleBase::timer::tick("TDNonlocal", "calculate_HR"); +#ifdef __CUDA + // Initialize GPU resources before entering the computation loop + // Use set_device_by_rank for multi-GPU support + int dev_id = 0; +#ifdef __MPI + dev_id = base_device::information::set_device_by_rank(MPI_COMM_WORLD); +#endif + module_rt::gpu::initialize_gpu_resources(); +#endif + const Parallel_Orbitals* paraV = this->hR_tmp->get_atom_pair(0).get_paraV(); const int npol = this->ucell->get_npol(); const int nlm_dim = TD_info::out_current ? 
4 : 1; @@ -145,9 +160,12 @@ void hamilt::TDNonlocal>::calculate_HR() nlm_tot[i].resize(nlm_dim); } - #pragma omp parallel +#pragma omp parallel { - #pragma omp for schedule(dynamic) +#ifdef __CUDA + cudaSetDevice(dev_id); +#endif +#pragma omp for schedule(dynamic) for (int ad = 0; ad < adjs.adj_num + 1; ++ad) { const int T1 = adjs.ntype[ad]; @@ -160,31 +178,57 @@ void hamilt::TDNonlocal>::calculate_HR() all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); std::sort(all_indexes.begin(), all_indexes.end()); all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); - for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) + +#ifdef __CUDA + // GPU path: batch process all orbitals for this neighbor + // Use GPU when there are enough orbitals to benefit from parallelism + constexpr int GPU_THRESHOLD = 4; + bool gpu_success = false; + if (all_indexes.size() / npol >= GPU_THRESHOLD) + { + gpu_success = module_rt::gpu::snap_psibeta_neighbor_batch_gpu(orb_, + this->ucell->infoNL, + T1, + tau1 * this->ucell->lat0, + atom1, + all_indexes, + npol, + T0, + tau0 * this->ucell->lat0, + cart_At, + nlm_tot[ad], + TD_info::out_current); + } + if (!gpu_success) +#endif { - const int iw1 = all_indexes[iw1l] / npol; - std::vector>> nlm; - // nlm is a vector of vectors, but size of outer vector is only 1 when out_current is false - // and size of outer vector is 4 when out_current is true (3 for , 1 for - // ) inner loop : all projectors (L0,M0) - - // snap_psibeta_half_tddft() are used to calculate - // and as well if current are needed - module_rt::snap_psibeta_half_tddft(orb_, - this->ucell->infoNL, - nlm, - tau1 * this->ucell->lat0, - T1, - atom1->iw2l[iw1], - atom1->iw2m[iw1], - atom1->iw2n[iw1], - tau0 * this->ucell->lat0, - T0, - cart_At, - TD_info::out_current); - for (int dir = 0; dir < nlm_dim; dir++) + // CPU path: loop over orbitals + for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { - nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + const int iw1 = all_indexes[iw1l] / npol; + std::vector>> nlm; + // nlm is a vector of vectors, but size of outer vector is only 1 when out_current is false + // and size of outer vector is 4 when out_current is true (3 for , 1 + // for ) inner loop : all projectors (L0,M0) + + // snap_psibeta_half_tddft() are used to calculate + // and as well if current are needed + module_rt::snap_psibeta_half_tddft(orb_, + this->ucell->infoNL, + nlm, + tau1 * this->ucell->lat0, + T1, + atom1->iw2l[iw1], + atom1->iw2m[iw1], + atom1->iw2n[iw1], + tau0 * this->ucell->lat0, + T0, + cart_At, + TD_info::out_current); + for (int dir = 0; dir < nlm_dim; dir++) + { + nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + } } } } @@ -205,7 +249,7 @@ void hamilt::TDNonlocal>::calculate_HR() const int thread_id = omp_get_thread_num(); std::set ad_atom_set_thread; int i = 0; - for(const auto iat1 : ad_atom_set) + for (const auto iat1: ad_atom_set) { if (i % num_threads == thread_id) { @@ -228,7 +272,7 @@ void hamilt::TDNonlocal>::calculate_HR() continue; } #endif - + const ModuleBase::Vector3& R_index1 = adjs.box[ad1]; for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2) { @@ -250,8 +294,8 @@ void hamilt::TDNonlocal>::calculate_HR() for (int i = 0; i < 3; i++) { tmp_c[i] = TD_info::td_vel_op->get_current_term_pointer(i) - ->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]) - ->get_pointer(); + ->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]) + ->get_pointer(); } this->cal_HR_IJR(iat1, 
iat2, @@ -279,10 +323,16 @@ void hamilt::TDNonlocal>::calculate_HR() } } +#ifdef __CUDA + // Release GPU resources at end of calculation + module_rt::gpu::finalize_gpu_resources(); +#endif + ModuleBase::timer::tick("TDNonlocal", "calculate_HR"); } // cal_HR_IJR() + template void hamilt::TDNonlocal>::cal_HR_IJR( const int& iat1, @@ -396,7 +446,6 @@ void hamilt::TDNonlocal>::set_HR_fixed(void* hR_tmp this->allocated = false; } - // contributeHR() template void hamilt::TDNonlocal>::contributeHR() @@ -436,7 +485,6 @@ void hamilt::TDNonlocal>::contributeHR() return; } - template void hamilt::TDNonlocal>::contributeHk(int ik) { diff --git a/source/source_lcao/module_rt/CMakeLists.txt b/source/source_lcao/module_rt/CMakeLists.txt index cca79af03a..35a65a5aa7 100644 --- a/source/source_lcao/module_rt/CMakeLists.txt +++ b/source/source_lcao/module_rt/CMakeLists.txt @@ -18,6 +18,13 @@ if(ENABLE_LCAO) boundary_fix.cpp ) + if(USE_CUDA) + list(APPEND objects + kernels/cuda/snap_psibeta_kernel.cu + kernels/cuda/snap_psibeta_gpu.cu + ) + endif() + add_library( tddft OBJECT diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu new file mode 100644 index 0000000000..c9eafe1e71 --- /dev/null +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -0,0 +1,351 @@ +#include "../snap_psibeta_gpu.h" +#include "snap_psibeta_kernel.cuh" + +#include +#include +#include + +namespace module_rt +{ +namespace gpu +{ + +//============================================================================= +// CUDA Error Checking Macro +//============================================================================= + +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) \ + { \ + fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + return false; \ + } \ + } while (0) + +//============================================================================= +// Resource Management - Simplified (using constant memory for grids) +//============================================================================= + +void initialize_gpu_resources() +{ + // Check for CUDA device + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) + { + fprintf(stderr, "[CUDA] No CUDA devices found or error getting device count!\n"); + return; + } + + // Copy integration grids (Lebedev and Gauss-Legendre) to constant memory + copy_grids_to_device(); + + // Sync to ensure all initialization is complete + cudaDeviceSynchronize(); +} + +void finalize_gpu_resources() +{ + // No explicit cleanup needed - constant memory is managed by CUDA runtime + // Just reset any CUDA errors that may have accumulated + cudaGetLastError(); +} + +//============================================================================= +// Main GPU Interface Function +//============================================================================= + +bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, + const InfoNonlocal& infoNL_, + const int T1, + const ModuleBase::Vector3& R1, + const Atom* atom1, + const std::vector& all_indexes, + const int npol, + const int T0, + const ModuleBase::Vector3& R0, + const ModuleBase::Vector3& A, + std::vector>>>& nlm_all, + const bool calc_r) +{ + + // Get projector info + const int nproj = infoNL_.nproj[T0]; + if (nproj == 0) + { + return true; + } + + // Calculate number of orbitals + int 
num_orbitals = all_indexes.size() / npol; + if (num_orbitals == 0) + { + return true; + } + + // Calculate natomwfc (total projector components) + int natomwfc = 0; + std::vector proj_m0_offset_h(nproj); + for (int ip = 0; ip < nproj; ip++) + { + proj_m0_offset_h[ip] = natomwfc; + int L0 = infoNL_.Beta[T0].Proj[ip].getL(); + + // Check L0 against MAX_L + if (L0 > MAX_L) + { + fprintf(stderr, "[CUDA] Warning: L0=%d exceeds MAX_L=%d, falling back to CPU\n", L0, MAX_L); + return false; + } + + natomwfc += 2 * L0 + 1; + } + + int nlm_dim = calc_r ? 4 : 1; + + // Resize output + if (nlm_all.size() != nlm_dim) + { + nlm_all.resize(nlm_dim); + } + + //========================================================================= + // Prepare Host Data + //========================================================================= + + // Orbital data + std::vector orbitals_h(num_orbitals); + + // Calculate total psi size and offsets + std::vector psi_offsets_h(num_orbitals); + int total_psi_size = 0; + + for (int i = 0; i < num_orbitals; i++) + { + int iw1l = i * npol; + int iw1 = all_indexes[iw1l] / npol; + + int L1 = atom1->iw2l[iw1]; + int m1 = atom1->iw2m[iw1]; + int N1 = atom1->iw2n[iw1]; + + // Check L1 against MAX_L + if (L1 > MAX_L) + { + fprintf(stderr, "[CUDA] Warning: L1=%d exceeds MAX_L=%d, falling back to CPU\n", L1, MAX_L); + return false; + } + + const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1); + + orbitals_h[i].L1 = L1; + orbitals_h[i].m1 = m1; + orbitals_h[i].N1 = N1; + orbitals_h[i].iw_index = all_indexes[iw1l]; + orbitals_h[i].psi_offset = total_psi_size; + orbitals_h[i].psi_mesh = phi_ln.getNr(); + orbitals_h[i].psi_dk = phi_ln.getDk(); + orbitals_h[i].psi_rcut = phi_ln.getRcut(); + + psi_offsets_h[i] = total_psi_size; + total_psi_size += phi_ln.getNr(); + } + + // Copy psi radial data + std::vector psi_radial_h(total_psi_size); + for (int i = 0; i < num_orbitals; i++) + { + int iw1l = i * npol; + int iw1 = all_indexes[iw1l] / npol; + + int L1 = atom1->iw2l[iw1]; + int N1 = atom1->iw2n[iw1]; + + const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1); + int mesh = phi_ln.getNr(); + const double* psi = phi_ln.getPsi(); + + for (int j = 0; j < mesh; j++) + { + psi_radial_h[orbitals_h[i].psi_offset + j] = psi[j]; + } + } + + // Projector data + std::vector projectors_h(nproj); + std::vector beta_offsets_h(nproj); + int total_beta_size = 0; + + for (int ip = 0; ip < nproj; ip++) + { + const auto& proj = infoNL_.Beta[T0].Proj[ip]; + + projectors_h[ip].L0 = proj.getL(); + projectors_h[ip].beta_offset = total_beta_size; + projectors_h[ip].beta_mesh = proj.getNr(); + projectors_h[ip].beta_dk = proj.getDk(); + projectors_h[ip].beta_rcut = proj.getRcut(); + + // Get r_min and r_max from radial grid + const double* radial = proj.getRadial(); + int mesh = proj.getNr(); + projectors_h[ip].r_min = radial[0]; + projectors_h[ip].r_max = radial[mesh - 1]; + + beta_offsets_h[ip] = total_beta_size; + total_beta_size += mesh; + } + + // Copy beta radial data + std::vector beta_radial_h(total_beta_size); + for (int ip = 0; ip < nproj; ip++) + { + const auto& proj = infoNL_.Beta[T0].Proj[ip]; + int mesh = proj.getNr(); + const double* beta_r = proj.getBeta_r(); + + for (int j = 0; j < mesh; j++) + { + beta_radial_h[projectors_h[ip].beta_offset + j] = beta_r[j]; + } + } + + //========================================================================= + // Allocate Device Memory + //========================================================================= + + OrbitalData* orbitals_d = nullptr; + ProjectorData* 
projectors_d = nullptr; + double* psi_radial_d = nullptr; + double* beta_radial_d = nullptr; + int* proj_m0_offset_d = nullptr; + cuDoubleComplex* nlm_out_d = nullptr; + + size_t output_size = num_orbitals * nlm_dim * natomwfc; + + CUDA_CHECK(cudaMalloc(&orbitals_d, num_orbitals * sizeof(OrbitalData))); + CUDA_CHECK(cudaMalloc(&projectors_d, nproj * sizeof(ProjectorData))); + CUDA_CHECK(cudaMalloc(&psi_radial_d, total_psi_size * sizeof(double))); + CUDA_CHECK(cudaMalloc(&beta_radial_d, total_beta_size * sizeof(double))); + CUDA_CHECK(cudaMalloc(&proj_m0_offset_d, nproj * sizeof(int))); + CUDA_CHECK(cudaMalloc(&nlm_out_d, output_size * sizeof(cuDoubleComplex))); + + //========================================================================= + // Copy Data to Device + //========================================================================= + + CUDA_CHECK(cudaMemcpy(orbitals_d, orbitals_h.data(), num_orbitals * sizeof(OrbitalData), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(projectors_d, projectors_h.data(), nproj * sizeof(ProjectorData), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(psi_radial_d, psi_radial_h.data(), total_psi_size * sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(beta_radial_d, beta_radial_h.data(), total_beta_size * sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(proj_m0_offset_d, proj_m0_offset_h.data(), nproj * sizeof(int), cudaMemcpyHostToDevice)); + + // Initialize output to zero + CUDA_CHECK(cudaMemset(nlm_out_d, 0, output_size * sizeof(cuDoubleComplex))); + + //========================================================================= + // Launch Kernel + //========================================================================= + + double3 R1_d3 = make_double3(R1.x, R1.y, R1.z); + double3 R0_d3 = make_double3(R0.x, R0.y, R0.z); + double3 A_d3 = make_double3(A.x, A.y, A.z); + + dim3 grid(num_orbitals, nproj, 1); + dim3 block(BLOCK_SIZE, 1, 1); + + snap_psibeta_neighbor_batch_kernel<<>>(R1_d3, + R0_d3, + A_d3, + orbitals_d, + projectors_d, + psi_radial_d, + beta_radial_d, + proj_m0_offset_d, + num_orbitals, + nproj, + natomwfc, + nlm_dim, + nlm_out_d); + + // Check for kernel errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + fprintf(stderr, "[CUDA] Kernel launch error: %s\n", cudaGetErrorString(err)); + // Cleanup and return + cudaFree(orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + return false; + } + + CUDA_CHECK(cudaDeviceSynchronize()); + + // Check for errors after synchronization + err = cudaGetLastError(); + if (err != cudaSuccess) + { + fprintf(stderr, "[CUDA] Kernel execution error: %s\n", cudaGetErrorString(err)); + cudaFree(orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + return false; + } + + //========================================================================= + // Copy Results Back + //========================================================================= + + std::vector nlm_out_h(output_size); + CUDA_CHECK(cudaMemcpy(nlm_out_h.data(), nlm_out_d, output_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost)); + + // Reconstruct output into nlm_all format + for (int i = 0; i < num_orbitals; i++) + { + int iw_index = orbitals_h[i].iw_index; + + // Create nlm vectors for this orbital + std::vector>> nlm(nlm_dim); + for (int d = 0; d < nlm_dim; d++) + { + 
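// ---- [editor's sketch, not part of the patch] -------------------------------
// The device buffer nlm_out is flattened orbital-major, then direction, then
// projector component; the idx computation a few lines below follows this layout.
// A hypothetical host-side helper expressing the same indexing (name
// flat_nlm_index is not in the patch):
static inline size_t flat_nlm_index(int orb, int dir, int comp, int nlm_dim, int natomwfc)
{
    // orbital-major, then direction (1 or 4 of them), then the natomwfc projector components
    return static_cast<size_t>(orb) * nlm_dim * natomwfc
         + static_cast<size_t>(dir) * natomwfc
         + static_cast<size_t>(comp);
}
// e.g. nlm_out_h[flat_nlm_index(i, d, k, nlm_dim, natomwfc)] is the (d, k) entry of orbital i.
// ---- [end of editor's sketch] ------------------------------------------------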
nlm[d].resize(natomwfc); + for (int k = 0; k < natomwfc; k++) + { + size_t idx = i * nlm_dim * natomwfc + d * natomwfc + k; + nlm[d][k] = std::complex(nlm_out_h[idx].x, nlm_out_h[idx].y); + } + } + + // Insert into output maps + for (int dir = 0; dir < nlm_dim; dir++) + { + nlm_all[dir].insert({iw_index, nlm[dir]}); + } + } + + //========================================================================= + // Cleanup + //========================================================================= + + cudaFree(orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + return true; +} + +} // namespace gpu +} // namespace module_rt diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu new file mode 100644 index 0000000000..1e752d6d86 --- /dev/null +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -0,0 +1,460 @@ +#include "snap_psibeta_kernel.cuh" +#include "source_base/math_integral.h" + +#include +#include + +namespace module_rt +{ +namespace gpu +{ + +//============================================================================= +// Constant Memory for Integration Grids +//============================================================================= + +// Lebedev-Laikov angular grid (110 points) +__constant__ double d_lebedev_x[ANGULAR_GRID_NUM]; +__constant__ double d_lebedev_y[ANGULAR_GRID_NUM]; +__constant__ double d_lebedev_z[ANGULAR_GRID_NUM]; +__constant__ double d_lebedev_w[ANGULAR_GRID_NUM]; + +// Gauss-Legendre radial grid (140 points) +__constant__ double d_gl_x[RADIAL_GRID_NUM]; +__constant__ double d_gl_w[RADIAL_GRID_NUM]; + +//============================================================================= +// Spherical Harmonics Implementation +//============================================================================= + +/** + * @brief Compute real spherical harmonics Y_lm + * Based on the recursive formula used in ModuleBase::Ylm + */ +__device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm) +{ + const double PI = 3.14159265358979323846; + const double FOUR_PI = 4.0 * PI; + const double SQRT2 = 1.41421356237309504880; + + // Y_00 + ylm[0] = 0.5 * sqrt(1.0 / PI); + + if (L == 0) + { + return; + } + + // Compute theta and phi + double rxy = sqrt(x * x + y * y); + double r = sqrt(x * x + y * y + z * z); + + double cost, phi; + if (r < 1e-10) + { + cost = 1.0; + phi = 0.0; + } + else + { + cost = z / r; + if (rxy < 1e-10) + { + phi = 0.0; + } + else if (x > 1e-10) + { + phi = atan(y / x); + } + else if (x < -1e-10) + { + phi = atan(y / x) + PI; + } + else + { + phi = (y >= 0.0) ? 
PI / 2.0 : -PI / 2.0; + } + } + + double sint = sqrt(1.0 - cost * cost); + if (sint < 0.0) + { + sint = 0.0; + } + + // Associated Legendre polynomials P_l^m + double p[MAX_L + 1][MAX_L + 1]; + + for (int l = 0; l <= L; l++) + { + for (int m = 0; m <= L; m++) + { + p[l][m] = 0.0; + } + } + + p[0][0] = 1.0; + + if (L >= 1) + { + p[1][0] = cost; + p[1][1] = -sint; + } + + for (int l = 2; l <= L; l++) + { + for (int m = 0; m <= l - 2; m++) + { + p[l][m] = ((2 * l - 1) * cost * p[l - 1][m] - (l - 1 + m) * p[l - 2][m]) / (double)(l - m); + } + p[l][l - 1] = (2 * l - 1) * cost * p[l - 1][l - 1]; + double fact = 1.0; + for (int i = 1; i <= 2 * l - 1; i += 2) + { + fact *= i; + } + double sint_pow = 1.0; + for (int i = 0; i < l; i++) + { + sint_pow *= sint; + } + p[l][l] = fact * sint_pow; + if (l % 2 == 1) + { + p[l][l] = -p[l][l]; + } + } + + // Compute Y_lm + int lm = 0; + for (int l = 0; l <= L; l++) + { + double c = sqrt((2.0 * l + 1.0) / FOUR_PI); + + ylm[lm] = c * p[l][0]; + lm++; + + for (int m = 1; m <= l; m++) + { + double fact_ratio = 1.0; + for (int i = l - m + 1; i <= l + m; i++) + { + fact_ratio *= i; + } + double same = c * sqrt(1.0 / fact_ratio) * SQRT2; + + ylm[lm] = same * p[l][m] * cos(m * phi); + lm++; + + ylm[lm] = same * p[l][m] * sin(m * phi); + lm++; + } + } +} + +//============================================================================= +// Warp Reduction for double +//============================================================================= + +__device__ __forceinline__ double warp_reduce_sum(double val) +{ + for (int offset = 16; offset > 0; offset /= 2) + { + val += __shfl_down_sync(0xffffffff, val, offset); + } + return val; +} + +//============================================================================= +// Main Kernel Implementation - Memory Efficient Version +//============================================================================= + +__global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, + double3 R0, + double3 A, + const OrbitalData* __restrict__ orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, + int num_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out) +{ + + int orb_idx = blockIdx.x; + int proj_idx = blockIdx.y; + int tid = threadIdx.x; + + if (orb_idx >= num_orbitals || proj_idx >= nproj) + { + return; + } + + // Load orbital and projector data + OrbitalData orb = orbitals[orb_idx]; + ProjectorData proj = projectors[proj_idx]; + + int L1 = orb.L1; + int m1 = orb.m1; + int L0 = proj.L0; + int num_m0 = 2 * L0 + 1; + + // Distance check + double3 dRa = make_double3(R0.x - R1.x, R0.y - R1.y, R0.z - R1.z); + double distance = sqrt(dRa.x * dRa.x + dRa.y * dRa.y + dRa.z * dRa.z); + + // Check for NaN + if (isnan(distance)) + { + return; + } + + double Rcut_sum = orb.psi_rcut + proj.beta_rcut; + + if (distance > Rcut_sum) + { + // Zero output + if (tid == 0) + { + int m0_offset = proj_m0_offset[proj_idx]; + int out_base = orb_idx * nlm_dim * natomwfc; + for (int m0 = 0; m0 < num_m0; m0++) + { + for (int d = 0; d < nlm_dim; d++) + { + nlm_out[out_base + d * natomwfc + m0_offset + m0] = make_cuDoubleComplex(0.0, 0.0); + } + } + } + return; + } + + // Reduced shared memory: only for reduction (2 doubles per thread * BLOCK_SIZE) + __shared__ double s_temp_re[BLOCK_SIZE]; + __shared__ double s_temp_im[BLOCK_SIZE]; + + // Radial grid mapping + double xl = (proj.r_max - 
proj.r_min) * 0.5; + double xmean = (proj.r_max + proj.r_min) * 0.5; + + // Precompute A dot R0 + double A_phase_R0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; + cuDoubleComplex exp_iAR0 = cu_exp_i(A_phase_R0); + + // Result accumulators in registers - one per m0 component + double result_re[MAX_YLM_SIZE]; + double result_im[MAX_YLM_SIZE]; + double result_r_re[3][MAX_YLM_SIZE]; + double result_r_im[3][MAX_YLM_SIZE]; + + // Initialize accumulators + for (int m0 = 0; m0 < num_m0; m0++) + { + result_re[m0] = 0.0; + result_im[m0] = 0.0; + if (nlm_dim == 4) + { + for (int d = 0; d < 3; d++) + { + result_r_re[d][m0] = 0.0; + result_r_im[d][m0] = 0.0; + } + } + } + + // Radial loop - using constant memory for Gauss-Legendre grid + for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) + { + double r_val = xmean + xl * d_gl_x[ir]; + double w_rad = xl * d_gl_w[ir]; + + // Angular loop - each thread handles multiple angular points + for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) + { + double leb_x = d_lebedev_x[ian]; + double leb_y = d_lebedev_y[ian]; + double leb_z = d_lebedev_z[ian]; + double w_ang = d_lebedev_w[ian]; + + double rx = r_val * leb_x; + double ry = r_val * leb_y; + double rz = r_val * leb_z; + + double tx = rx + dRa.x; + double ty = ry + dRa.y; + double tz = rz + dRa.z; + double tnorm = sqrt(tx * tx + ty * ty + tz * tz); + + if (tnorm <= orb.psi_rcut) + { + // Compute Y_L1 + double ylm1[MAX_YLM_SIZE]; + if (tnorm > 1e-10) + { + double inv_tnorm = 1.0 / tnorm; + compute_ylm_gpu(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); + } + else + { + compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); + } + + // Compute Y_L0 + double ylm0[MAX_YLM_SIZE]; + compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); + + // Interpolate psi + double psi_val + = interpolate_radial_gpu(psi_radial + orb.psi_offset, orb.psi_mesh, 1.0 / orb.psi_dk, tnorm); + + // Phase factor + double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + double phase = r_val * A_dot_leb; + cuDoubleComplex exp_iAr = cu_exp_i(phase); + + // Interpolate beta + double beta_val + = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); + + // Y_L1m1 + int offset_L1 = L1 * L1 + m1; + double ylm_L1_val = ylm1[offset_L1]; + + // Combined factor + double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; + cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); + + // Accumulate for all m0 + int offset_L0 = L0 * L0; + for (int m0 = 0; m0 < num_m0; m0++) + { + double ylm0_val = ylm0[offset_L0 + m0]; + + result_re[m0] += common_factor.x * ylm0_val; + result_im[m0] += common_factor.y * ylm0_val; + + if (nlm_dim == 4) + { + double r_op_x = rx + R0.x; + double r_op_y = ry + R0.y; + double r_op_z = rz + R0.z; + + result_r_re[0][m0] += common_factor.x * ylm0_val * r_op_x; + result_r_im[0][m0] += common_factor.y * ylm0_val * r_op_x; + result_r_re[1][m0] += common_factor.x * ylm0_val * r_op_y; + result_r_im[1][m0] += common_factor.y * ylm0_val * r_op_y; + result_r_re[2][m0] += common_factor.x * ylm0_val * r_op_z; + result_r_im[2][m0] += common_factor.y * ylm0_val * r_op_z; + } + } + } + } + } + + // Reduce across threads for each m0 + int m0_offset = proj_m0_offset[proj_idx]; + int out_base = orb_idx * nlm_dim * natomwfc; + + for (int m0 = 0; m0 < num_m0; m0++) + { + // Reduce real part + s_temp_re[tid] = result_re[m0]; + s_temp_im[tid] = result_im[m0]; + __syncthreads(); + + // Standard block reduction + for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + { + if (tid < s) + { + 
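// ---- [editor's sketch, not part of the patch] -------------------------------
// The surrounding loop is the standard shared-memory tree reduction. Written as a
// standalone device helper (assuming blockDim.x is a power of two and s_buf points
// to __shared__ storage of blockDim.x doubles; the name block_reduce_sum is hypothetical):
__device__ double block_reduce_sum(double val, double* s_buf)
{
    const int t = threadIdx.x;
    s_buf[t] = val;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (t < stride)
        {
            s_buf[t] += s_buf[t + stride]; // fold the upper half onto the lower half
        }
        __syncthreads();
    }
    return s_buf[0]; // after the final sync every thread sees the block-wide sum
}
// ---- [end of editor's sketch] ------------------------------------------------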
s_temp_re[tid] += s_temp_re[tid + s]; + s_temp_im[tid] += s_temp_im[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) + { + cuDoubleComplex result = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); + result = cu_mul(result, exp_iAR0); + result = cu_conj(result); + nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; + } + __syncthreads(); + + // Position operator terms + if (nlm_dim == 4) + { + for (int d = 0; d < 3; d++) + { + s_temp_re[tid] = result_r_re[d][m0]; + s_temp_im[tid] = result_r_im[d][m0]; + __syncthreads(); + + for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + { + if (tid < s) + { + s_temp_re[tid] += s_temp_re[tid + s]; + s_temp_im[tid] += s_temp_im[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) + { + cuDoubleComplex result_r = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); + result_r = cu_mul(result_r, exp_iAR0); + result_r = cu_conj(result_r); + nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; + } + __syncthreads(); + } + } + } +} + +//============================================================================= +// Host-side Helper Function Implementations +//============================================================================= + +#define CUDA_CHECK_KERNEL(call) \ + do \ + { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) \ + { \ + fprintf(stderr, "[CUDA Kernel] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + } \ + } while (0) + +void copy_grids_to_device() +{ + // Copy Lebedev-Laikov angular grid to constant memory + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_x, + ModuleBase::Integral::Lebedev_Laikov_grid110_x, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_y, + ModuleBase::Integral::Lebedev_Laikov_grid110_y, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_z, + ModuleBase::Integral::Lebedev_Laikov_grid110_z, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_w, + ModuleBase::Integral::Lebedev_Laikov_grid110_w, + ANGULAR_GRID_NUM * sizeof(double))); + + // Prepare and copy Gauss-Legendre radial grid to constant memory + std::vector h_gl_x(RADIAL_GRID_NUM); + std::vector h_gl_w(RADIAL_GRID_NUM); + ModuleBase::Integral::Gauss_Legendre_grid_and_weight(RADIAL_GRID_NUM, h_gl_x.data(), h_gl_w.data()); + + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_gl_x, h_gl_x.data(), RADIAL_GRID_NUM * sizeof(double))); + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_gl_w, h_gl_w.data(), RADIAL_GRID_NUM * sizeof(double))); +} + +} // namespace gpu +} // namespace module_rt diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh new file mode 100644 index 0000000000..085361e81e --- /dev/null +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -0,0 +1,188 @@ +#ifndef SNAP_PSIBETA_KERNEL_CUH +#define SNAP_PSIBETA_KERNEL_CUH + +#include +#include + +namespace module_rt +{ +namespace gpu +{ + +// Grid constants +constexpr int RADIAL_GRID_NUM = 140; +constexpr int ANGULAR_GRID_NUM = 110; +constexpr int BLOCK_SIZE = 128; // >= ANGULAR_GRID_NUM for efficient reduction +constexpr int MAX_L = 4; // Maximum angular momentum supported +constexpr int MAX_YLM_SIZE = (MAX_L + 1) * (MAX_L + 1); // = 25 + +//============================================================================= +// Device Helper Functions +//============================================================================= + +/** + * @brief 
GPU version of cubic interpolation for radial functions + */ +__device__ __forceinline__ double interpolate_radial_gpu(const double* __restrict__ psi, + int mesh, + double inv_dk, + double distance) +{ + double position = distance * inv_dk; + int iq = __double2int_rd(position); // floor + + if (iq > mesh - 4) + return 0.0; + if (iq < 0) + return 0.0; + + double x0 = position - static_cast(iq); + double x1 = 1.0 - x0; + double x2 = 2.0 - x0; + double x3 = 3.0 - x0; + + return x1 * x2 * (psi[iq] * x3 + psi[iq + 3] * x0) / 6.0 + x0 * x3 * (psi[iq + 1] * x2 - psi[iq + 2] * x1) / 2.0; +} + +/** + * @brief Compute exp(i * theta) = cos(theta) + i * sin(theta) + */ +__device__ __forceinline__ cuDoubleComplex cu_exp_i(double theta) +{ + double s, c; + sincos(theta, &s, &c); + return make_cuDoubleComplex(c, s); +} + +/** + * @brief Complex multiplication + */ +__device__ __forceinline__ cuDoubleComplex cu_mul(cuDoubleComplex a, cuDoubleComplex b) +{ + return make_cuDoubleComplex(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x); +} + +/** + * @brief Complex addition + */ +__device__ __forceinline__ cuDoubleComplex cu_add(cuDoubleComplex a, cuDoubleComplex b) +{ + return make_cuDoubleComplex(a.x + b.x, a.y + b.y); +} + +/** + * @brief Complex conjugate + */ +__device__ __forceinline__ cuDoubleComplex cu_conj(cuDoubleComplex a) +{ + return make_cuDoubleComplex(a.x, -a.y); +} + +/** + * @brief Complex * real + */ +__device__ __forceinline__ cuDoubleComplex cu_mul_real(cuDoubleComplex a, double r) +{ + return make_cuDoubleComplex(a.x * r, a.y * r); +} + +/** + * @brief Compute real spherical harmonics Y_lm at given direction (x, y, z) + * This is a simplified version for small L values + * + * @param L Angular momentum (0 <= L <= MAX_L) + * @param x, y, z Direction (should be normalized) + * @param ylm Output array of size (L+1)^2 + */ +__device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm); + +//============================================================================= +// Kernel Input/Output Structures +//============================================================================= + +/** + * @brief Information about a single orbital for batch processing + */ +struct OrbitalData +{ + int L1; // Angular momentum + int m1; // Magnetic quantum number + int N1; // Radial quantum number + int iw_index; // Original orbital index for output mapping + int psi_offset; // Offset in flattened psi array + int psi_mesh; // Number of mesh points + double psi_dk; // Grid spacing + double psi_rcut; // Cutoff radius +}; + +/** + * @brief Information about a single projector + */ +struct ProjectorData +{ + int L0; // Angular momentum + int beta_offset; // Offset in flattened beta array + int beta_mesh; // Number of mesh points + double beta_dk; // Grid spacing + double beta_rcut; // Cutoff radius + double r_min; // Minimum radial value + double r_max; // Maximum radial value +}; + +//============================================================================= +// Main Kernel +//============================================================================= + +/** + * @brief Main kernel for neighbor-level batch processing + * + * Grid: (num_orbitals, nproj, 1) + * Block: (BLOCK_SIZE, 1, 1) + * + * Each block processes one (orbital, projector) pair. + * Threads within a block parallelize over angular integration points. 
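 *
 * Editor's note (illustrative only, not part of the patch): a typical launch from
 * the host side, using the device-pointer names of the accompanying
 * snap_psibeta_gpu.cu, reads
 *
 *     dim3 grid(num_orbitals, nproj, 1);
 *     dim3 block(BLOCK_SIZE, 1, 1);
 *     snap_psibeta_neighbor_batch_kernel<<<grid, block>>>(R1_d3, R0_d3, A_d3,
 *         orbitals_d, projectors_d, psi_radial_d, beta_radial_d, proj_m0_offset_d,
 *         num_orbitals, nproj, natomwfc, nlm_dim, nlm_out_d);
 *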
+ * + * @param R1 Neighbor atom position + * @param R0 Projector atom position + * @param A Vector potential + * @param orbitals Array of orbital data [num_orbitals] + * @param projectors Array of projector data [nproj] + * @param psi_radial Flattened orbital radial functions + * @param beta_radial Flattened projector radial functions + * @param num_orbitals Number of orbitals in batch + * @param nproj Number of projectors + * @param natomwfc Total number of projector components (sum of 2*L0+1) + * @param calc_r Whether to compute position operator elements + * @param nlm_out Output array [num_orbitals * nlm_dim * natomwfc] + * + * Note: Gauss-Legendre grids (gl_x, gl_w) and Lebedev grids are stored in constant memory + */ +__global__ void snap_psibeta_neighbor_batch_kernel( + double3 R1, + double3 R0, + double3 A, + const OrbitalData* __restrict__ orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, // Offset for each projector in output + int num_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out); + +//============================================================================= +// Host-side Helper Functions +//============================================================================= + +/** + * @brief Copy integration grids (Lebedev and Gauss-Legendre) to constant memory + * Should be called once before kernel execution in each calculate_HR call + */ +void copy_grids_to_device(); + +} // namespace gpu +} // namespace module_rt + +#endif // SNAP_PSIBETA_KERNEL_CUH diff --git a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h new file mode 100644 index 0000000000..ab6f55a617 --- /dev/null +++ b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h @@ -0,0 +1,73 @@ +#ifndef SNAP_PSIBETA_GPU_H +#define SNAP_PSIBETA_GPU_H + +#include "source_base/vector3.h" +#include "source_basis/module_ao/ORB_read.h" +#include "source_cell/setup_nonlocal.h" +#include "source_cell/unitcell.h" + +#include +#include +#include + +#ifdef __CUDA +#include +#endif + +namespace module_rt +{ +namespace gpu +{ + +/** + * @brief Initialize GPU resources (copy grids to constant memory) + * Should be called at the start of each calculate_HR + */ +void initialize_gpu_resources(); + +/** + * @brief Release GPU resources (clear any error states) + * Should be called at the end of each calculate_HR + */ +void finalize_gpu_resources(); + +/** + * @brief Neighbor-level GPU batch processing interface + * + * Computes matrix elements for all orbitals on a neighbor atom with projectors + * Replaces the original orbital loop: + * for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { + * snap_psibeta_half_tddft(...); + * } + * + * @param orb Orbital information + * @param infoNL_ Non-local pseudopotential information + * @param T1 Neighbor atom type + * @param R1 Neighbor atom position (already multiplied by lat0) + * @param atom1 Neighbor atom pointer (for iw2l, iw2m, iw2n) + * @param all_indexes All orbital indices on this neighbor + * @param npol Polarization number + * @param T0 Projector atom type + * @param R0 Projector atom position (already multiplied by lat0) + * @param A Vector potential + * @param nlm_all Output: nlm_all[dir][iw_index] = nlm_vector + * @param calc_r Whether to calculate position operator matrix elements + * @return true if GPU processing succeeded, false 
if caller should use CPU path + */ +bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, + const InfoNonlocal& infoNL_, + const int T1, + const ModuleBase::Vector3& R1, + const Atom* atom1, + const std::vector& all_indexes, + const int npol, + const int T0, + const ModuleBase::Vector3& R0, + const ModuleBase::Vector3& A, + std::vector>>>& nlm_all, + const bool calc_r); + +} // namespace gpu +} // namespace module_rt + +#endif // SNAP_PSIBETA_GPU_H From e879f6328306aa6e088bf54e883fc89328fe1cb0 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 9 Dec 2025 10:57:36 +0800 Subject: [PATCH 02/13] refactor(snap_psibeta_atom_batch_gpu): add timer and simplify code structure - Add ModuleBase::timer for snap_psibeta_atom_batch_gpu function - Remove GPU fallback to CPU design (return true/false in void function) - Replace fallback returns with error messages and proper early exits - Ensure timer is properly called on all exit paths - Simplify code structure for better readability --- .../module_operator_lcao/td_nonlocal_lcao.cpp | 132 ++++---- .../kernels/cuda/snap_psibeta_gpu.cu | 281 +++++++++++++++++- .../kernels/cuda/snap_psibeta_kernel.cu | 232 +++++++++++++++ .../kernels/cuda/snap_psibeta_kernel.cuh | 90 ++++-- .../module_rt/kernels/snap_psibeta_gpu.h | 55 ++-- 5 files changed, 667 insertions(+), 123 deletions(-) diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp index cefe006150..6d2972b92c 100644 --- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp @@ -160,79 +160,64 @@ void hamilt::TDNonlocal>::calculate_HR() nlm_tot[i].resize(nlm_dim); } -#pragma omp parallel - { #ifdef __CUDA - cudaSetDevice(dev_id); -#endif -#pragma omp for schedule(dynamic) - for (int ad = 0; ad < adjs.adj_num + 1; ++ad) + // GPU path: Atom-level GPU batch processing + module_rt::gpu::snap_psibeta_atom_batch_gpu(orb_, + this->ucell->infoNL, + T0, + tau0 * this->ucell->lat0, + cart_At, + adjs, + this->ucell, + paraV, + npol, + nlm_dim, + nlm_tot); +#else + // CPU path: OpenMP parallel over neighbors to compute nlm_tot +#pragma omp parallel for schedule(dynamic) + for (int ad = 0; ad < adjs.adj_num + 1; ++ad) + { + const int T1 = adjs.ntype[ad]; + const int I1 = adjs.natom[ad]; + const int iat1 = ucell->itia2iat(T1, I1); + const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; + const Atom* atom1 = &ucell->atoms[T1]; + auto all_indexes = paraV->get_indexes_row(iat1); + auto col_indexes = paraV->get_indexes_col(iat1); + all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); + std::sort(all_indexes.begin(), all_indexes.end()); + all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); + + // CPU path: loop over orbitals + for (size_t iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { - const int T1 = adjs.ntype[ad]; - const int I1 = adjs.natom[ad]; - const int iat1 = ucell->itia2iat(T1, I1); - const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; - const Atom* atom1 = &ucell->atoms[T1]; - auto all_indexes = paraV->get_indexes_row(iat1); - auto col_indexes = paraV->get_indexes_col(iat1); - all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); - std::sort(all_indexes.begin(), all_indexes.end()); - all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); - -#ifdef __CUDA - // GPU path: batch process all orbitals for this neighbor - 
// Use GPU when there are enough orbitals to benefit from parallelism - constexpr int GPU_THRESHOLD = 4; - bool gpu_success = false; - if (all_indexes.size() / npol >= GPU_THRESHOLD) - { - gpu_success = module_rt::gpu::snap_psibeta_neighbor_batch_gpu(orb_, - this->ucell->infoNL, - T1, - tau1 * this->ucell->lat0, - atom1, - all_indexes, - npol, - T0, - tau0 * this->ucell->lat0, - cart_At, - nlm_tot[ad], - TD_info::out_current); - } - if (!gpu_success) -#endif + const int iw1 = all_indexes[iw1l] / npol; + std::vector>> nlm; + module_rt::snap_psibeta_half_tddft(orb_, + this->ucell->infoNL, + nlm, + tau1 * this->ucell->lat0, + T1, + atom1->iw2l[iw1], + atom1->iw2m[iw1], + atom1->iw2n[iw1], + tau0 * this->ucell->lat0, + T0, + cart_At, + TD_info::out_current); + for (int dir = 0; dir < nlm_dim; dir++) { - // CPU path: loop over orbitals - for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) - { - const int iw1 = all_indexes[iw1l] / npol; - std::vector>> nlm; - // nlm is a vector of vectors, but size of outer vector is only 1 when out_current is false - // and size of outer vector is 4 when out_current is true (3 for , 1 - // for ) inner loop : all projectors (L0,M0) - - // snap_psibeta_half_tddft() are used to calculate - // and as well if current are needed - module_rt::snap_psibeta_half_tddft(orb_, - this->ucell->infoNL, - nlm, - tau1 * this->ucell->lat0, - T1, - atom1->iw2l[iw1], - atom1->iw2m[iw1], - atom1->iw2n[iw1], - tau0 * this->ucell->lat0, - T0, - cart_At, - TD_info::out_current); - for (int dir = 0; dir < nlm_dim; dir++) - { - nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); - } - } + nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); } } + } +#endif + // 2. calculate D for each pair of atoms + // This runs for BOTH GPU and CPU paths +#pragma omp parallel + { #ifdef _OPENMP // record the iat number of the adjacent atoms std::set ad_atom_set; @@ -259,7 +244,6 @@ void hamilt::TDNonlocal>::calculate_HR() } #endif - // 2. 
calculate D for each pair of atoms for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1) { const int T1 = adjs.ntype[ad1]; @@ -291,11 +275,11 @@ void hamilt::TDNonlocal>::calculate_HR() if (TD_info::out_current) { std::complex* tmp_c[3] = {nullptr, nullptr, nullptr}; - for (int i = 0; i < 3; i++) + for (int ii = 0; ii < 3; ii++) { - tmp_c[i] = TD_info::td_vel_op->get_current_term_pointer(i) - ->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]) - ->get_pointer(); + tmp_c[ii] = TD_info::td_vel_op->get_current_term_pointer(ii) + ->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]) + ->get_pointer(); } this->cal_HR_IJR(iat1, iat2, @@ -320,8 +304,8 @@ void hamilt::TDNonlocal>::calculate_HR() } } } - } - } + } // end omp parallel for matrix assembly + } // end for iat0 #ifdef __CUDA // Release GPU resources at end of calculation diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index c9eafe1e71..519b522c8f 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -1,5 +1,6 @@ #include "../snap_psibeta_gpu.h" #include "snap_psibeta_kernel.cuh" +#include "source_base/timer.h" #include #include @@ -21,7 +22,6 @@ namespace gpu if (err != cudaSuccess) \ { \ fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ - return false; \ } \ } while (0) @@ -347,5 +347,284 @@ bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, return true; } +//============================================================================= +// Atom-Level GPU Batch Interface - Processes ALL neighbors in single kernel +//============================================================================= + +void snap_psibeta_atom_batch_gpu( + const LCAO_Orbitals& orb, + const InfoNonlocal& infoNL_, + const int T0, + const ModuleBase::Vector3& R0, + const ModuleBase::Vector3& A, + const AdjacentAtomInfo& adjs, + const UnitCell* ucell, + const Parallel_Orbitals* paraV, + const int npol, + const int nlm_dim, + std::vector>>>>& nlm_tot) +{ + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); + + // Get projector info + const int nproj = infoNL_.nproj[T0]; + if (nproj == 0) + { + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); + return; + } + + // Calculate natomwfc + int natomwfc = 0; + std::vector proj_m0_offset_h(nproj); + for (int ip = 0; ip < nproj; ip++) + { + proj_m0_offset_h[ip] = natomwfc; + int L0 = infoNL_.Beta[T0].Proj[ip].getL(); + if (L0 > MAX_L) + { + fprintf(stderr, + "[CUDA] Error: L0=%d exceeds MAX_L=%d, GPU kernel may produce incorrect results\n", + L0, + MAX_L); + } + natomwfc += 2 * L0 + 1; + } + + //========================================================================= + // Collect all neighbor-orbital pairs + //========================================================================= + + std::vector neighbor_orbitals_h; + std::vector psi_radial_h; + + // Track neighbor index for each orbital for output reconstruction + struct OrbitalMapping + { + int neighbor_idx; + int iw_index; + }; + std::vector orbital_mappings; + + for (int ad = 0; ad < adjs.adj_num + 1; ++ad) + { + const int T1 = adjs.ntype[ad]; + const int I1 = adjs.natom[ad]; + const int iat1 = ucell->itia2iat(T1, I1); + const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; + const Atom* atom1 = &ucell->atoms[T1]; + + // Get orbital indices + auto all_indexes = paraV->get_indexes_row(iat1); + 
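// ---- [editor's sketch, not part of the patch] -------------------------------
// The following lines repeat the merge / sort / unique-erase idiom also used in
// calculate_HR to collect the distinct orbital indices of one neighbor atom.
// As a standalone helper (name merge_unique_indexes is hypothetical):
#include <algorithm> // std::sort, std::unique
#include <vector>

std::vector<int> merge_unique_indexes(std::vector<int> rows, const std::vector<int>& cols)
{
    rows.insert(rows.end(), cols.begin(), cols.end());              // concatenate row and column indices
    std::sort(rows.begin(), rows.end());                            // bring duplicates next to each other
    rows.erase(std::unique(rows.begin(), rows.end()), rows.end());  // drop the now-adjacent duplicates
    return rows;
}
// ---- [end of editor's sketch] ------------------------------------------------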
auto col_indexes = paraV->get_indexes_col(iat1); + all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); + std::sort(all_indexes.begin(), all_indexes.end()); + all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); + + // Process each orbital + for (size_t iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) + { + const int iw1 = all_indexes[iw1l] / npol; + const int L1 = atom1->iw2l[iw1]; + const int m1 = atom1->iw2m[iw1]; + const int N1 = atom1->iw2n[iw1]; + + if (L1 > MAX_L) + { + continue; // Skip orbitals with L > MAX_L + } + + // Get orbital radial data + // Use getPsi() to match CPU implementation (not getPsi_r() which returns psi*r) + const double* phi_psi = orb.Phi[T1].PhiLN(L1, N1).getPsi(); + int mesh = orb.Phi[T1].PhiLN(L1, N1).getNr(); + double dk = orb.Phi[T1].PhiLN(L1, N1).getDk(); + double rcut = orb.Phi[T1].getRcut(); + + // Add to flattened psi array + size_t psi_offset = psi_radial_h.size(); + psi_radial_h.insert(psi_radial_h.end(), phi_psi, phi_psi + mesh); + + // Create neighbor-orbital data + NeighborOrbitalData norb; + norb.neighbor_idx = ad; + norb.R1 = make_double3(tau1.x * ucell->lat0, tau1.y * ucell->lat0, tau1.z * ucell->lat0); + norb.L1 = L1; + norb.m1 = m1; + norb.N1 = N1; + norb.iw_index = all_indexes[iw1l]; + norb.psi_offset = static_cast(psi_offset); + norb.psi_mesh = mesh; + norb.psi_dk = dk; + norb.psi_rcut = rcut; + + neighbor_orbitals_h.push_back(norb); + + OrbitalMapping mapping; + mapping.neighbor_idx = ad; + mapping.iw_index = all_indexes[iw1l]; + orbital_mappings.push_back(mapping); + } + } + + int total_neighbor_orbitals = static_cast(neighbor_orbitals_h.size()); + if (total_neighbor_orbitals == 0) + { + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); + return; + } + + //========================================================================= + // Prepare projector data + //========================================================================= + + std::vector projectors_h(nproj); + std::vector beta_radial_h; + + for (int ip = 0; ip < nproj; ip++) + { + const auto& proj = infoNL_.Beta[T0].Proj[ip]; + int L0 = proj.getL(); + int mesh = proj.getNr(); + double dk = proj.getDk(); + double rcut = proj.getRcut(); + const double* beta_r = proj.getBeta_r(); + const double* radial = proj.getRadial(); + + projectors_h[ip].L0 = L0; + projectors_h[ip].beta_offset = static_cast(beta_radial_h.size()); + projectors_h[ip].beta_mesh = mesh; + projectors_h[ip].beta_dk = dk; + projectors_h[ip].beta_rcut = rcut; + // Use actual radial grid range + projectors_h[ip].r_min = radial[0]; + projectors_h[ip].r_max = radial[mesh - 1]; + + beta_radial_h.insert(beta_radial_h.end(), beta_r, beta_r + mesh); + } + + //========================================================================= + // Allocate Device Memory + //========================================================================= + + NeighborOrbitalData* neighbor_orbitals_d = nullptr; + ProjectorData* projectors_d = nullptr; + double* psi_radial_d = nullptr; + double* beta_radial_d = nullptr; + int* proj_m0_offset_d = nullptr; + cuDoubleComplex* nlm_out_d = nullptr; + + size_t output_size = total_neighbor_orbitals * nlm_dim * natomwfc; + + CUDA_CHECK(cudaMalloc(&neighbor_orbitals_d, total_neighbor_orbitals * sizeof(NeighborOrbitalData))); + CUDA_CHECK(cudaMalloc(&projectors_d, nproj * sizeof(ProjectorData))); + CUDA_CHECK(cudaMalloc(&psi_radial_d, psi_radial_h.size() * sizeof(double))); + CUDA_CHECK(cudaMalloc(&beta_radial_d, 
beta_radial_h.size() * sizeof(double))); + CUDA_CHECK(cudaMalloc(&proj_m0_offset_d, nproj * sizeof(int))); + CUDA_CHECK(cudaMalloc(&nlm_out_d, output_size * sizeof(cuDoubleComplex))); + + //========================================================================= + // Copy Data to Device + //========================================================================= + + CUDA_CHECK(cudaMemcpy(neighbor_orbitals_d, + neighbor_orbitals_h.data(), + total_neighbor_orbitals * sizeof(NeighborOrbitalData), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(projectors_d, projectors_h.data(), nproj * sizeof(ProjectorData), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(psi_radial_d, psi_radial_h.data(), psi_radial_h.size() * sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(beta_radial_d, beta_radial_h.data(), beta_radial_h.size() * sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(proj_m0_offset_d, proj_m0_offset_h.data(), nproj * sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemset(nlm_out_d, 0, output_size * sizeof(cuDoubleComplex))); + + //========================================================================= + // Launch Kernel - SINGLE launch for ALL neighbors! + //========================================================================= + + double3 R0_d3 = make_double3(R0.x, R0.y, R0.z); + double3 A_d3 = make_double3(A.x, A.y, A.z); + + dim3 grid(total_neighbor_orbitals, nproj, 1); + dim3 block(BLOCK_SIZE, 1, 1); + + snap_psibeta_atom_batch_kernel<<>>(R0_d3, + A_d3, + neighbor_orbitals_d, + projectors_d, + psi_radial_d, + beta_radial_d, + proj_m0_offset_d, + total_neighbor_orbitals, + nproj, + natomwfc, + nlm_dim, + nlm_out_d); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + fprintf(stderr, "[CUDA] Atom batch kernel launch error: %s\n", cudaGetErrorString(err)); + cudaFree(neighbor_orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); + return; + } + + CUDA_CHECK(cudaDeviceSynchronize()); + + //========================================================================= + // Copy Results Back + //========================================================================= + + std::vector nlm_out_h(output_size); + CUDA_CHECK(cudaMemcpy(nlm_out_h.data(), nlm_out_d, output_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost)); + + //========================================================================= + // Reconstruct nlm_tot structure + //========================================================================= + + for (int i = 0; i < total_neighbor_orbitals; i++) + { + int ad = orbital_mappings[i].neighbor_idx; + int iw_index = orbital_mappings[i].iw_index; + + std::vector>> nlm(nlm_dim); + for (int d = 0; d < nlm_dim; d++) + { + nlm[d].resize(natomwfc); + for (int k = 0; k < natomwfc; k++) + { + size_t idx = i * nlm_dim * natomwfc + d * natomwfc + k; + nlm[d][k] = std::complex(nlm_out_h[idx].x, nlm_out_h[idx].y); + } + } + + // Insert into nlm_tot[ad][dir][iw_index] + for (int dir = 0; dir < nlm_dim; dir++) + { + nlm_tot[ad][dir].insert({iw_index, nlm[dir]}); + } + } + + //========================================================================= + // Cleanup + //========================================================================= + + cudaFree(neighbor_orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + 
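// ---- [editor's sketch, not part of the patch] -------------------------------
// The same cudaFree sequence appears here and on the error path above; a minimal
// RAII guard (hypothetical, requires <cuda_runtime.h>) would centralize that cleanup:
template <typename T>
struct DeviceBuffer
{
    T* ptr = nullptr;
    explicit DeviceBuffer(size_t n)
    {
        cudaMalloc(reinterpret_cast<void**>(&ptr), n * sizeof(T)); // caller checks cudaGetLastError()
    }
    ~DeviceBuffer()
    {
        if (ptr != nullptr)
        {
            cudaFree(ptr); // released automatically on every return path
        }
    }
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
};
// ---- [end of editor's sketch] ------------------------------------------------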
cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); +} + } // namespace gpu } // namespace module_rt diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 1e752d6d86..7feae5907c 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -417,6 +417,238 @@ __global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, } } +//============================================================================= +// Atom-Level Batch Kernel Implementation +// Processes ALL neighbors for a center atom in a single kernel launch +//============================================================================= + +__global__ void snap_psibeta_atom_batch_kernel(double3 R0, + double3 A, + const NeighborOrbitalData* __restrict__ neighbor_orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, + int total_neighbor_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out) +{ + int norb_idx = blockIdx.x; // Which (neighbor, orbital) pair + int proj_idx = blockIdx.y; // Which projector + int tid = threadIdx.x; + + if (norb_idx >= total_neighbor_orbitals || proj_idx >= nproj) + { + return; + } + + // Load neighbor-orbital data (includes R1) + const NeighborOrbitalData& norb = neighbor_orbitals[norb_idx]; + const ProjectorData& proj = projectors[proj_idx]; + + double3 R1 = norb.R1; + int L1 = norb.L1; + int m1 = norb.m1; + int L0 = proj.L0; + int m0_offset = proj_m0_offset[proj_idx]; + + // Skip if L values exceed our precomputed limit + if (L1 > MAX_L || L0 > MAX_L) + { + return; + } + + // Compute geometry + double3 dR = make_double3(R1.x - R0.x, R1.y - R0.y, R1.z - R0.z); + double distance01 = sqrt(dR.x * dR.x + dR.y * dR.y + dR.z * dR.z); + + double r1_max = norb.psi_rcut; + // Integration range: use projector's radial grid range [r_min, r_max] + // This matches the CPU implementation in snap_psibeta_half_tddft.cpp + double r_min = proj.r_min; + double r_max = proj.r_max; + double xl = 0.5 * (r_max - r_min); + double xmean = 0.5 * (r_max + r_min); + + // Phase factor exp(i A · R0) + double AdotR0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; + cuDoubleComplex exp_iAR0 = cu_exp_i(AdotR0); + + // Shared memory for reduction + __shared__ double s_temp_re[BLOCK_SIZE]; + __shared__ double s_temp_im[BLOCK_SIZE]; + + // Initialize accumulators in registers + double result_re[MAX_YLM_SIZE]; + double result_im[MAX_YLM_SIZE]; + double result_r_re[3][MAX_YLM_SIZE]; + double result_r_im[3][MAX_YLM_SIZE]; + + int num_m0 = 2 * L0 + 1; + for (int m0 = 0; m0 < num_m0; m0++) + { + result_re[m0] = 0.0; + result_im[m0] = 0.0; + for (int d = 0; d < 3; d++) + { + result_r_re[d][m0] = 0.0; + result_r_im[d][m0] = 0.0; + } + } + + // Main integration loop + for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) + { + double r_val = xmean + xl * d_gl_x[ir]; + double w_rad = xl * d_gl_w[ir]; + + for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) + { + double leb_x = d_lebedev_x[ian]; + double leb_y = d_lebedev_y[ian]; + double leb_z = d_lebedev_z[ian]; + double w_ang = d_lebedev_w[ian]; + + // Local position relative to R0 + double rx = r_val * leb_x; + double ry = r_val * leb_y; + double rz = r_val * leb_z; + + // Vector from R1 
(orbital atom) to integration point (R0 + r_local) + // This matches neighbor_batch: tx = rx + dRa.x where dRa = R0 - R1 + double tx = rx + R0.x - R1.x; + double ty = ry + R0.y - R1.y; + double tz = rz + R0.z - R1.z; + double tnorm = sqrt(tx * tx + ty * ty + tz * tz); + + // Check psi cutoff - matches neighbor_batch logic + if (tnorm <= r1_max) + { + // Compute Y_L1 - match neighbor_batch handling of small tnorm + double ylm1[MAX_YLM_SIZE]; + if (tnorm > 1e-10) + { + double inv_tnorm = 1.0 / tnorm; + compute_ylm_gpu(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); + } + else + { + compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); + } + + // Compute Y_L0 + double ylm0[MAX_YLM_SIZE]; + compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); + + // Interpolate psi at tnorm (distance to orbital atom) + double psi_val + = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); + + // Phase factor - match neighbor_batch: exp(+i A · r_local) + double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + double phase = r_val * A_dot_leb; + cuDoubleComplex exp_iAr = cu_exp_i(phase); + + // Interpolate beta + double beta_val + = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); + + // Y_L1m1 + int offset_L1 = L1 * L1 + m1; + double ylm_L1_val = ylm1[offset_L1]; + + // Combined factor - match neighbor_batch exactly + double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; + cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); + + // Accumulate for all m0 - match neighbor_batch structure + int offset_L0 = L0 * L0; + for (int m0 = 0; m0 < num_m0; m0++) + { + double ylm0_val = ylm0[offset_L0 + m0]; + + result_re[m0] += common_factor.x * ylm0_val; + result_im[m0] += common_factor.y * ylm0_val; + + if (nlm_dim == 4) + { + double r_op_x = rx + R0.x; + double r_op_y = ry + R0.y; + double r_op_z = rz + R0.z; + + result_r_re[0][m0] += common_factor.x * ylm0_val * r_op_x; + result_r_im[0][m0] += common_factor.y * ylm0_val * r_op_x; + result_r_re[1][m0] += common_factor.x * ylm0_val * r_op_y; + result_r_im[1][m0] += common_factor.y * ylm0_val * r_op_y; + result_r_re[2][m0] += common_factor.x * ylm0_val * r_op_z; + result_r_im[2][m0] += common_factor.y * ylm0_val * r_op_z; + } + } + } + } + } + + // Reduction and output - OUTSIDE radial loop + int out_base = norb_idx * nlm_dim * natomwfc; + + for (int m0 = 0; m0 < num_m0; m0++) + { + s_temp_re[tid] = result_re[m0]; + s_temp_im[tid] = result_im[m0]; + __syncthreads(); + + for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + { + if (tid < s) + { + s_temp_re[tid] += s_temp_re[tid + s]; + s_temp_im[tid] += s_temp_im[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) + { + cuDoubleComplex result = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); + result = cu_mul(result, exp_iAR0); + result = cu_conj(result); + nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; + } + __syncthreads(); + + if (nlm_dim == 4) + { + for (int d = 0; d < 3; d++) + { + s_temp_re[tid] = result_r_re[d][m0]; + s_temp_im[tid] = result_r_im[d][m0]; + __syncthreads(); + + for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + { + if (tid < s) + { + s_temp_re[tid] += s_temp_re[tid + s]; + s_temp_im[tid] += s_temp_im[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) + { + cuDoubleComplex result_r = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); + result_r = cu_mul(result_r, exp_iAR0); + result_r = cu_conj(result_r); + nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; 
+ } + __syncthreads(); + } + } + } +} + //============================================================================= // Host-side Helper Function Implementations //============================================================================= diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index 085361e81e..f9a620ff5f 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -129,48 +129,84 @@ struct ProjectorData double r_max; // Maximum radial value }; +/** + * @brief Information about a neighbor-orbital pair for atom-level batching + * Used to batch ALL neighbors for a center atom in a single kernel launch + */ +struct NeighborOrbitalData +{ + int neighbor_idx; // Which neighbor (ad index) this orbital belongs to + double3 R1; // Neighbor atom position (tau1 * lat0) + + // Orbital info (same as OrbitalData) + int L1; + int m1; + int N1; + int iw_index; + int psi_offset; + int psi_mesh; + double psi_dk; + double psi_rcut; +}; + //============================================================================= -// Main Kernel +// Main Kernels //============================================================================= /** - * @brief Main kernel for neighbor-level batch processing + * @brief Neighbor-level batch kernel (original version) * * Grid: (num_orbitals, nproj, 1) * Block: (BLOCK_SIZE, 1, 1) + */ +__global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, + double3 R0, + double3 A, + const OrbitalData* __restrict__ orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, + int num_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out); + +/** + * @brief Atom-level batch kernel - processes ALL neighbors for a center atom * - * Each block processes one (orbital, projector) pair. - * Threads within a block parallelize over angular integration points. + * Grid: (total_neighbor_orbitals, nproj, 1) + * Block: (BLOCK_SIZE, 1, 1) * - * @param R1 Neighbor atom position - * @param R0 Projector atom position + * This kernel processes all orbitals from all neighbors in a single launch, + * reducing kernel launch overhead significantly. 
+ * + * @param R0 Center atom position (projector location) * @param A Vector potential - * @param orbitals Array of orbital data [num_orbitals] + * @param neighbor_orbitals Array of neighbor-orbital pairs [total_neighbor_orbitals] * @param projectors Array of projector data [nproj] * @param psi_radial Flattened orbital radial functions * @param beta_radial Flattened projector radial functions - * @param num_orbitals Number of orbitals in batch + * @param proj_m0_offset Offset for each projector's m=0 in output + * @param total_neighbor_orbitals Total number of (neighbor, orbital) pairs * @param nproj Number of projectors - * @param natomwfc Total number of projector components (sum of 2*L0+1) - * @param calc_r Whether to compute position operator elements - * @param nlm_out Output array [num_orbitals * nlm_dim * natomwfc] - * - * Note: Gauss-Legendre grids (gl_x, gl_w) and Lebedev grids are stored in constant memory + * @param natomwfc Total projector components (sum of 2*L0+1) + * @param nlm_dim 1 for no current, 4 for current + * @param nlm_out Output [total_neighbor_orbitals * nlm_dim * natomwfc] */ -__global__ void snap_psibeta_neighbor_batch_kernel( - double3 R1, - double3 R0, - double3 A, - const OrbitalData* __restrict__ orbitals, - const ProjectorData* __restrict__ projectors, - const double* __restrict__ psi_radial, - const double* __restrict__ beta_radial, - const int* __restrict__ proj_m0_offset, // Offset for each projector in output - int num_orbitals, - int nproj, - int natomwfc, - int nlm_dim, - cuDoubleComplex* __restrict__ nlm_out); +__global__ void snap_psibeta_atom_batch_kernel(double3 R0, + double3 A, + const NeighborOrbitalData* __restrict__ neighbor_orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, + int total_neighbor_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out); //============================================================================= // Host-side Helper Functions diff --git a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h index ab6f55a617..93e7d483d4 100644 --- a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h +++ b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h @@ -3,6 +3,8 @@ #include "source_base/vector3.h" #include "source_basis/module_ao/ORB_read.h" +#include "source_basis/module_ao/parallel_orbitals.h" +#include "source_cell/module_neighbor/sltk_grid_driver.h" #include "source_cell/setup_nonlocal.h" #include "source_cell/unitcell.h" @@ -32,27 +34,7 @@ void initialize_gpu_resources(); void finalize_gpu_resources(); /** - * @brief Neighbor-level GPU batch processing interface - * - * Computes matrix elements for all orbitals on a neighbor atom with projectors - * Replaces the original orbital loop: - * for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { - * snap_psibeta_half_tddft(...); - * } - * - * @param orb Orbital information - * @param infoNL_ Non-local pseudopotential information - * @param T1 Neighbor atom type - * @param R1 Neighbor atom position (already multiplied by lat0) - * @param atom1 Neighbor atom pointer (for iw2l, iw2m, iw2n) - * @param all_indexes All orbital indices on this neighbor - * @param npol Polarization number - * @param T0 Projector atom type - * @param R0 Projector atom position (already multiplied by lat0) - * @param A Vector potential - * @param 
nlm_all Output: nlm_all[dir][iw_index] = nlm_vector - * @param calc_r Whether to calculate position operator matrix elements - * @return true if GPU processing succeeded, false if caller should use CPU path + * @brief Neighbor-level GPU batch processing interface (legacy) */ bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, const InfoNonlocal& infoNL_, @@ -67,6 +49,37 @@ bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, std::vector>>>& nlm_all, const bool calc_r); +/** + * @brief Atom-level GPU batch processing interface + * + * Processes ALL neighbors for a center atom in a SINGLE kernel launch. + * This significantly reduces kernel launch overhead compared to neighbor-level batching. + * + * @param orb Orbital information + * @param infoNL_ Non-local pseudopotential information + * @param T0 Center atom type (projector location) + * @param R0 Center atom position (already multiplied by lat0) + * @param A Vector potential + * @param adjs Adjacent atom information for this center atom + * @param ucell Unit cell pointer + * @param paraV Parallel orbitals information + * @param npol Polarization number + * @param nlm_dim 1 for no current, 4 for current calculation + * @param nlm_tot Output: nlm_tot[ad][dir][iw_index] = nlm_vector + */ +void snap_psibeta_atom_batch_gpu( + const LCAO_Orbitals& orb, + const InfoNonlocal& infoNL_, + const int T0, + const ModuleBase::Vector3& R0, + const ModuleBase::Vector3& A, + const AdjacentAtomInfo& adjs, + const UnitCell* ucell, + const Parallel_Orbitals* paraV, + const int npol, + const int nlm_dim, + std::vector>>>>& nlm_tot); + } // namespace gpu } // namespace module_rt From 6014c905aa6bb3ae925e2c54a9de1c83d9c3273d Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 9 Dec 2025 11:17:10 +0800 Subject: [PATCH 03/13] remove snap_psibeta_neighbor_batch_gpu --- .../kernels/cuda/snap_psibeta_gpu.cu | 293 ------------------ .../kernels/cuda/snap_psibeta_kernel.cu | 252 --------------- .../kernels/cuda/snap_psibeta_kernel.cuh | 53 +--- .../module_rt/kernels/snap_psibeta_gpu.h | 16 - 4 files changed, 9 insertions(+), 605 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index 519b522c8f..f630aec8c2 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -54,299 +54,6 @@ void finalize_gpu_resources() cudaGetLastError(); } -//============================================================================= -// Main GPU Interface Function -//============================================================================= - -bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, - const InfoNonlocal& infoNL_, - const int T1, - const ModuleBase::Vector3& R1, - const Atom* atom1, - const std::vector& all_indexes, - const int npol, - const int T0, - const ModuleBase::Vector3& R0, - const ModuleBase::Vector3& A, - std::vector>>>& nlm_all, - const bool calc_r) -{ - - // Get projector info - const int nproj = infoNL_.nproj[T0]; - if (nproj == 0) - { - return true; - } - - // Calculate number of orbitals - int num_orbitals = all_indexes.size() / npol; - if (num_orbitals == 0) - { - return true; - } - - // Calculate natomwfc (total projector components) - int natomwfc = 0; - std::vector proj_m0_offset_h(nproj); - for (int ip = 0; ip < nproj; ip++) - { - proj_m0_offset_h[ip] = natomwfc; - int L0 = infoNL_.Beta[T0].Proj[ip].getL(); - - // Check L0 against MAX_L - 
if (L0 > MAX_L) - { - fprintf(stderr, "[CUDA] Warning: L0=%d exceeds MAX_L=%d, falling back to CPU\n", L0, MAX_L); - return false; - } - - natomwfc += 2 * L0 + 1; - } - - int nlm_dim = calc_r ? 4 : 1; - - // Resize output - if (nlm_all.size() != nlm_dim) - { - nlm_all.resize(nlm_dim); - } - - //========================================================================= - // Prepare Host Data - //========================================================================= - - // Orbital data - std::vector orbitals_h(num_orbitals); - - // Calculate total psi size and offsets - std::vector psi_offsets_h(num_orbitals); - int total_psi_size = 0; - - for (int i = 0; i < num_orbitals; i++) - { - int iw1l = i * npol; - int iw1 = all_indexes[iw1l] / npol; - - int L1 = atom1->iw2l[iw1]; - int m1 = atom1->iw2m[iw1]; - int N1 = atom1->iw2n[iw1]; - - // Check L1 against MAX_L - if (L1 > MAX_L) - { - fprintf(stderr, "[CUDA] Warning: L1=%d exceeds MAX_L=%d, falling back to CPU\n", L1, MAX_L); - return false; - } - - const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1); - - orbitals_h[i].L1 = L1; - orbitals_h[i].m1 = m1; - orbitals_h[i].N1 = N1; - orbitals_h[i].iw_index = all_indexes[iw1l]; - orbitals_h[i].psi_offset = total_psi_size; - orbitals_h[i].psi_mesh = phi_ln.getNr(); - orbitals_h[i].psi_dk = phi_ln.getDk(); - orbitals_h[i].psi_rcut = phi_ln.getRcut(); - - psi_offsets_h[i] = total_psi_size; - total_psi_size += phi_ln.getNr(); - } - - // Copy psi radial data - std::vector psi_radial_h(total_psi_size); - for (int i = 0; i < num_orbitals; i++) - { - int iw1l = i * npol; - int iw1 = all_indexes[iw1l] / npol; - - int L1 = atom1->iw2l[iw1]; - int N1 = atom1->iw2n[iw1]; - - const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1); - int mesh = phi_ln.getNr(); - const double* psi = phi_ln.getPsi(); - - for (int j = 0; j < mesh; j++) - { - psi_radial_h[orbitals_h[i].psi_offset + j] = psi[j]; - } - } - - // Projector data - std::vector projectors_h(nproj); - std::vector beta_offsets_h(nproj); - int total_beta_size = 0; - - for (int ip = 0; ip < nproj; ip++) - { - const auto& proj = infoNL_.Beta[T0].Proj[ip]; - - projectors_h[ip].L0 = proj.getL(); - projectors_h[ip].beta_offset = total_beta_size; - projectors_h[ip].beta_mesh = proj.getNr(); - projectors_h[ip].beta_dk = proj.getDk(); - projectors_h[ip].beta_rcut = proj.getRcut(); - - // Get r_min and r_max from radial grid - const double* radial = proj.getRadial(); - int mesh = proj.getNr(); - projectors_h[ip].r_min = radial[0]; - projectors_h[ip].r_max = radial[mesh - 1]; - - beta_offsets_h[ip] = total_beta_size; - total_beta_size += mesh; - } - - // Copy beta radial data - std::vector beta_radial_h(total_beta_size); - for (int ip = 0; ip < nproj; ip++) - { - const auto& proj = infoNL_.Beta[T0].Proj[ip]; - int mesh = proj.getNr(); - const double* beta_r = proj.getBeta_r(); - - for (int j = 0; j < mesh; j++) - { - beta_radial_h[projectors_h[ip].beta_offset + j] = beta_r[j]; - } - } - - //========================================================================= - // Allocate Device Memory - //========================================================================= - - OrbitalData* orbitals_d = nullptr; - ProjectorData* projectors_d = nullptr; - double* psi_radial_d = nullptr; - double* beta_radial_d = nullptr; - int* proj_m0_offset_d = nullptr; - cuDoubleComplex* nlm_out_d = nullptr; - - size_t output_size = num_orbitals * nlm_dim * natomwfc; - - CUDA_CHECK(cudaMalloc(&orbitals_d, num_orbitals * sizeof(OrbitalData))); - CUDA_CHECK(cudaMalloc(&projectors_d, nproj * 
sizeof(ProjectorData))); - CUDA_CHECK(cudaMalloc(&psi_radial_d, total_psi_size * sizeof(double))); - CUDA_CHECK(cudaMalloc(&beta_radial_d, total_beta_size * sizeof(double))); - CUDA_CHECK(cudaMalloc(&proj_m0_offset_d, nproj * sizeof(int))); - CUDA_CHECK(cudaMalloc(&nlm_out_d, output_size * sizeof(cuDoubleComplex))); - - //========================================================================= - // Copy Data to Device - //========================================================================= - - CUDA_CHECK(cudaMemcpy(orbitals_d, orbitals_h.data(), num_orbitals * sizeof(OrbitalData), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(projectors_d, projectors_h.data(), nproj * sizeof(ProjectorData), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(psi_radial_d, psi_radial_h.data(), total_psi_size * sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK( - cudaMemcpy(beta_radial_d, beta_radial_h.data(), total_beta_size * sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(proj_m0_offset_d, proj_m0_offset_h.data(), nproj * sizeof(int), cudaMemcpyHostToDevice)); - - // Initialize output to zero - CUDA_CHECK(cudaMemset(nlm_out_d, 0, output_size * sizeof(cuDoubleComplex))); - - //========================================================================= - // Launch Kernel - //========================================================================= - - double3 R1_d3 = make_double3(R1.x, R1.y, R1.z); - double3 R0_d3 = make_double3(R0.x, R0.y, R0.z); - double3 A_d3 = make_double3(A.x, A.y, A.z); - - dim3 grid(num_orbitals, nproj, 1); - dim3 block(BLOCK_SIZE, 1, 1); - - snap_psibeta_neighbor_batch_kernel<<>>(R1_d3, - R0_d3, - A_d3, - orbitals_d, - projectors_d, - psi_radial_d, - beta_radial_d, - proj_m0_offset_d, - num_orbitals, - nproj, - natomwfc, - nlm_dim, - nlm_out_d); - - // Check for kernel errors - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - fprintf(stderr, "[CUDA] Kernel launch error: %s\n", cudaGetErrorString(err)); - // Cleanup and return - cudaFree(orbitals_d); - cudaFree(projectors_d); - cudaFree(psi_radial_d); - cudaFree(beta_radial_d); - cudaFree(proj_m0_offset_d); - cudaFree(nlm_out_d); - return false; - } - - CUDA_CHECK(cudaDeviceSynchronize()); - - // Check for errors after synchronization - err = cudaGetLastError(); - if (err != cudaSuccess) - { - fprintf(stderr, "[CUDA] Kernel execution error: %s\n", cudaGetErrorString(err)); - cudaFree(orbitals_d); - cudaFree(projectors_d); - cudaFree(psi_radial_d); - cudaFree(beta_radial_d); - cudaFree(proj_m0_offset_d); - cudaFree(nlm_out_d); - return false; - } - - //========================================================================= - // Copy Results Back - //========================================================================= - - std::vector nlm_out_h(output_size); - CUDA_CHECK(cudaMemcpy(nlm_out_h.data(), nlm_out_d, output_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost)); - - // Reconstruct output into nlm_all format - for (int i = 0; i < num_orbitals; i++) - { - int iw_index = orbitals_h[i].iw_index; - - // Create nlm vectors for this orbital - std::vector>> nlm(nlm_dim); - for (int d = 0; d < nlm_dim; d++) - { - nlm[d].resize(natomwfc); - for (int k = 0; k < natomwfc; k++) - { - size_t idx = i * nlm_dim * natomwfc + d * natomwfc + k; - nlm[d][k] = std::complex(nlm_out_h[idx].x, nlm_out_h[idx].y); - } - } - - // Insert into output maps - for (int dir = 0; dir < nlm_dim; dir++) - { - nlm_all[dir].insert({iw_index, nlm[dir]}); - } - } - - 
//========================================================================= - // Cleanup - //========================================================================= - - cudaFree(orbitals_d); - cudaFree(projectors_d); - cudaFree(psi_radial_d); - cudaFree(beta_radial_d); - cudaFree(proj_m0_offset_d); - cudaFree(nlm_out_d); - return true; -} - //============================================================================= // Atom-Level GPU Batch Interface - Processes ALL neighbors in single kernel //============================================================================= diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 7feae5907c..8a2c914ca7 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -165,258 +165,6 @@ __device__ __forceinline__ double warp_reduce_sum(double val) return val; } -//============================================================================= -// Main Kernel Implementation - Memory Efficient Version -//============================================================================= - -__global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, - double3 R0, - double3 A, - const OrbitalData* __restrict__ orbitals, - const ProjectorData* __restrict__ projectors, - const double* __restrict__ psi_radial, - const double* __restrict__ beta_radial, - const int* __restrict__ proj_m0_offset, - int num_orbitals, - int nproj, - int natomwfc, - int nlm_dim, - cuDoubleComplex* __restrict__ nlm_out) -{ - - int orb_idx = blockIdx.x; - int proj_idx = blockIdx.y; - int tid = threadIdx.x; - - if (orb_idx >= num_orbitals || proj_idx >= nproj) - { - return; - } - - // Load orbital and projector data - OrbitalData orb = orbitals[orb_idx]; - ProjectorData proj = projectors[proj_idx]; - - int L1 = orb.L1; - int m1 = orb.m1; - int L0 = proj.L0; - int num_m0 = 2 * L0 + 1; - - // Distance check - double3 dRa = make_double3(R0.x - R1.x, R0.y - R1.y, R0.z - R1.z); - double distance = sqrt(dRa.x * dRa.x + dRa.y * dRa.y + dRa.z * dRa.z); - - // Check for NaN - if (isnan(distance)) - { - return; - } - - double Rcut_sum = orb.psi_rcut + proj.beta_rcut; - - if (distance > Rcut_sum) - { - // Zero output - if (tid == 0) - { - int m0_offset = proj_m0_offset[proj_idx]; - int out_base = orb_idx * nlm_dim * natomwfc; - for (int m0 = 0; m0 < num_m0; m0++) - { - for (int d = 0; d < nlm_dim; d++) - { - nlm_out[out_base + d * natomwfc + m0_offset + m0] = make_cuDoubleComplex(0.0, 0.0); - } - } - } - return; - } - - // Reduced shared memory: only for reduction (2 doubles per thread * BLOCK_SIZE) - __shared__ double s_temp_re[BLOCK_SIZE]; - __shared__ double s_temp_im[BLOCK_SIZE]; - - // Radial grid mapping - double xl = (proj.r_max - proj.r_min) * 0.5; - double xmean = (proj.r_max + proj.r_min) * 0.5; - - // Precompute A dot R0 - double A_phase_R0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; - cuDoubleComplex exp_iAR0 = cu_exp_i(A_phase_R0); - - // Result accumulators in registers - one per m0 component - double result_re[MAX_YLM_SIZE]; - double result_im[MAX_YLM_SIZE]; - double result_r_re[3][MAX_YLM_SIZE]; - double result_r_im[3][MAX_YLM_SIZE]; - - // Initialize accumulators - for (int m0 = 0; m0 < num_m0; m0++) - { - result_re[m0] = 0.0; - result_im[m0] = 0.0; - if (nlm_dim == 4) - { - for (int d = 0; d < 3; d++) - { - result_r_re[d][m0] = 0.0; - result_r_im[d][m0] = 0.0; - } - } - } - - // Radial loop - using 
constant memory for Gauss-Legendre grid - for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) - { - double r_val = xmean + xl * d_gl_x[ir]; - double w_rad = xl * d_gl_w[ir]; - - // Angular loop - each thread handles multiple angular points - for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) - { - double leb_x = d_lebedev_x[ian]; - double leb_y = d_lebedev_y[ian]; - double leb_z = d_lebedev_z[ian]; - double w_ang = d_lebedev_w[ian]; - - double rx = r_val * leb_x; - double ry = r_val * leb_y; - double rz = r_val * leb_z; - - double tx = rx + dRa.x; - double ty = ry + dRa.y; - double tz = rz + dRa.z; - double tnorm = sqrt(tx * tx + ty * ty + tz * tz); - - if (tnorm <= orb.psi_rcut) - { - // Compute Y_L1 - double ylm1[MAX_YLM_SIZE]; - if (tnorm > 1e-10) - { - double inv_tnorm = 1.0 / tnorm; - compute_ylm_gpu(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); - } - else - { - compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); - } - - // Compute Y_L0 - double ylm0[MAX_YLM_SIZE]; - compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); - - // Interpolate psi - double psi_val - = interpolate_radial_gpu(psi_radial + orb.psi_offset, orb.psi_mesh, 1.0 / orb.psi_dk, tnorm); - - // Phase factor - double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; - double phase = r_val * A_dot_leb; - cuDoubleComplex exp_iAr = cu_exp_i(phase); - - // Interpolate beta - double beta_val - = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); - - // Y_L1m1 - int offset_L1 = L1 * L1 + m1; - double ylm_L1_val = ylm1[offset_L1]; - - // Combined factor - double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; - cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); - - // Accumulate for all m0 - int offset_L0 = L0 * L0; - for (int m0 = 0; m0 < num_m0; m0++) - { - double ylm0_val = ylm0[offset_L0 + m0]; - - result_re[m0] += common_factor.x * ylm0_val; - result_im[m0] += common_factor.y * ylm0_val; - - if (nlm_dim == 4) - { - double r_op_x = rx + R0.x; - double r_op_y = ry + R0.y; - double r_op_z = rz + R0.z; - - result_r_re[0][m0] += common_factor.x * ylm0_val * r_op_x; - result_r_im[0][m0] += common_factor.y * ylm0_val * r_op_x; - result_r_re[1][m0] += common_factor.x * ylm0_val * r_op_y; - result_r_im[1][m0] += common_factor.y * ylm0_val * r_op_y; - result_r_re[2][m0] += common_factor.x * ylm0_val * r_op_z; - result_r_im[2][m0] += common_factor.y * ylm0_val * r_op_z; - } - } - } - } - } - - // Reduce across threads for each m0 - int m0_offset = proj_m0_offset[proj_idx]; - int out_base = orb_idx * nlm_dim * natomwfc; - - for (int m0 = 0; m0 < num_m0; m0++) - { - // Reduce real part - s_temp_re[tid] = result_re[m0]; - s_temp_im[tid] = result_im[m0]; - __syncthreads(); - - // Standard block reduction - for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) - { - if (tid < s) - { - s_temp_re[tid] += s_temp_re[tid + s]; - s_temp_im[tid] += s_temp_im[tid + s]; - } - __syncthreads(); - } - - if (tid == 0) - { - cuDoubleComplex result = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); - result = cu_mul(result, exp_iAR0); - result = cu_conj(result); - nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; - } - __syncthreads(); - - // Position operator terms - if (nlm_dim == 4) - { - for (int d = 0; d < 3; d++) - { - s_temp_re[tid] = result_r_re[d][m0]; - s_temp_im[tid] = result_r_im[d][m0]; - __syncthreads(); - - for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) - { - if (tid < s) - { - s_temp_re[tid] += s_temp_re[tid + s]; - s_temp_im[tid] += s_temp_im[tid + s]; - 
} - __syncthreads(); - } - - if (tid == 0) - { - cuDoubleComplex result_r = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); - result_r = cu_mul(result_r, exp_iAR0); - result_r = cu_conj(result_r); - nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; - } - __syncthreads(); - } - } - } -} - //============================================================================= // Atom-Level Batch Kernel Implementation // Processes ALL neighbors for a center atom in a single kernel launch diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index f9a620ff5f..c3191653eb 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -100,21 +100,6 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm // Kernel Input/Output Structures //============================================================================= -/** - * @brief Information about a single orbital for batch processing - */ -struct OrbitalData -{ - int L1; // Angular momentum - int m1; // Magnetic quantum number - int N1; // Radial quantum number - int iw_index; // Original orbital index for output mapping - int psi_offset; // Offset in flattened psi array - int psi_mesh; // Number of mesh points - double psi_dk; // Grid spacing - double psi_rcut; // Cutoff radius -}; - /** * @brief Information about a single projector */ @@ -138,41 +123,21 @@ struct NeighborOrbitalData int neighbor_idx; // Which neighbor (ad index) this orbital belongs to double3 R1; // Neighbor atom position (tau1 * lat0) - // Orbital info (same as OrbitalData) - int L1; - int m1; - int N1; - int iw_index; - int psi_offset; - int psi_mesh; - double psi_dk; - double psi_rcut; + // Orbital info + int L1; // Angular momentum + int m1; // Magnetic quantum number + int N1; // Radial quantum number + int iw_index; // Original orbital index for output mapping + int psi_offset; // Offset in flattened psi array + int psi_mesh; // Number of mesh points + double psi_dk; // Grid spacing + double psi_rcut; // Cutoff radius }; //============================================================================= // Main Kernels //============================================================================= -/** - * @brief Neighbor-level batch kernel (original version) - * - * Grid: (num_orbitals, nproj, 1) - * Block: (BLOCK_SIZE, 1, 1) - */ -__global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, - double3 R0, - double3 A, - const OrbitalData* __restrict__ orbitals, - const ProjectorData* __restrict__ projectors, - const double* __restrict__ psi_radial, - const double* __restrict__ beta_radial, - const int* __restrict__ proj_m0_offset, - int num_orbitals, - int nproj, - int natomwfc, - int nlm_dim, - cuDoubleComplex* __restrict__ nlm_out); - /** * @brief Atom-level batch kernel - processes ALL neighbors for a center atom * diff --git a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h index 93e7d483d4..43b617e188 100644 --- a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h +++ b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h @@ -33,22 +33,6 @@ void initialize_gpu_resources(); */ void finalize_gpu_resources(); -/** - * @brief Neighbor-level GPU batch processing interface (legacy) - */ -bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, - const InfoNonlocal& 
infoNL_, - const int T1, - const ModuleBase::Vector3& R1, - const Atom* atom1, - const std::vector& all_indexes, - const int npol, - const int T0, - const ModuleBase::Vector3& R0, - const ModuleBase::Vector3& A, - std::vector>>>& nlm_all, - const bool calc_r); - /** * @brief Atom-level GPU batch processing interface * From 9047e1b41212a243387b6754a4988f1224f1ab0a Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 19 Dec 2025 16:18:40 +0800 Subject: [PATCH 04/13] perf(gpu): optimize snap_psibeta_atom_batch_kernel with loop restructuring - Move ylm0 computation outside radial loop (saves 140x redundant calculations) - Hoist A_dot_leb and dR calculations outside inner loop - Add #pragma unroll hints for radial and m0 loops Achieves 23.3% speedup on snap_psibeta_gpu (19.27s -> 14.78s). Numerical correctness verified: energy matches baseline (-756.053 Ry). --- .../kernels/cuda/snap_psibeta_kernel.cu | 73 ++++++++++--------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 8a2c914ca7..9d8c57a2ef 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -246,35 +246,50 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } } - // Main integration loop - for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) + // Main integration loop - RESTRUCTURED: angular loop outside, radial inside + // ylm0 only depends on angular direction (leb_x,y,z), not radial distance + // This saves 140x redundant ylm0 computations per angular point + for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) { - double r_val = xmean + xl * d_gl_x[ir]; - double w_rad = xl * d_gl_w[ir]; - - for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) + double leb_x = d_lebedev_x[ian]; + double leb_y = d_lebedev_y[ian]; + double leb_z = d_lebedev_z[ian]; + double w_ang = d_lebedev_w[ian]; + + // Precompute ylm0 ONCE per angular point (doesn't depend on r_val) + double ylm0[MAX_YLM_SIZE]; + compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); + int offset_L0 = L0 * L0; + + // Precompute A dot direction (doesn't depend on r_val) + double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + + // Precompute dR components + double dRx = R0.x - R1.x; + double dRy = R0.y - R1.y; + double dRz = R0.z - R1.z; + + #pragma unroll 4 + for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) { - double leb_x = d_lebedev_x[ian]; - double leb_y = d_lebedev_y[ian]; - double leb_z = d_lebedev_z[ian]; - double w_ang = d_lebedev_w[ian]; + double r_val = xmean + xl * d_gl_x[ir]; + double w_rad = xl * d_gl_w[ir]; // Local position relative to R0 double rx = r_val * leb_x; double ry = r_val * leb_y; double rz = r_val * leb_z; - // Vector from R1 (orbital atom) to integration point (R0 + r_local) - // This matches neighbor_batch: tx = rx + dRa.x where dRa = R0 - R1 - double tx = rx + R0.x - R1.x; - double ty = ry + R0.y - R1.y; - double tz = rz + R0.z - R1.z; + // Vector from R1 to integration point + double tx = rx + dRx; + double ty = ry + dRy; + double tz = rz + dRz; double tnorm = sqrt(tx * tx + ty * ty + tz * tz); - // Check psi cutoff - matches neighbor_batch logic + // Check psi cutoff if (tnorm <= r1_max) { - // Compute Y_L1 - match neighbor_batch handling of small tnorm + // Compute Y_L1 (depends on direction to R1, varies with r_val) double ylm1[MAX_YLM_SIZE]; if (tnorm > 1e-10) { @@ -286,33 +301,25 @@ 
__global__ void snap_psibeta_atom_batch_kernel(double3 R0, compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); } - // Compute Y_L0 - double ylm0[MAX_YLM_SIZE]; - compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); - - // Interpolate psi at tnorm (distance to orbital atom) - double psi_val - = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); + // Interpolate psi + double psi_val = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); - // Phase factor - match neighbor_batch: exp(+i A · r_local) - double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + // Phase factor double phase = r_val * A_dot_leb; cuDoubleComplex exp_iAr = cu_exp_i(phase); // Interpolate beta - double beta_val - = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); + double beta_val = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); // Y_L1m1 - int offset_L1 = L1 * L1 + m1; - double ylm_L1_val = ylm1[offset_L1]; + double ylm_L1_val = ylm1[L1 * L1 + m1]; - // Combined factor - match neighbor_batch exactly + // Combined factor double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); - // Accumulate for all m0 - match neighbor_batch structure - int offset_L0 = L0 * L0; + // Accumulate for all m0 + #pragma unroll for (int m0 = 0; m0 < num_m0; m0++) { double ylm0_val = ylm0[offset_L0 + m0]; From 97b26d6410a38d549cd0c9239f4522e090fbdfff Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 19 Dec 2025 17:13:31 +0800 Subject: [PATCH 05/13] perf(gpu): optimize compute_ylm_gpu with atan2 and sincos - Replace conditional atan branches with single atan2 call - Use sincos() instead of separate sin/cos calls Achieves 8.4% additional speedup (14.78s -> 13.56s) Combined with loop restructuring: 29.6% total from baseline Numerical correctness verified: -756.053 Ry --- .../kernels/cuda/snap_psibeta_kernel.cu | 51 +++++++++---------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 9d8c57a2ef..e706d513b8 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -30,6 +30,7 @@ __constant__ double d_gl_w[RADIAL_GRID_NUM]; /** * @brief Compute real spherical harmonics Y_lm * Based on the recursive formula used in ModuleBase::Ylm + * OPTIMIZED: Use atan2, sincos for better performance */ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm) { @@ -45,38 +46,26 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm return; } - // Compute theta and phi - double rxy = sqrt(x * x + y * y); - double r = sqrt(x * x + y * y + z * z); + // Compute r and cost + double r2 = x * x + y * y + z * z; + double r = sqrt(r2); - double cost, phi; + double cost, sint, phi; if (r < 1e-10) { cost = 1.0; + sint = 0.0; phi = 0.0; } else { cost = z / r; - if (rxy < 1e-10) - { - phi = 0.0; - } - else if (x > 1e-10) - { - phi = atan(y / x); - } - else if (x < -1e-10) - { - phi = atan(y / x) + PI; - } - else - { - phi = (y >= 0.0) ? 
PI / 2.0 : -PI / 2.0; - } + sint = sqrt(1.0 - cost * cost); + // Use atan2 for robust phi computation (replaces multiple conditionals) + phi = atan2(y, x); } - double sint = sqrt(1.0 - cost * cost); + // Ensure sint is non-negative (numerical safety) if (sint < 0.0) { sint = 0.0; @@ -143,10 +132,14 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm } double same = c * sqrt(1.0 / fact_ratio) * SQRT2; - ylm[lm] = same * p[l][m] * cos(m * phi); + // Use sincos for efficiency (computes both sin and cos together) + double sin_mphi, cos_mphi; + sincos(m * phi, &sin_mphi, &cos_mphi); + + ylm[lm] = same * p[l][m] * cos_mphi; lm++; - ylm[lm] = same * p[l][m] * sin(m * phi); + ylm[lm] = same * p[l][m] * sin_mphi; lm++; } } @@ -269,7 +262,7 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double dRy = R0.y - R1.y; double dRz = R0.z - R1.z; - #pragma unroll 4 +#pragma unroll 4 for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) { double r_val = xmean + xl * d_gl_x[ir]; @@ -302,14 +295,16 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } // Interpolate psi - double psi_val = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); + double psi_val + = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); // Phase factor double phase = r_val * A_dot_leb; cuDoubleComplex exp_iAr = cu_exp_i(phase); // Interpolate beta - double beta_val = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); + double beta_val + = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); // Y_L1m1 double ylm_L1_val = ylm1[L1 * L1 + m1]; @@ -318,8 +313,8 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); - // Accumulate for all m0 - #pragma unroll +// Accumulate for all m0 +#pragma unroll for (int m0 = 0; m0 < num_m0; m0++) { double ylm0_val = ylm0[offset_L0 + m0]; From 9bdfdb7aee30839585d0890543cee930c003acda Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 19 Dec 2025 20:19:07 +0800 Subject: [PATCH 06/13] make the code more concise --- .../module_rt/kernels/cuda/snap_psibeta_kernel.cu | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index e706d513b8..25a1cef499 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -72,15 +72,7 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm } // Associated Legendre polynomials P_l^m - double p[MAX_L + 1][MAX_L + 1]; - - for (int l = 0; l <= L; l++) - { - for (int m = 0; m <= L; m++) - { - p[l][m] = 0.0; - } - } + double p[MAX_L + 1][MAX_L + 1] = {0}; p[0][0] = 1.0; From 926fcb6c836a1fb2f60fe84cdff3f4d761cbef69 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 20 Dec 2025 02:35:03 +0800 Subject: [PATCH 07/13] perf(cuda): optimize compute_ylm_gpu with template parameters - Convert compute_ylm_gpu to templated version with L as template param - Use linear array for Legendre polynomials (reduces from 25 to 15 doubles) - Add DISPATCH_YLM macro for runtime-to-template dispatch - Add MAX_M0_SIZE constant for result array sizing - Replace C++17 
constexpr if with regular if for C++14 compatibility - Enable compiler loop unrolling with #pragma unroll Performance: snap_psibeta_gpu improved from 13.27s to 9.83s (1.35x speedup) --- .../kernels/cuda/snap_psibeta_kernel.cu | 100 +++++++++++++----- .../kernels/cuda/snap_psibeta_kernel.cuh | 38 ++++++- 2 files changed, 107 insertions(+), 31 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 25a1cef499..de0d6b5dcd 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -32,11 +32,36 @@ __constant__ double d_gl_w[RADIAL_GRID_NUM]; * Based on the recursive formula used in ModuleBase::Ylm * OPTIMIZED: Use atan2, sincos for better performance */ -__device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm) +/** + * @brief Helper to access linear-stored P_l^m value + * Linear index for (l,m): sum_{i=0}^{l-1}(i+1) + m = l*(l+1)/2 + m + */ +__device__ __forceinline__ double& p_access(double* p, int l, int m) +{ + return p[l * (l + 1) / 2 + m]; +} + +__device__ __forceinline__ double p_get(const double* p, int l, int m) { - const double PI = 3.14159265358979323846; - const double FOUR_PI = 4.0 * PI; - const double SQRT2 = 1.41421356237309504880; + return p[l * (l + 1) / 2 + m]; +} + +/** + * @brief Compute real spherical harmonics Y_lm (TEMPLATED VERSION) + * L is now a compile-time constant for better optimization + * Uses linear array for Legendre polynomials to reduce register pressure + * + * @tparam L Maximum angular momentum (compile-time constant) + * @param x, y, z Direction components (should be normalized) + * @param ylm Output array of size (L+1)^2 + */ +template +__device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) +{ + constexpr double PI = 3.14159265358979323846; + constexpr double FOUR_PI = 4.0 * PI; + constexpr double SQRT2 = 1.41421356237309504880; + constexpr int P_SIZE = (L + 1) * (L + 2) / 2; // Lower triangular storage // Y_00 ylm[0] = 0.5 * sqrt(1.0 / PI); @@ -46,7 +71,7 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm return; } - // Compute r and cost + // Compute cost, sint, phi double r2 = x * x + y * y + z * z; double r = sqrt(r2); @@ -61,82 +86,101 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm { cost = z / r; sint = sqrt(1.0 - cost * cost); - // Use atan2 for robust phi computation (replaces multiple conditionals) phi = atan2(y, x); } - // Ensure sint is non-negative (numerical safety) if (sint < 0.0) { sint = 0.0; } - // Associated Legendre polynomials P_l^m - double p[MAX_L + 1][MAX_L + 1] = {0}; + // Associated Legendre polynomials P_l^m in linear storage + double p[P_SIZE]; - p[0][0] = 1.0; + // Initialize + p_access(p, 0, 0) = 1.0; if (L >= 1) { - p[1][0] = cost; - p[1][1] = -sint; + p_access(p, 1, 0) = cost; + p_access(p, 1, 1) = -sint; } +// Recurrence for l >= 2 +#pragma unroll for (int l = 2; l <= L; l++) { +#pragma unroll for (int m = 0; m <= l - 2; m++) { - p[l][m] = ((2 * l - 1) * cost * p[l - 1][m] - (l - 1 + m) * p[l - 2][m]) / (double)(l - m); + p_access(p, l, m) + = ((2 * l - 1) * cost * p_get(p, l - 1, m) - (l - 1 + m) * p_get(p, l - 2, m)) / (double)(l - m); } - p[l][l - 1] = (2 * l - 1) * cost * p[l - 1][l - 1]; + p_access(p, l, l - 1) = (2 * l - 1) * cost * p_get(p, l - 1, l - 1); + + // Compute (2l-1)!! 
= 1*3*5*...*(2l-1) double fact = 1.0; +#pragma unroll for (int i = 1; i <= 2 * l - 1; i += 2) { fact *= i; } + + // Compute sint^l double sint_pow = 1.0; +#pragma unroll for (int i = 0; i < l; i++) { sint_pow *= sint; } - p[l][l] = fact * sint_pow; + + p_access(p, l, l) = fact * sint_pow; if (l % 2 == 1) { - p[l][l] = -p[l][l]; + p_access(p, l, l) = -p_access(p, l, l); } } - // Compute Y_lm + // Compute Y_lm from P_l^m int lm = 0; +#pragma unroll for (int l = 0; l <= L; l++) { double c = sqrt((2.0 * l + 1.0) / FOUR_PI); - ylm[lm] = c * p[l][0]; + ylm[lm] = c * p_get(p, l, 0); lm++; +#pragma unroll for (int m = 1; m <= l; m++) { double fact_ratio = 1.0; +#pragma unroll for (int i = l - m + 1; i <= l + m; i++) { fact_ratio *= i; } double same = c * sqrt(1.0 / fact_ratio) * SQRT2; - // Use sincos for efficiency (computes both sin and cos together) double sin_mphi, cos_mphi; sincos(m * phi, &sin_mphi, &cos_mphi); - ylm[lm] = same * p[l][m] * cos_mphi; + ylm[lm] = same * p_get(p, l, m) * cos_mphi; lm++; - ylm[lm] = same * p[l][m] * sin_mphi; + ylm[lm] = same * p_get(p, l, m) * sin_mphi; lm++; } } } +// Explicit template instantiations for L = 0, 1, 2, 3, 4 +template __device__ void compute_ylm_gpu<0>(double x, double y, double z, double* ylm); +template __device__ void compute_ylm_gpu<1>(double x, double y, double z, double* ylm); +template __device__ void compute_ylm_gpu<2>(double x, double y, double z, double* ylm); +template __device__ void compute_ylm_gpu<3>(double x, double y, double z, double* ylm); +template __device__ void compute_ylm_gpu<4>(double x, double y, double z, double* ylm); + //============================================================================= // Warp Reduction for double //============================================================================= @@ -214,10 +258,10 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, __shared__ double s_temp_im[BLOCK_SIZE]; // Initialize accumulators in registers - double result_re[MAX_YLM_SIZE]; - double result_im[MAX_YLM_SIZE]; - double result_r_re[3][MAX_YLM_SIZE]; - double result_r_im[3][MAX_YLM_SIZE]; + double result_re[MAX_M0_SIZE]; + double result_im[MAX_M0_SIZE]; + double result_r_re[3][MAX_M0_SIZE]; + double result_r_im[3][MAX_M0_SIZE]; int num_m0 = 2 * L0 + 1; for (int m0 = 0; m0 < num_m0; m0++) @@ -243,7 +287,7 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, // Precompute ylm0 ONCE per angular point (doesn't depend on r_val) double ylm0[MAX_YLM_SIZE]; - compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); + DISPATCH_YLM(L0, leb_x, leb_y, leb_z, ylm0); int offset_L0 = L0 * L0; // Precompute A dot direction (doesn't depend on r_val) @@ -279,11 +323,11 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, if (tnorm > 1e-10) { double inv_tnorm = 1.0 / tnorm; - compute_ylm_gpu(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); + DISPATCH_YLM(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); } else { - compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); + DISPATCH_YLM(L1, 0.0, 0.0, 1.0, ylm1); } // Interpolate psi diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index c3191653eb..8c00e4cc5a 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -15,6 +15,7 @@ constexpr int ANGULAR_GRID_NUM = 110; constexpr int BLOCK_SIZE = 128; // >= ANGULAR_GRID_NUM for efficient reduction constexpr int MAX_L = 
4; // Maximum angular momentum supported constexpr int MAX_YLM_SIZE = (MAX_L + 1) * (MAX_L + 1); // = 25 +constexpr int MAX_M0_SIZE = 2 * MAX_L + 1; // = 9 //============================================================================= // Device Helper Functions @@ -88,13 +89,44 @@ __device__ __forceinline__ cuDoubleComplex cu_mul_real(cuDoubleComplex a, double /** * @brief Compute real spherical harmonics Y_lm at given direction (x, y, z) - * This is a simplified version for small L values + * TEMPLATED VERSION - L is compile-time constant for optimization * - * @param L Angular momentum (0 <= L <= MAX_L) + * @tparam L Maximum angular momentum (0 <= L <= MAX_L) * @param x, y, z Direction (should be normalized) * @param ylm Output array of size (L+1)^2 */ -__device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm); +template +__device__ void compute_ylm_gpu(double x, double y, double z, double* ylm); + +/** + * @brief Runtime dispatch macro for templated compute_ylm_gpu + * Converts runtime L to compile-time template call + */ +#define DISPATCH_YLM(L_val, x, y, z, ylm) \ + do \ + { \ + switch (L_val) \ + { \ + case 0: \ + compute_ylm_gpu<0>(x, y, z, ylm); \ + break; \ + case 1: \ + compute_ylm_gpu<1>(x, y, z, ylm); \ + break; \ + case 2: \ + compute_ylm_gpu<2>(x, y, z, ylm); \ + break; \ + case 3: \ + compute_ylm_gpu<3>(x, y, z, ylm); \ + break; \ + case 4: \ + compute_ylm_gpu<4>(x, y, z, ylm); \ + break; \ + default: \ + compute_ylm_gpu<4>(x, y, z, ylm); \ + break; \ + } \ + } while (0) //============================================================================= // Kernel Input/Output Structures From 7ccabc5a04fa628992581ea2c838076f72fcab86 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 20 Dec 2025 02:36:20 +0800 Subject: [PATCH 08/13] perf(cuda): use warp shuffle for reduction in snap_psibeta kernel - Replace shared memory tree reduction with warp shuffle reduction - Use warp_reduce_sum for intra-warp reduction (faster shuffle ops) - Reduce shared memory from BLOCK_SIZE (2KB) to NUM_WARPS (64 bytes) - Cross-warp reduction done by first warp reading from shared memory Reduces register usage from 94 to 88, shared memory from 2KB to 64 bytes. 
--- .../kernels/cuda/snap_psibeta_kernel.cu | 85 +++++++++++-------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index de0d6b5dcd..9f9fd4c099 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -253,9 +253,10 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double AdotR0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; cuDoubleComplex exp_iAR0 = cu_exp_i(AdotR0); - // Shared memory for reduction - __shared__ double s_temp_re[BLOCK_SIZE]; - __shared__ double s_temp_im[BLOCK_SIZE]; + // Shared memory for warp-level reduction (only need space for warp sums) + constexpr int NUM_WARPS = BLOCK_SIZE / 32; // 128 / 32 = 4 warps + __shared__ double s_temp_re[NUM_WARPS]; + __shared__ double s_temp_im[NUM_WARPS]; // Initialize accumulators in registers double result_re[MAX_M0_SIZE]; @@ -376,31 +377,40 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } } - // Reduction and output - OUTSIDE radial loop + // Reduction and output using warp shuffle - more efficient than shared memory int out_base = norb_idx * nlm_dim * natomwfc; + int warp_id = tid / 32; + int lane_id = tid % 32; for (int m0 = 0; m0 < num_m0; m0++) { - s_temp_re[tid] = result_re[m0]; - s_temp_im[tid] = result_im[m0]; - __syncthreads(); + // Step 1: Warp-level reduction using shuffle + double sum_re = warp_reduce_sum(result_re[m0]); + double sum_im = warp_reduce_sum(result_im[m0]); - for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + // Step 2: First lane of each warp writes to shared memory + if (lane_id == 0) { - if (tid < s) - { - s_temp_re[tid] += s_temp_re[tid + s]; - s_temp_im[tid] += s_temp_im[tid + s]; - } - __syncthreads(); + s_temp_re[warp_id] = sum_re; + s_temp_im[warp_id] = sum_im; } + __syncthreads(); - if (tid == 0) + // Step 3: First warp reduces across all warps + if (warp_id == 0) { - cuDoubleComplex result = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); - result = cu_mul(result, exp_iAR0); - result = cu_conj(result); - nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; + sum_re = (lane_id < NUM_WARPS) ? s_temp_re[lane_id] : 0.0; + sum_im = (lane_id < NUM_WARPS) ? 
s_temp_im[lane_id] : 0.0; + sum_re = warp_reduce_sum(sum_re); + sum_im = warp_reduce_sum(sum_im); + + if (lane_id == 0) + { + cuDoubleComplex result = make_cuDoubleComplex(sum_re, sum_im); + result = cu_mul(result, exp_iAR0); + result = cu_conj(result); + nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; + } } __syncthreads(); @@ -408,26 +418,33 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, { for (int d = 0; d < 3; d++) { - s_temp_re[tid] = result_r_re[d][m0]; - s_temp_im[tid] = result_r_im[d][m0]; - __syncthreads(); + // Step 1: Warp-level reduction + double sum_r_re = warp_reduce_sum(result_r_re[d][m0]); + double sum_r_im = warp_reduce_sum(result_r_im[d][m0]); - for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + // Step 2: First lane writes to shared memory + if (lane_id == 0) { - if (tid < s) - { - s_temp_re[tid] += s_temp_re[tid + s]; - s_temp_im[tid] += s_temp_im[tid + s]; - } - __syncthreads(); + s_temp_re[warp_id] = sum_r_re; + s_temp_im[warp_id] = sum_r_im; } + __syncthreads(); - if (tid == 0) + // Step 3: First warp reduces across warps + if (warp_id == 0) { - cuDoubleComplex result_r = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); - result_r = cu_mul(result_r, exp_iAR0); - result_r = cu_conj(result_r); - nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; + sum_r_re = (lane_id < NUM_WARPS) ? s_temp_re[lane_id] : 0.0; + sum_r_im = (lane_id < NUM_WARPS) ? s_temp_im[lane_id] : 0.0; + sum_r_re = warp_reduce_sum(sum_r_re); + sum_r_im = warp_reduce_sum(sum_r_im); + + if (lane_id == 0) + { + cuDoubleComplex result_r = make_cuDoubleComplex(sum_r_re, sum_r_im); + result_r = cu_mul(result_r, exp_iAR0); + result_r = cu_conj(result_r); + nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; + } } __syncthreads(); } From 0afca47efd36c0d9e77d445a586cd83fb69b9190 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 20 Dec 2025 13:35:50 +0800 Subject: [PATCH 09/13] refactor(cuda): improve snap_psibeta_kernel code organization and documentation - Add comprehensive file headers explaining purpose and key features - Organize code into logical sections with clear separators - Add doxygen-style documentation for all functions, structs, and constants - Fix inaccurate comments (BLOCK_SIZE requirement, direction vector normalization) - Remove unused variables (dR, distance01) - Remove finalize_gpu_resources() as it's not needed for constant memory - Improve inline comments explaining algorithms and optimizations --- .../module_operator_lcao/td_nonlocal_lcao.cpp | 6 - .../kernels/cuda/snap_psibeta_gpu.cu | 132 +++++-- .../kernels/cuda/snap_psibeta_kernel.cu | 359 +++++++++++------- .../kernels/cuda/snap_psibeta_kernel.cuh | 244 ++++++++---- 4 files changed, 484 insertions(+), 257 deletions(-) diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp index 6d2972b92c..63f4590056 100644 --- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp @@ -306,12 +306,6 @@ void hamilt::TDNonlocal>::calculate_HR() } } // end omp parallel for matrix assembly } // end for iat0 - -#ifdef __CUDA - // Release GPU resources at end of calculation - module_rt::gpu::finalize_gpu_resources(); -#endif - ModuleBase::timer::tick("TDNonlocal", "calculate_HR"); } diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index 
f630aec8c2..026af2d1d0 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -1,3 +1,16 @@ +/** + * @file snap_psibeta_gpu.cu + * @brief Host-side GPU interface for overlap computation + * + * This file provides the high-level interface for GPU-accelerated computation + * of overlap integrals between atomic orbitals (psi) and non-local projectors + * (beta). It handles: + * - GPU resource initialization and cleanup + * - Data marshalling from ABACUS structures to GPU-friendly formats + * - Kernel launch configuration + * - Result unpacking back to ABACUS data structures + */ + #include "../snap_psibeta_gpu.h" #include "snap_psibeta_kernel.cuh" #include "source_base/timer.h" @@ -12,9 +25,12 @@ namespace gpu { //============================================================================= -// CUDA Error Checking Macro +// CUDA Error Handling //============================================================================= +/** + * @brief CUDA error checking macro with file/line information + */ #define CUDA_CHECK(call) \ do \ { \ @@ -26,12 +42,21 @@ namespace gpu } while (0) //============================================================================= -// Resource Management - Simplified (using constant memory for grids) +// GPU Resource Management //============================================================================= +/** + * @brief Initialize GPU resources for snap_psibeta computation + * + * Checks for available CUDA devices and copies integration grids + * (Lebedev-Laikov angular and Gauss-Legendre radial) to constant memory. + * + * @note Call this once at the start of a calculation session before any + * snap_psibeta_atom_batch_gpu calls. + */ void initialize_gpu_resources() { - // Check for CUDA device + // Verify CUDA device availability int device_count = 0; cudaError_t err = cudaGetDeviceCount(&device_count); if (err != cudaSuccess || device_count == 0) @@ -40,24 +65,58 @@ void initialize_gpu_resources() return; } - // Copy integration grids (Lebedev and Gauss-Legendre) to constant memory + // Initialize integration grids in constant memory copy_grids_to_device(); - // Sync to ensure all initialization is complete + // Synchronize to ensure initialization is complete cudaDeviceSynchronize(); } -void finalize_gpu_resources() +//============================================================================= +// Internal Helper Structures +//============================================================================= + +/** + * @brief Mapping structure for reconstructing output data + * + * Associates each orbital in the flattened GPU array with its original + * neighbor and orbital indices for proper result placement. 
+ */ +struct OrbitalMapping { - // No explicit cleanup needed - constant memory is managed by CUDA runtime - // Just reset any CUDA errors that may have accumulated - cudaGetLastError(); -} + int neighbor_idx; ///< Index of neighbor atom in adjacency list + int iw_index; ///< Global orbital index for output mapping +}; //============================================================================= -// Atom-Level GPU Batch Interface - Processes ALL neighbors in single kernel +// Main GPU Interface Function //============================================================================= +/** + * @brief Compute overlap integrals on GPU + * + * This function processes ALL neighbor atoms for a single center atom (where + * the projectors are located) in a single kernel launch, providing significant + * performance improvement over per-neighbor processing. + * + * Workflow: + * 1. Collect all (neighbor, orbital) pairs into flattened arrays + * 2. Prepare projector data for the center atom + * 3. Transfer data to GPU and launch kernel + * 4. Retrieve results and reconstruct nlm_tot structure + * + * @param orb LCAO orbital information + * @param infoNL_ Non-local projector information + * @param T0 Atom type of center atom (projector location) + * @param R0 Position of center atom + * @param A Vector potential for phase factor + * @param adjs Adjacent atom information + * @param ucell Unit cell information + * @param paraV Parallel orbital distribution + * @param npol Number of spin polarizations + * @param nlm_dim Output dimension (1 for overlap only, 4 for overlap + current) + * @param nlm_tot Output: overlap integrals indexed as [neighbor][direction][orbital] + */ void snap_psibeta_atom_batch_gpu( const LCAO_Orbitals& orb, const InfoNonlocal& infoNL_, @@ -73,7 +132,10 @@ void snap_psibeta_atom_batch_gpu( { ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); - // Get projector info + //========================================================================= + // Early exit if no projectors on center atom + //========================================================================= + const int nproj = infoNL_.nproj[T0]; if (nproj == 0) { @@ -81,13 +143,19 @@ void snap_psibeta_atom_batch_gpu( return; } - // Calculate natomwfc - int natomwfc = 0; + //========================================================================= + // Compute projector output indices + //========================================================================= + + int natomwfc = 0; // Total number of projector components std::vector proj_m0_offset_h(nproj); + for (int ip = 0; ip < nproj; ip++) { proj_m0_offset_h[ip] = natomwfc; int L0 = infoNL_.Beta[T0].Proj[ip].getL(); + + // Validate angular momentum if (L0 > MAX_L) { fprintf(stderr, @@ -99,18 +167,11 @@ void snap_psibeta_atom_batch_gpu( } //========================================================================= - // Collect all neighbor-orbital pairs + // Collect all (neighbor, orbital) pairs //========================================================================= std::vector neighbor_orbitals_h; std::vector psi_radial_h; - - // Track neighbor index for each orbital for output reconstruction - struct OrbitalMapping - { - int neighbor_idx; - int iw_index; - }; std::vector orbital_mappings; for (int ad = 0; ad < adjs.adj_num + 1; ++ad) @@ -121,7 +182,7 @@ void snap_psibeta_atom_batch_gpu( const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; const Atom* atom1 = &ucell->atoms[T1]; - // Get orbital indices + // Get unique orbital indices (union of row and 
column indices) auto all_indexes = paraV->get_indexes_row(iat1); auto col_indexes = paraV->get_indexes_col(iat1); all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); @@ -136,19 +197,19 @@ void snap_psibeta_atom_batch_gpu( const int m1 = atom1->iw2m[iw1]; const int N1 = atom1->iw2n[iw1]; + // Skip orbitals with angular momentum beyond supported limit if (L1 > MAX_L) { - continue; // Skip orbitals with L > MAX_L + continue; } - // Get orbital radial data - // Use getPsi() to match CPU implementation (not getPsi_r() which returns psi*r) + // Get orbital radial function (use getPsi(), not getPsi_r()) const double* phi_psi = orb.Phi[T1].PhiLN(L1, N1).getPsi(); int mesh = orb.Phi[T1].PhiLN(L1, N1).getNr(); double dk = orb.Phi[T1].PhiLN(L1, N1).getDk(); double rcut = orb.Phi[T1].getRcut(); - // Add to flattened psi array + // Append to flattened psi array size_t psi_offset = psi_radial_h.size(); psi_radial_h.insert(psi_radial_h.end(), phi_psi, phi_psi + mesh); @@ -167,6 +228,7 @@ void snap_psibeta_atom_batch_gpu( neighbor_orbitals_h.push_back(norb); + // Track mapping for result reconstruction OrbitalMapping mapping; mapping.neighbor_idx = ad; mapping.iw_index = all_indexes[iw1l]; @@ -203,7 +265,6 @@ void snap_psibeta_atom_batch_gpu( projectors_h[ip].beta_mesh = mesh; projectors_h[ip].beta_dk = dk; projectors_h[ip].beta_rcut = rcut; - // Use actual radial grid range projectors_h[ip].r_min = radial[0]; projectors_h[ip].r_max = radial[mesh - 1]; @@ -211,7 +272,7 @@ void snap_psibeta_atom_batch_gpu( } //========================================================================= - // Allocate Device Memory + // Allocate GPU memory //========================================================================= NeighborOrbitalData* neighbor_orbitals_d = nullptr; @@ -231,7 +292,7 @@ void snap_psibeta_atom_batch_gpu( CUDA_CHECK(cudaMalloc(&nlm_out_d, output_size * sizeof(cuDoubleComplex))); //========================================================================= - // Copy Data to Device + // Transfer data to GPU //========================================================================= CUDA_CHECK(cudaMemcpy(neighbor_orbitals_d, @@ -247,7 +308,7 @@ void snap_psibeta_atom_batch_gpu( CUDA_CHECK(cudaMemset(nlm_out_d, 0, output_size * sizeof(cuDoubleComplex))); //========================================================================= - // Launch Kernel - SINGLE launch for ALL neighbors! 
+ // Launch kernel //========================================================================= double3 R0_d3 = make_double3(R0.x, R0.y, R0.z); @@ -269,6 +330,7 @@ void snap_psibeta_atom_batch_gpu( nlm_dim, nlm_out_d); + // Check for launch errors cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -286,14 +348,14 @@ void snap_psibeta_atom_batch_gpu( CUDA_CHECK(cudaDeviceSynchronize()); //========================================================================= - // Copy Results Back + // Retrieve results //========================================================================= std::vector nlm_out_h(output_size); CUDA_CHECK(cudaMemcpy(nlm_out_h.data(), nlm_out_d, output_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost)); //========================================================================= - // Reconstruct nlm_tot structure + // Reconstruct output structure //========================================================================= for (int i = 0; i < total_neighbor_orbitals; i++) @@ -312,7 +374,7 @@ void snap_psibeta_atom_batch_gpu( } } - // Insert into nlm_tot[ad][dir][iw_index] + // Insert into nlm_tot[neighbor][direction][orbital] for (int dir = 0; dir < nlm_dim; dir++) { nlm_tot[ad][dir].insert({iw_index, nlm[dir]}); @@ -320,7 +382,7 @@ void snap_psibeta_atom_batch_gpu( } //========================================================================= - // Cleanup + // Cleanup GPU memory //========================================================================= cudaFree(neighbor_orbitals_d); diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 9f9fd4c099..8d29015add 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -1,3 +1,16 @@ +/** + * @file snap_psibeta_kernel.cu + * @brief CUDA kernel implementation for overlap integrals + * + * This file implements the GPU-accelerated numerical integration for computing + * overlap integrals between atomic orbitals (psi) and non-local projectors (beta). 
+ * The implementation uses: + * - Lebedev-Laikov quadrature (110 points) for angular integration + * - Gauss-Legendre quadrature (140 points) for radial integration + * - Templated spherical harmonics with compile-time L for optimization + * - Warp-level shuffle reduction for efficient parallel summation + */ + #include "snap_psibeta_kernel.cuh" #include "source_base/math_integral.h" @@ -10,60 +23,75 @@ namespace gpu { //============================================================================= -// Constant Memory for Integration Grids +// Constant Memory - Integration Grids //============================================================================= -// Lebedev-Laikov angular grid (110 points) -__constant__ double d_lebedev_x[ANGULAR_GRID_NUM]; -__constant__ double d_lebedev_y[ANGULAR_GRID_NUM]; -__constant__ double d_lebedev_z[ANGULAR_GRID_NUM]; -__constant__ double d_lebedev_w[ANGULAR_GRID_NUM]; +// Lebedev-Laikov angular quadrature grid (110 points) +__constant__ double d_lebedev_x[ANGULAR_GRID_NUM]; ///< x-direction cosines +__constant__ double d_lebedev_y[ANGULAR_GRID_NUM]; ///< y-direction cosines +__constant__ double d_lebedev_z[ANGULAR_GRID_NUM]; ///< z-direction cosines +__constant__ double d_lebedev_w[ANGULAR_GRID_NUM]; ///< Angular integration weights -// Gauss-Legendre radial grid (140 points) -__constant__ double d_gl_x[RADIAL_GRID_NUM]; -__constant__ double d_gl_w[RADIAL_GRID_NUM]; +// Gauss-Legendre radial quadrature grid (140 points) +__constant__ double d_gl_x[RADIAL_GRID_NUM]; ///< Quadrature abscissae on [-1, 1] +__constant__ double d_gl_w[RADIAL_GRID_NUM]; ///< Quadrature weights //============================================================================= -// Spherical Harmonics Implementation +// Spherical Harmonics - Helper Functions //============================================================================= /** - * @brief Compute real spherical harmonics Y_lm - * Based on the recursive formula used in ModuleBase::Ylm - * OPTIMIZED: Use atan2, sincos for better performance - */ -/** - * @brief Helper to access linear-stored P_l^m value - * Linear index for (l,m): sum_{i=0}^{l-1}(i+1) + m = l*(l+1)/2 + m + * @brief Access element in lower-triangular stored Legendre polynomial array + * + * For associated Legendre polynomials P_l^m, we only need 0 <= m <= l. + * Storage layout: P_0^0, P_1^0, P_1^1, P_2^0, P_2^1, P_2^2, ... + * Linear index: l*(l+1)/2 + m */ __device__ __forceinline__ double& p_access(double* p, int l, int m) { return p[l * (l + 1) / 2 + m]; } +/** + * @brief Read-only access to Legendre polynomial array + */ __device__ __forceinline__ double p_get(const double* p, int l, int m) { return p[l * (l + 1) / 2 + m]; } +//============================================================================= +// Spherical Harmonics - Main Implementation +//============================================================================= + /** - * @brief Compute real spherical harmonics Y_lm (TEMPLATED VERSION) - * L is now a compile-time constant for better optimization - * Uses linear array for Legendre polynomials to reduce register pressure + * @brief Compute real spherical harmonics Y_lm (templated version) + * + * Uses the recursive computation of associated Legendre polynomials: + * P_l^m = ((2l-1)*cos(theta)*P_{l-1}^m - (l-1+m)*P_{l-2}^m) / (l-m) + * P_l^{l-1} = (2l-1)*cos(theta)*P_{l-1}^{l-1} + * P_l^l = (-1)^l * (2l-1)!! 
* sin^l(theta) + * + * Real spherical harmonics are defined as: + * Y_{lm} = c_l * P_l^0 for m = 0 + * Y_{l,2m-1} = c_l * sqrt(2/(l-m)!/(l+m)!) * P_l^m * cos(m*phi) for m > 0 + * Y_{l,2m} = c_l * sqrt(2/(l-m)!/(l+m)!) * P_l^m * sin(m*phi) for m > 0 + * where c_l = sqrt((2l+1)/(4*pi)) * * @tparam L Maximum angular momentum (compile-time constant) - * @param x, y, z Direction components (should be normalized) - * @param ylm Output array of size (L+1)^2 + * @param x, y, z Direction vector components (need not be normalized) + * @param ylm Output array storing Y_lm values in order: Y_00, Y_10, Y_11c, Y_11s, ... */ template __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) { + // Mathematical constants constexpr double PI = 3.14159265358979323846; constexpr double FOUR_PI = 4.0 * PI; constexpr double SQRT2 = 1.41421356237309504880; - constexpr int P_SIZE = (L + 1) * (L + 2) / 2; // Lower triangular storage + constexpr int P_SIZE = (L + 1) * (L + 2) / 2; // Lower triangular storage size - // Y_00 + // Y_00 = 1/(2*sqrt(pi)) ylm[0] = 0.5 * sqrt(1.0 / PI); if (L == 0) @@ -71,13 +99,14 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) return; } - // Compute cost, sint, phi + // Compute spherical angles double r2 = x * x + y * y + z * z; double r = sqrt(r2); double cost, sint, phi; if (r < 1e-10) { + // At origin, default to z-axis direction cost = 1.0; sint = 0.0; phi = 0.0; @@ -89,86 +118,92 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) phi = atan2(y, x); } + // Ensure sint is non-negative (numerical safety) if (sint < 0.0) { sint = 0.0; } - // Associated Legendre polynomials P_l^m in linear storage + // Associated Legendre polynomials P_l^m in lower-triangular storage double p[P_SIZE]; - // Initialize + // Base cases p_access(p, 0, 0) = 1.0; if (L >= 1) { - p_access(p, 1, 0) = cost; - p_access(p, 1, 1) = -sint; + p_access(p, 1, 0) = cost; // P_1^0 = cos(theta) + p_access(p, 1, 1) = -sint; // P_1^1 = -sin(theta) } -// Recurrence for l >= 2 + // Recurrence relations for l >= 2 #pragma unroll for (int l = 2; l <= L; l++) { + // P_l^m for m = 0 to l-2: standard recurrence #pragma unroll for (int m = 0; m <= l - 2; m++) { - p_access(p, l, m) - = ((2 * l - 1) * cost * p_get(p, l - 1, m) - (l - 1 + m) * p_get(p, l - 2, m)) / (double)(l - m); + p_access(p, l, m) = ((2 * l - 1) * cost * p_get(p, l - 1, m) - (l - 1 + m) * p_get(p, l - 2, m)) + / static_cast(l - m); } + + // P_l^{l-1} = (2l-1) * cos(theta) * P_{l-1}^{l-1} p_access(p, l, l - 1) = (2 * l - 1) * cost * p_get(p, l - 1, l - 1); - // Compute (2l-1)!! = 1*3*5*...*(2l-1) - double fact = 1.0; + // P_l^l = (-1)^l * (2l-1)!! 
* sin^l(theta) + double double_factorial = 1.0; #pragma unroll for (int i = 1; i <= 2 * l - 1; i += 2) { - fact *= i; + double_factorial *= i; } - // Compute sint^l - double sint_pow = 1.0; + double sint_power = 1.0; #pragma unroll for (int i = 0; i < l; i++) { - sint_pow *= sint; + sint_power *= sint; } - p_access(p, l, l) = fact * sint_pow; + p_access(p, l, l) = double_factorial * sint_power; if (l % 2 == 1) { p_access(p, l, l) = -p_access(p, l, l); } } - // Compute Y_lm from P_l^m + // Transform Legendre polynomials to real spherical harmonics int lm = 0; #pragma unroll for (int l = 0; l <= L; l++) { double c = sqrt((2.0 * l + 1.0) / FOUR_PI); + // m = 0 component ylm[lm] = c * p_get(p, l, 0); lm++; + // m > 0 components (cosine and sine parts) #pragma unroll for (int m = 1; m <= l; m++) { - double fact_ratio = 1.0; + // Compute normalization factor: sqrt(2 * (l-m)! / (l+m)!) + double factorial_ratio = 1.0; #pragma unroll for (int i = l - m + 1; i <= l + m; i++) { - fact_ratio *= i; + factorial_ratio *= i; } - double same = c * sqrt(1.0 / fact_ratio) * SQRT2; + double norm = c * sqrt(1.0 / factorial_ratio) * SQRT2; double sin_mphi, cos_mphi; sincos(m * phi, &sin_mphi, &cos_mphi); - ylm[lm] = same * p_get(p, l, m) * cos_mphi; + ylm[lm] = norm * p_get(p, l, m) * cos_mphi; // Y_{l,m} cosine part lm++; - ylm[lm] = same * p_get(p, l, m) * sin_mphi; + ylm[lm] = norm * p_get(p, l, m) * sin_mphi; // Y_{l,m} sine part lm++; } } @@ -182,9 +217,18 @@ template __device__ void compute_ylm_gpu<3>(double x, double y, double z, double template __device__ void compute_ylm_gpu<4>(double x, double y, double z, double* ylm); //============================================================================= -// Warp Reduction for double +// Warp-Level Reduction //============================================================================= +/** + * @brief Warp-level sum reduction using shuffle instructions + * + * Performs a parallel reduction within a warp (32 threads) using __shfl_down_sync. + * After this function, lane 0 contains the sum of all input values in the warp. + * + * @param val Input value from each thread + * @return Sum across all threads in the warp (valid only in lane 0) + */ __device__ __forceinline__ double warp_reduce_sum(double val) { for (int offset = 16; offset > 0; offset /= 2) @@ -195,10 +239,20 @@ __device__ __forceinline__ double warp_reduce_sum(double val) } //============================================================================= -// Atom-Level Batch Kernel Implementation -// Processes ALL neighbors for a center atom in a single kernel launch +// Main Kernel Implementation //============================================================================= +/** + * @brief Atom-level batch kernel for overlap integrals + * + * Integration is performed using restructured loops for efficiency: + * - Outer loop: angular points (each thread handles different angles) + * - Inner loop: radial points (each thread accumulates all radii) + * + * This structure exploits the fact that Y_lm for the projector (ylm0) only + * depends on the angular direction, not the radial distance, saving + * RADIAL_GRID_NUM redundant ylm0 computations per angular point. 
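+ *
+ * Schematically, the quantity accumulated in the loops below for each
+ * (orbital, projector, m0) combination is
+ *   nlm ≈ conj( exp(i A·R0) * sum_{ang,rad} w_ang * w_rad * r * beta(r)
+ *               * Y_{L0,m0}(u) * psi(|t|) * Y_{L1,m1}(t/|t|) * exp(i A·(r u)) )
+ * where u is the Lebedev direction, r the mapped Gauss-Legendre radius and
+ * t = r*u + R0 - R1; the current-operator components (nlm_dim == 4) carry an
+ * extra factor of the Cartesian components of (r*u + R0).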
+ */ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double3 A, const NeighborOrbitalData* __restrict__ neighbor_orbitals, @@ -212,59 +266,74 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, int nlm_dim, cuDoubleComplex* __restrict__ nlm_out) { - int norb_idx = blockIdx.x; // Which (neighbor, orbital) pair - int proj_idx = blockIdx.y; // Which projector - int tid = threadIdx.x; + // Thread/block indices + const int norb_idx = blockIdx.x; // Which (neighbor, orbital) pair + const int proj_idx = blockIdx.y; // Which projector + const int tid = threadIdx.x; + // Early exit for out-of-bounds blocks if (norb_idx >= total_neighbor_orbitals || proj_idx >= nproj) { return; } - // Load neighbor-orbital data (includes R1) + //------------------------------------------------------------------------- + // Load input data + //------------------------------------------------------------------------- + const NeighborOrbitalData& norb = neighbor_orbitals[norb_idx]; const ProjectorData& proj = projectors[proj_idx]; - double3 R1 = norb.R1; - int L1 = norb.L1; - int m1 = norb.m1; - int L0 = proj.L0; - int m0_offset = proj_m0_offset[proj_idx]; + const double3 R1 = norb.R1; + const int L1 = norb.L1; + const int m1 = norb.m1; + const int L0 = proj.L0; + const int m0_offset = proj_m0_offset[proj_idx]; - // Skip if L values exceed our precomputed limit + // Skip if angular momentum exceeds supported limit if (L1 > MAX_L || L0 > MAX_L) { return; } + //------------------------------------------------------------------------- // Compute geometry - double3 dR = make_double3(R1.x - R0.x, R1.y - R0.y, R1.z - R0.z); - double distance01 = sqrt(dR.x * dR.x + dR.y * dR.y + dR.z * dR.z); - - double r1_max = norb.psi_rcut; - // Integration range: use projector's radial grid range [r_min, r_max] - // This matches the CPU implementation in snap_psibeta_half_tddft.cpp - double r_min = proj.r_min; - double r_max = proj.r_max; - double xl = 0.5 * (r_max - r_min); - double xmean = 0.5 * (r_max + r_min); - - // Phase factor exp(i A · R0) - double AdotR0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; - cuDoubleComplex exp_iAR0 = cu_exp_i(AdotR0); - - // Shared memory for warp-level reduction (only need space for warp sums) + //------------------------------------------------------------------------- + + // Note: dR (R1 - R0) is computed inline as dRx/dRy/dRz in the integration loop + + // Orbital cutoff + const double r1_max = norb.psi_rcut; + + // Integration range from projector radial grid + const double r_min = proj.r_min; + const double r_max = proj.r_max; + const double xl = 0.5 * (r_max - r_min); // Half-range for Gauss-Legendre + const double xmean = 0.5 * (r_max + r_min); // Midpoint + + // Phase factor exp(i * A · R0) + const double AdotR0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; + const cuDoubleComplex exp_iAR0 = cu_exp_i(AdotR0); + + //------------------------------------------------------------------------- + // Shared memory for warp reduction + //------------------------------------------------------------------------- + constexpr int NUM_WARPS = BLOCK_SIZE / 32; // 128 / 32 = 4 warps __shared__ double s_temp_re[NUM_WARPS]; __shared__ double s_temp_im[NUM_WARPS]; - // Initialize accumulators in registers + //------------------------------------------------------------------------- + // Initialize accumulators (per-thread registers) + //------------------------------------------------------------------------- + + const int num_m0 = 2 * L0 + 1; + double result_re[MAX_M0_SIZE]; double 
result_im[MAX_M0_SIZE]; - double result_r_re[3][MAX_M0_SIZE]; + double result_r_re[3][MAX_M0_SIZE]; // For current operator: x, y, z components double result_r_im[3][MAX_M0_SIZE]; - int num_m0 = 2 * L0 + 1; for (int m0 = 0; m0 < num_m0; m0++) { result_re[m0] = 0.0; @@ -276,54 +345,60 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } } - // Main integration loop - RESTRUCTURED: angular loop outside, radial inside - // ylm0 only depends on angular direction (leb_x,y,z), not radial distance - // This saves 140x redundant ylm0 computations per angular point + //------------------------------------------------------------------------- + // Main integration loop + // Outer: angular points (parallelized across threads) + // Inner: radial points (accumulated per thread) + //------------------------------------------------------------------------- + for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) { - double leb_x = d_lebedev_x[ian]; - double leb_y = d_lebedev_y[ian]; - double leb_z = d_lebedev_z[ian]; - double w_ang = d_lebedev_w[ian]; + // Load angular grid point + const double leb_x = d_lebedev_x[ian]; + const double leb_y = d_lebedev_y[ian]; + const double leb_z = d_lebedev_z[ian]; + const double w_ang = d_lebedev_w[ian]; - // Precompute ylm0 ONCE per angular point (doesn't depend on r_val) + // Precompute Y_lm for projector (independent of radial distance) double ylm0[MAX_YLM_SIZE]; DISPATCH_YLM(L0, leb_x, leb_y, leb_z, ylm0); - int offset_L0 = L0 * L0; + const int offset_L0 = L0 * L0; - // Precompute A dot direction (doesn't depend on r_val) - double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + // Precompute A · direction (for phase factor) + const double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; - // Precompute dR components - double dRx = R0.x - R1.x; - double dRy = R0.y - R1.y; - double dRz = R0.z - R1.z; + // Vector from R1 to R0 (for computing distance to orbital center) + const double dRx = R0.x - R1.x; + const double dRy = R0.y - R1.y; + const double dRz = R0.z - R1.z; + // Radial integration #pragma unroll 4 for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) { - double r_val = xmean + xl * d_gl_x[ir]; - double w_rad = xl * d_gl_w[ir]; + // Transform Gauss-Legendre point from [-1,1] to [r_min, r_max] + const double r_val = xmean + xl * d_gl_x[ir]; + const double w_rad = xl * d_gl_w[ir]; - // Local position relative to R0 - double rx = r_val * leb_x; - double ry = r_val * leb_y; - double rz = r_val * leb_z; + // Integration point position relative to R0 + const double rx = r_val * leb_x; + const double ry = r_val * leb_y; + const double rz = r_val * leb_z; // Vector from R1 to integration point - double tx = rx + dRx; - double ty = ry + dRy; - double tz = rz + dRz; - double tnorm = sqrt(tx * tx + ty * ty + tz * tz); + const double tx = rx + dRx; + const double ty = ry + dRy; + const double tz = rz + dRz; + const double tnorm = sqrt(tx * tx + ty * ty + tz * tz); - // Check psi cutoff + // Check if within orbital cutoff if (tnorm <= r1_max) { - // Compute Y_L1 (depends on direction to R1, varies with r_val) + // Compute Y_lm for orbital (depends on direction from R1) double ylm1[MAX_YLM_SIZE]; if (tnorm > 1e-10) { - double inv_tnorm = 1.0 / tnorm; + const double inv_tnorm = 1.0 / tnorm; DISPATCH_YLM(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); } else @@ -331,39 +406,40 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, DISPATCH_YLM(L1, 0.0, 0.0, 1.0, ylm1); } - // Interpolate psi - double psi_val + // Interpolate orbital 
radial function + const double psi_val = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); - // Phase factor - double phase = r_val * A_dot_leb; - cuDoubleComplex exp_iAr = cu_exp_i(phase); - - // Interpolate beta - double beta_val + // Interpolate projector radial function + const double beta_val = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); - // Y_L1m1 - double ylm_L1_val = ylm1[L1 * L1 + m1]; + // Phase factor exp(i * A · r) + const double phase = r_val * A_dot_leb; + const cuDoubleComplex exp_iAr = cu_exp_i(phase); + + // Orbital Y_lm value + const double ylm_L1_val = ylm1[L1 * L1 + m1]; - // Combined factor - double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; - cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); + // Combined integration factor: Y_L1m1 * psi * beta * r * dr * dOmega + const double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; + const cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); -// Accumulate for all m0 + // Accumulate for all m0 components of projector #pragma unroll for (int m0 = 0; m0 < num_m0; m0++) { - double ylm0_val = ylm0[offset_L0 + m0]; + const double ylm0_val = ylm0[offset_L0 + m0]; result_re[m0] += common_factor.x * ylm0_val; result_im[m0] += common_factor.y * ylm0_val; + // Current operator contribution (if requested) if (nlm_dim == 4) { - double r_op_x = rx + R0.x; - double r_op_y = ry + R0.y; - double r_op_z = rz + R0.z; + const double r_op_x = rx + R0.x; + const double r_op_y = ry + R0.y; + const double r_op_z = rz + R0.z; result_r_re[0][m0] += common_factor.x * ylm0_val * r_op_x; result_r_im[0][m0] += common_factor.y * ylm0_val * r_op_x; @@ -374,13 +450,17 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } } } - } - } + } // End radial loop + } // End angular loop + + //------------------------------------------------------------------------- + // Parallel reduction and output + // Uses warp shuffle for efficiency, followed by cross-warp reduction + //------------------------------------------------------------------------- - // Reduction and output using warp shuffle - more efficient than shared memory - int out_base = norb_idx * nlm_dim * natomwfc; - int warp_id = tid / 32; - int lane_id = tid % 32; + const int out_base = norb_idx * nlm_dim * natomwfc; + const int warp_id = tid / 32; + const int lane_id = tid % 32; for (int m0 = 0; m0 < num_m0; m0++) { @@ -396,7 +476,7 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } __syncthreads(); - // Step 3: First warp reduces across all warps + // Step 3: First warp reduces across all warps and writes output if (warp_id == 0) { sum_re = (lane_id < NUM_WARPS) ? s_temp_re[lane_id] : 0.0; @@ -414,15 +494,14 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } __syncthreads(); + // Process current operator components (if nlm_dim == 4) if (nlm_dim == 4) { for (int d = 0; d < 3; d++) { - // Step 1: Warp-level reduction double sum_r_re = warp_reduce_sum(result_r_re[d][m0]); double sum_r_im = warp_reduce_sum(result_r_im[d][m0]); - // Step 2: First lane writes to shared memory if (lane_id == 0) { s_temp_re[warp_id] = sum_r_re; @@ -430,7 +509,6 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } __syncthreads(); - // Step 3: First warp reduces across warps if (warp_id == 0) { sum_r_re = (lane_id < NUM_WARPS) ? 
s_temp_re[lane_id] : 0.0; @@ -453,9 +531,12 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } //============================================================================= -// Host-side Helper Function Implementations +// Host-side Helper Functions //============================================================================= +/** + * @brief CUDA error checking macro for kernel-related operations + */ #define CUDA_CHECK_KERNEL(call) \ do \ { \ @@ -466,9 +547,15 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } \ } while (0) +/** + * @brief Copy integration grids to GPU constant memory + * + * Initializes the constant memory arrays with Lebedev-Laikov angular grid + * and Gauss-Legendre radial grid for use in kernel integration. + */ void copy_grids_to_device() { - // Copy Lebedev-Laikov angular grid to constant memory + // Copy Lebedev-Laikov 110-point angular quadrature grid CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_x, ModuleBase::Integral::Lebedev_Laikov_grid110_x, ANGULAR_GRID_NUM * sizeof(double))); @@ -482,7 +569,7 @@ void copy_grids_to_device() ModuleBase::Integral::Lebedev_Laikov_grid110_w, ANGULAR_GRID_NUM * sizeof(double))); - // Prepare and copy Gauss-Legendre radial grid to constant memory + // Compute and copy Gauss-Legendre radial quadrature grid std::vector h_gl_x(RADIAL_GRID_NUM); std::vector h_gl_w(RADIAL_GRID_NUM); ModuleBase::Integral::Gauss_Legendre_grid_and_weight(RADIAL_GRID_NUM, h_gl_x.data(), h_gl_w.data()); diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index 8c00e4cc5a..42280b1498 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -1,3 +1,19 @@ +/** + * @file snap_psibeta_kernel.cuh + * @brief CUDA kernel declarations for computing overlap integrals + * + * This file provides GPU-accelerated computation of overlap integrals between + * atomic orbitals (psi) and non-local projectors (beta) for real-time TDDFT + * calculations. The implementation uses numerical integration on a combined + * radial (Gauss-Legendre) and angular (Lebedev-Laikov) grid. 
+ * + * Key Features: + * - Atom-level batching: processes all neighbors for a center atom in single kernel + * - Templated spherical harmonics for compile-time optimization + * - Efficient memory access via constant memory for integration grids + * - Warp-level reduction for high-performance summation + */ + #ifndef SNAP_PSIBETA_KERNEL_CUH #define SNAP_PSIBETA_KERNEL_CUH @@ -9,44 +25,36 @@ namespace module_rt namespace gpu { -// Grid constants +//============================================================================= +// Configuration Constants +//============================================================================= + +/// Number of points in radial Gauss-Legendre grid constexpr int RADIAL_GRID_NUM = 140; + +/// Number of points in angular Lebedev-Laikov grid (110-point rule) constexpr int ANGULAR_GRID_NUM = 110; -constexpr int BLOCK_SIZE = 128; // >= ANGULAR_GRID_NUM for efficient reduction -constexpr int MAX_L = 4; // Maximum angular momentum supported -constexpr int MAX_YLM_SIZE = (MAX_L + 1) * (MAX_L + 1); // = 25 -constexpr int MAX_M0_SIZE = 2 * MAX_L + 1; // = 9 -//============================================================================= -// Device Helper Functions -//============================================================================= +/// Thread block size for kernel execution +constexpr int BLOCK_SIZE = 128; -/** - * @brief GPU version of cubic interpolation for radial functions - */ -__device__ __forceinline__ double interpolate_radial_gpu(const double* __restrict__ psi, - int mesh, - double inv_dk, - double distance) -{ - double position = distance * inv_dk; - int iq = __double2int_rd(position); // floor +/// Maximum supported angular momentum quantum number L +constexpr int MAX_L = 4; - if (iq > mesh - 4) - return 0.0; - if (iq < 0) - return 0.0; +/// Size of spherical harmonics array: (MAX_L + 1)^2 = 25 +constexpr int MAX_YLM_SIZE = (MAX_L + 1) * (MAX_L + 1); - double x0 = position - static_cast(iq); - double x1 = 1.0 - x0; - double x2 = 2.0 - x0; - double x3 = 3.0 - x0; +/// Maximum number of magnetic quantum numbers for a single L: 2*MAX_L + 1 = 9 +constexpr int MAX_M0_SIZE = 2 * MAX_L + 1; - return x1 * x2 * (psi[iq] * x3 + psi[iq + 3] * x0) / 6.0 + x0 * x3 * (psi[iq + 1] * x2 - psi[iq + 2] * x1) / 2.0; -} +//============================================================================= +// Device Helper Functions - Complex Arithmetic +//============================================================================= /** * @brief Compute exp(i * theta) = cos(theta) + i * sin(theta) + * @param theta Phase angle in radians + * @return Complex exponential as cuDoubleComplex */ __device__ __forceinline__ cuDoubleComplex cu_exp_i(double theta) { @@ -56,7 +64,7 @@ __device__ __forceinline__ cuDoubleComplex cu_exp_i(double theta) } /** - * @brief Complex multiplication + * @brief Complex multiplication: a * b */ __device__ __forceinline__ cuDoubleComplex cu_mul(cuDoubleComplex a, cuDoubleComplex b) { @@ -64,7 +72,7 @@ __device__ __forceinline__ cuDoubleComplex cu_mul(cuDoubleComplex a, cuDoubleCom } /** - * @brief Complex addition + * @brief Complex addition: a + b */ __device__ __forceinline__ cuDoubleComplex cu_add(cuDoubleComplex a, cuDoubleComplex b) { @@ -72,7 +80,7 @@ __device__ __forceinline__ cuDoubleComplex cu_add(cuDoubleComplex a, cuDoubleCom } /** - * @brief Complex conjugate + * @brief Complex conjugate: conj(a) */ __device__ __forceinline__ cuDoubleComplex cu_conj(cuDoubleComplex a) { @@ -80,27 +88,83 @@ __device__ __forceinline__ 
cuDoubleComplex cu_conj(cuDoubleComplex a) } /** - * @brief Complex * real + * @brief Complex times real: a * r */ __device__ __forceinline__ cuDoubleComplex cu_mul_real(cuDoubleComplex a, double r) { return make_cuDoubleComplex(a.x * r, a.y * r); } +//============================================================================= +// Device Helper Functions - Radial Interpolation +//============================================================================= + /** - * @brief Compute real spherical harmonics Y_lm at given direction (x, y, z) - * TEMPLATED VERSION - L is compile-time constant for optimization + * @brief Cubic spline interpolation for radial functions + * + * Implements cubic polynomial interpolation using 4 consecutive grid points. + * This is the GPU equivalent of CPU-side PolyInt::Polynomial_Interpolation. + * + * @param psi Radial function values on uniform grid + * @param mesh Number of grid points + * @param inv_dk Inverse of grid spacing (1/dk) + * @param distance Radial distance r at which to interpolate + * @return Interpolated function value + */ +__device__ __forceinline__ double interpolate_radial_gpu(const double* __restrict__ psi, + int mesh, + double inv_dk, + double distance) +{ + double position = distance * inv_dk; + int iq = __double2int_rd(position); // floor(position) + + // Boundary checks + if (iq > mesh - 4 || iq < 0) + { + return 0.0; + } + + // Lagrange interpolation weights + double x0 = position - static_cast(iq); + double x1 = 1.0 - x0; + double x2 = 2.0 - x0; + double x3 = 3.0 - x0; + + // 4-point Lagrange interpolation formula + return x1 * x2 * (psi[iq] * x3 + psi[iq + 3] * x0) / 6.0 + x0 * x3 * (psi[iq + 1] * x2 - psi[iq + 2] * x1) / 2.0; +} + +//============================================================================= +// Device Helper Functions - Spherical Harmonics +//============================================================================= + +/** + * @brief Compute real spherical harmonics Y_lm at a given direction + * + * TEMPLATED VERSION: L is a compile-time constant enabling loop unrolling + * and register allocation optimizations by the compiler. + * + * Uses the recursive computation of associated Legendre polynomials + * followed by transformation to real spherical harmonics with proper + * normalization (same as ModuleBase::Ylm). * * @tparam L Maximum angular momentum (0 <= L <= MAX_L) - * @param x, y, z Direction (should be normalized) - * @param ylm Output array of size (L+1)^2 + * @param x, y, z Direction vector components (need not be normalized, normalization is done internally) + * @param ylm Output array of size (L+1)^2, indexed as ylm[l*l + l + m] */ template __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm); /** * @brief Runtime dispatch macro for templated compute_ylm_gpu - * Converts runtime L to compile-time template call + * + * Converts a runtime L value to the appropriate compile-time template + * instantiation for optimal performance. 
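+ *
+ * Typical use, as in the integration kernel (illustrative):
+ * @code
+ *   double ylm0[MAX_YLM_SIZE];
+ *   DISPATCH_YLM(L0, leb_x, leb_y, leb_z, ylm0);
+ * @endcode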
+ * + * @param L_val Runtime angular momentum value + * @param x, y, z Direction vector components + * @param ylm Output array for spherical harmonics */ #define DISPATCH_YLM(L_val, x, y, z, ylm) \ do \ @@ -129,68 +193,83 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm); } while (0) //============================================================================= -// Kernel Input/Output Structures +// Data Structures for Kernel Input //============================================================================= /** - * @brief Information about a single projector + * @brief Non-local projector (beta function) information + * + * Contains all data needed to evaluate a single projector during integration. */ struct ProjectorData { - int L0; // Angular momentum - int beta_offset; // Offset in flattened beta array - int beta_mesh; // Number of mesh points - double beta_dk; // Grid spacing - double beta_rcut; // Cutoff radius - double r_min; // Minimum radial value - double r_max; // Maximum radial value + int L0; ///< Angular momentum quantum number + int beta_offset; ///< Offset into flattened beta radial array + int beta_mesh; ///< Number of radial mesh points + double beta_dk; ///< Radial grid spacing + double beta_rcut; ///< Cutoff radius for projector + double r_min; ///< Minimum radial grid value (integration start) + double r_max; ///< Maximum radial grid value (integration end) }; /** - * @brief Information about a neighbor-orbital pair for atom-level batching - * Used to batch ALL neighbors for a center atom in a single kernel launch + * @brief Neighbor atom orbital information for atom-level batching + * + * Each structure represents one (neighbor_atom, orbital) pair that contributes + * to the overlap integral. This enables processing ALL neighbors for a center + * atom in a single kernel launch, minimizing launch overhead. 
*/ struct NeighborOrbitalData { - int neighbor_idx; // Which neighbor (ad index) this orbital belongs to - double3 R1; // Neighbor atom position (tau1 * lat0) - - // Orbital info - int L1; // Angular momentum - int m1; // Magnetic quantum number - int N1; // Radial quantum number - int iw_index; // Original orbital index for output mapping - int psi_offset; // Offset in flattened psi array - int psi_mesh; // Number of mesh points - double psi_dk; // Grid spacing - double psi_rcut; // Cutoff radius + int neighbor_idx; ///< Index of neighbor atom (ad index in adjacency list) + double3 R1; ///< Neighbor atom position in Cartesian coordinates (tau * lat0) + + // Orbital information + int L1; ///< Angular momentum of orbital + int m1; ///< Magnetic quantum number of orbital + int N1; ///< Radial quantum number of orbital + int iw_index; ///< Global orbital index for output mapping + int psi_offset; ///< Offset into flattened psi radial array + int psi_mesh; ///< Number of radial mesh points for orbital + double psi_dk; ///< Radial grid spacing for orbital + double psi_rcut; ///< Cutoff radius for orbital }; //============================================================================= -// Main Kernels +// Main CUDA Kernel Declaration //============================================================================= /** - * @brief Atom-level batch kernel - processes ALL neighbors for a center atom + * @brief Atom-level batch kernel for overlap computation * - * Grid: (total_neighbor_orbitals, nproj, 1) - * Block: (BLOCK_SIZE, 1, 1) + * This kernel processes ALL neighbor orbitals for a single center atom in one + * launch, significantly reducing kernel launch overhead. Each thread block + * handles the integration for one (neighbor_orbital, projector) pair. * - * This kernel processes all orbitals from all neighbors in a single launch, - * reducing kernel launch overhead significantly. 
+ * Grid Configuration: + * - gridDim.x = total_neighbor_orbitals (all orbitals from all neighbors) + * - gridDim.y = nproj (number of projectors on center atom) * - * @param R0 Center atom position (projector location) - * @param A Vector potential - * @param neighbor_orbitals Array of neighbor-orbital pairs [total_neighbor_orbitals] - * @param projectors Array of projector data [nproj] - * @param psi_radial Flattened orbital radial functions - * @param beta_radial Flattened projector radial functions - * @param proj_m0_offset Offset for each projector's m=0 in output + * Block Configuration: + * - blockDim.x = BLOCK_SIZE threads for parallel integration + * + * Integration Strategy: + * - Angular loop (outer): each thread processes different angular points + * - Radial loop (inner): each thread accumulates over all radial points + * - Warp shuffle reduction for efficient summation + * + * @param R0 Center atom position (projector location) + * @param A Vector potential for phase factor + * @param neighbor_orbitals Array of neighbor-orbital data [total_neighbor_orbitals] + * @param projectors Array of projector data [nproj] + * @param psi_radial Flattened array of orbital radial functions + * @param beta_radial Flattened array of projector radial functions + * @param proj_m0_offset Starting index of each projector's m=0 component in output * @param total_neighbor_orbitals Total number of (neighbor, orbital) pairs - * @param nproj Number of projectors - * @param natomwfc Total projector components (sum of 2*L0+1) - * @param nlm_dim 1 for no current, 4 for current - * @param nlm_out Output [total_neighbor_orbitals * nlm_dim * natomwfc] + * @param nproj Number of projectors on center atom + * @param natomwfc Total projector components: sum of (2*L0+1) for all projectors + * @param nlm_dim Output dimension: 1 for overlap only, 4 for overlap + current + * @param nlm_out Output array [total_neighbor_orbitals * nlm_dim * natomwfc] */ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double3 A, @@ -206,12 +285,17 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, cuDoubleComplex* __restrict__ nlm_out); //============================================================================= -// Host-side Helper Functions +// Host-side Initialization //============================================================================= /** - * @brief Copy integration grids (Lebedev and Gauss-Legendre) to constant memory - * Should be called once before kernel execution in each calculate_HR call + * @brief Copy integration grids to GPU constant memory + * + * Copies the Lebedev-Laikov angular grid (110 points) and Gauss-Legendre + * radial grid (140 points) to CUDA constant memory for fast access during + * kernel execution. + * + * @note Must be called once before any kernel launches in a calculation session. 
*/ void copy_grids_to_device(); From 8f96081f9f921e0e1a04a4dea2d780f74065d8a5 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 26 Dec 2025 13:40:08 +0800 Subject: [PATCH 10/13] refactor(td_nonlocal_lcao): use runtime check for GPU/CPU branch selection - Add use_gpu runtime flag that checks both __CUDA macro and PARAM.inp.device - GPU path is now only enabled when __CUDA is defined AND device == "gpu" - Makes the conditional logic clearer with if/else instead of nested #ifdef --- .../module_operator_lcao/td_nonlocal_lcao.cpp | 118 ++++++++++-------- 1 file changed, 67 insertions(+), 51 deletions(-) diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp index 63f4590056..6f39ad039f 100644 --- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp @@ -132,15 +132,26 @@ void hamilt::TDNonlocal>::calculate_HR() ModuleBase::TITLE("TDNonlocal", "calculate_HR"); ModuleBase::timer::tick("TDNonlocal", "calculate_HR"); + // Determine whether to use GPU path: + // GPU is only used when both __CUDA is defined AND device is set to "gpu" #ifdef __CUDA - // Initialize GPU resources before entering the computation loop - // Use set_device_by_rank for multi-GPU support - int dev_id = 0; + const bool use_gpu = (PARAM.inp.device == "gpu"); +#else + const bool use_gpu = false; +#endif + + // Initialize GPU resources if using GPU + if (use_gpu) + { +#ifdef __CUDA + // Use set_device_by_rank for multi-GPU support + int dev_id = 0; #ifdef __MPI - dev_id = base_device::information::set_device_by_rank(MPI_COMM_WORLD); + dev_id = base_device::information::set_device_by_rank(MPI_COMM_WORLD); #endif - module_rt::gpu::initialize_gpu_resources(); + module_rt::gpu::initialize_gpu_resources(); #endif + } const Parallel_Orbitals* paraV = this->hR_tmp->get_atom_pair(0).get_paraV(); const int npol = this->ucell->get_npol(); @@ -160,59 +171,64 @@ void hamilt::TDNonlocal>::calculate_HR() nlm_tot[i].resize(nlm_dim); } + if (use_gpu) + { #ifdef __CUDA - // GPU path: Atom-level GPU batch processing - module_rt::gpu::snap_psibeta_atom_batch_gpu(orb_, - this->ucell->infoNL, - T0, - tau0 * this->ucell->lat0, - cart_At, - adjs, - this->ucell, - paraV, - npol, - nlm_dim, - nlm_tot); -#else - // CPU path: OpenMP parallel over neighbors to compute nlm_tot -#pragma omp parallel for schedule(dynamic) - for (int ad = 0; ad < adjs.adj_num + 1; ++ad) + // GPU path: Atom-level GPU batch processing + module_rt::gpu::snap_psibeta_atom_batch_gpu(orb_, + this->ucell->infoNL, + T0, + tau0 * this->ucell->lat0, + cart_At, + adjs, + this->ucell, + paraV, + npol, + nlm_dim, + nlm_tot); +#endif + } + else { - const int T1 = adjs.ntype[ad]; - const int I1 = adjs.natom[ad]; - const int iat1 = ucell->itia2iat(T1, I1); - const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; - const Atom* atom1 = &ucell->atoms[T1]; - auto all_indexes = paraV->get_indexes_row(iat1); - auto col_indexes = paraV->get_indexes_col(iat1); - all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); - std::sort(all_indexes.begin(), all_indexes.end()); - all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); - - // CPU path: loop over orbitals - for (size_t iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) + // CPU path: OpenMP parallel over neighbors to compute nlm_tot +#pragma omp parallel for schedule(dynamic) + for (int ad = 0; ad < adjs.adj_num + 1; ++ad) { - 
const int iw1 = all_indexes[iw1l] / npol; - std::vector>> nlm; - module_rt::snap_psibeta_half_tddft(orb_, - this->ucell->infoNL, - nlm, - tau1 * this->ucell->lat0, - T1, - atom1->iw2l[iw1], - atom1->iw2m[iw1], - atom1->iw2n[iw1], - tau0 * this->ucell->lat0, - T0, - cart_At, - TD_info::out_current); - for (int dir = 0; dir < nlm_dim; dir++) + const int T1 = adjs.ntype[ad]; + const int I1 = adjs.natom[ad]; + const int iat1 = ucell->itia2iat(T1, I1); + const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; + const Atom* atom1 = &ucell->atoms[T1]; + auto all_indexes = paraV->get_indexes_row(iat1); + auto col_indexes = paraV->get_indexes_col(iat1); + all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); + std::sort(all_indexes.begin(), all_indexes.end()); + all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); + + // CPU path: loop over orbitals + for (size_t iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { - nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + const int iw1 = all_indexes[iw1l] / npol; + std::vector>> nlm; + module_rt::snap_psibeta_half_tddft(orb_, + this->ucell->infoNL, + nlm, + tau1 * this->ucell->lat0, + T1, + atom1->iw2l[iw1], + atom1->iw2m[iw1], + atom1->iw2n[iw1], + tau0 * this->ucell->lat0, + T0, + cart_At, + TD_info::out_current); + for (int dir = 0; dir < nlm_dim; dir++) + { + nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + } } } } -#endif // 2. calculate D for each pair of atoms // This runs for BOTH GPU and CPU paths From 97d8611e47ee30fe53b2e8a8af6884bf6d5798b6 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 26 Dec 2025 14:05:06 +0800 Subject: [PATCH 11/13] refactor(snap_psibeta): unify CUDA error checking macros - Move CUDA_CHECK macro to shared header snap_psibeta_kernel.cuh - Remove duplicate CUDA_CHECK definition from snap_psibeta_gpu.cu - Remove CUDA_CHECK_KERNEL macro and replace all usages with CUDA_CHECK - Reduces code duplication and improves consistency --- .../kernels/cuda/snap_psibeta_gpu.cu | 17 -------- .../kernels/cuda/snap_psibeta_kernel.cu | 41 +++++++------------ .../kernels/cuda/snap_psibeta_kernel.cuh | 21 ++++++++++ 3 files changed, 35 insertions(+), 44 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index 026af2d1d0..9cfab0cb87 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -24,23 +24,6 @@ namespace module_rt namespace gpu { -//============================================================================= -// CUDA Error Handling -//============================================================================= - -/** - * @brief CUDA error checking macro with file/line information - */ -#define CUDA_CHECK(call) \ - do \ - { \ - cudaError_t err = (call); \ - if (err != cudaSuccess) \ - { \ - fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ - } \ - } while (0) - //============================================================================= // GPU Resource Management //============================================================================= diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 8d29015add..073a5fbd7d 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ 
b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -534,19 +534,6 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, // Host-side Helper Functions //============================================================================= -/** - * @brief CUDA error checking macro for kernel-related operations - */ -#define CUDA_CHECK_KERNEL(call) \ - do \ - { \ - cudaError_t err = (call); \ - if (err != cudaSuccess) \ - { \ - fprintf(stderr, "[CUDA Kernel] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ - } \ - } while (0) - /** * @brief Copy integration grids to GPU constant memory * @@ -556,26 +543,26 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, void copy_grids_to_device() { // Copy Lebedev-Laikov 110-point angular quadrature grid - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_x, - ModuleBase::Integral::Lebedev_Laikov_grid110_x, - ANGULAR_GRID_NUM * sizeof(double))); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_y, - ModuleBase::Integral::Lebedev_Laikov_grid110_y, - ANGULAR_GRID_NUM * sizeof(double))); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_z, - ModuleBase::Integral::Lebedev_Laikov_grid110_z, - ANGULAR_GRID_NUM * sizeof(double))); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_w, - ModuleBase::Integral::Lebedev_Laikov_grid110_w, - ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lebedev_x, + ModuleBase::Integral::Lebedev_Laikov_grid110_x, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lebedev_y, + ModuleBase::Integral::Lebedev_Laikov_grid110_y, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lebedev_z, + ModuleBase::Integral::Lebedev_Laikov_grid110_z, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lebedev_w, + ModuleBase::Integral::Lebedev_Laikov_grid110_w, + ANGULAR_GRID_NUM * sizeof(double))); // Compute and copy Gauss-Legendre radial quadrature grid std::vector h_gl_x(RADIAL_GRID_NUM); std::vector h_gl_w(RADIAL_GRID_NUM); ModuleBase::Integral::Gauss_Legendre_grid_and_weight(RADIAL_GRID_NUM, h_gl_x.data(), h_gl_w.data()); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_gl_x, h_gl_x.data(), RADIAL_GRID_NUM * sizeof(double))); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_gl_w, h_gl_w.data(), RADIAL_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_gl_x, h_gl_x.data(), RADIAL_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_gl_w, h_gl_w.data(), RADIAL_GRID_NUM * sizeof(double))); } } // namespace gpu diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index 42280b1498..02537c477e 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -17,9 +17,30 @@ #ifndef SNAP_PSIBETA_KERNEL_CUH #define SNAP_PSIBETA_KERNEL_CUH +#include #include #include +//============================================================================= +// CUDA Error Checking Macro +//============================================================================= + +/** + * @brief CUDA error checking macro with file/line information + * + * Checks the return value of CUDA API calls and prints an error message + * with file and line information if the call fails. 
+ */ +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) \ + { \ + fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + } \ + } while (0) + namespace module_rt { namespace gpu From b0f94276c10e7d95dfe6f5643e0fdd20f7b7c5eb Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 26 Dec 2025 14:07:26 +0800 Subject: [PATCH 12/13] refactor(snap_psibeta_kernel): use ModuleBase constants - Replace local PI, FOUR_PI, SQRT2 definitions with ModuleBase:: versions - Add include for source_base/constants.h --- .../module_rt/kernels/cuda/snap_psibeta_kernel.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 073a5fbd7d..60ea9ad26b 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -12,6 +12,7 @@ */ #include "snap_psibeta_kernel.cuh" +#include "source_base/constants.h" #include "source_base/math_integral.h" #include @@ -85,14 +86,11 @@ __device__ __forceinline__ double p_get(const double* p, int l, int m) template __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) { - // Mathematical constants - constexpr double PI = 3.14159265358979323846; - constexpr double FOUR_PI = 4.0 * PI; - constexpr double SQRT2 = 1.41421356237309504880; + constexpr int P_SIZE = (L + 1) * (L + 2) / 2; // Lower triangular storage size // Y_00 = 1/(2*sqrt(pi)) - ylm[0] = 0.5 * sqrt(1.0 / PI); + ylm[0] = 0.5 * sqrt(1.0 / ModuleBase::PI); if (L == 0) { @@ -178,7 +176,7 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) #pragma unroll for (int l = 0; l <= L; l++) { - double c = sqrt((2.0 * l + 1.0) / FOUR_PI); + double c = sqrt((2.0 * l + 1.0) / ModuleBase::FOUR_PI); // m = 0 component ylm[lm] = c * p_get(p, l, 0); @@ -195,7 +193,7 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) { factorial_ratio *= i; } - double norm = c * sqrt(1.0 / factorial_ratio) * SQRT2; + double norm = c * sqrt(1.0 / factorial_ratio) * ModuleBase::SQRT2; double sin_mphi, cos_mphi; sincos(m * phi, &sin_mphi, &cos_mphi); From 9c60e9f5b1d752ae8a6531af2180be47a599f51f Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 26 Dec 2025 14:15:22 +0800 Subject: [PATCH 13/13] refactor(snap_psibeta): use ModuleBase::WARNING_QUIT for error handling - Replace fprintf(stderr, ...) 
with ModuleBase::WARNING_QUIT - Update CUDA_CHECK macro to use WARNING_QUIT instead of fprintf - Add includes for tool_quit.h and string header - Consistent error handling with ABACUS codebase conventions --- .../module_rt/kernels/cuda/snap_psibeta_gpu.cu | 16 +++++++--------- .../kernels/cuda/snap_psibeta_kernel.cuh | 11 ++++++++--- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index 9cfab0cb87..cb381ece78 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -14,9 +14,11 @@ #include "../snap_psibeta_gpu.h" #include "snap_psibeta_kernel.cuh" #include "source_base/timer.h" +#include "source_base/tool_quit.h" #include #include +#include #include namespace module_rt @@ -44,8 +46,7 @@ void initialize_gpu_resources() cudaError_t err = cudaGetDeviceCount(&device_count); if (err != cudaSuccess || device_count == 0) { - fprintf(stderr, "[CUDA] No CUDA devices found or error getting device count!\n"); - return; + ModuleBase::WARNING_QUIT("snap_psibeta_gpu", "No CUDA devices found or error getting device count!"); } // Initialize integration grids in constant memory @@ -141,10 +142,8 @@ void snap_psibeta_atom_batch_gpu( // Validate angular momentum if (L0 > MAX_L) { - fprintf(stderr, - "[CUDA] Error: L0=%d exceeds MAX_L=%d, GPU kernel may produce incorrect results\n", - L0, - MAX_L); + ModuleBase::WARNING_QUIT("snap_psibeta_gpu", + "L0=" + std::to_string(L0) + " exceeds MAX_L=" + std::to_string(MAX_L)); } natomwfc += 2 * L0 + 1; } @@ -317,15 +316,14 @@ void snap_psibeta_atom_batch_gpu( cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - fprintf(stderr, "[CUDA] Atom batch kernel launch error: %s\n", cudaGetErrorString(err)); cudaFree(neighbor_orbitals_d); cudaFree(projectors_d); cudaFree(psi_radial_d); cudaFree(beta_radial_d); cudaFree(proj_m0_offset_d); cudaFree(nlm_out_d); - ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); - return; + ModuleBase::WARNING_QUIT("snap_psibeta_gpu", + std::string("Atom batch kernel launch error: ") + cudaGetErrorString(err)); } CUDA_CHECK(cudaDeviceSynchronize()); diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index 02537c477e..157224ea57 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -17,9 +17,12 @@ #ifndef SNAP_PSIBETA_KERNEL_CUH #define SNAP_PSIBETA_KERNEL_CUH +#include "source_base/tool_quit.h" + #include #include #include +#include //============================================================================= // CUDA Error Checking Macro @@ -28,8 +31,8 @@ /** * @brief CUDA error checking macro with file/line information * - * Checks the return value of CUDA API calls and prints an error message - * with file and line information if the call fails. + * Checks the return value of CUDA API calls and calls WARNING_QUIT + * with error information if the call fails. 
*/ #define CUDA_CHECK(call) \ do \ @@ -37,7 +40,9 @@ cudaError_t err = (call); \ if (err != cudaSuccess) \ { \ - fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + ModuleBase::WARNING_QUIT("CUDA_CHECK", \ + std::string("Error at ") + __FILE__ + ":" + std::to_string(__LINE__) + " - " \ + + cudaGetErrorString(err)); \ } \ } while (0)
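
For reference, the block-wide reduction used in snap_psibeta_atom_batch_kernel (a warp-shuffle
reduction followed by a cross-warp pass through shared memory) can be sketched in isolation as
below. This is only an illustrative standalone version: the helper names (warp_sum,
block_reduce_sum, BLOCK, NWARPS) are ours and are not part of the patch series, which keeps the
reduction inlined in the kernel.

    // Minimal sketch of the two-stage reduction pattern:
    //   step 1: shuffle reduction inside each 32-thread warp,
    //   step 2: lane 0 of each warp publishes its partial sum to shared memory,
    //   step 3: the first warp reduces the per-warp partial sums.
    #include <cuda_runtime.h>

    constexpr int BLOCK = 128;          // multiple of the 32-thread warp size
    constexpr int NWARPS = BLOCK / 32;

    __device__ __forceinline__ double warp_sum(double v)
    {
        for (int offset = 16; offset > 0; offset >>= 1)
        {
            v += __shfl_down_sync(0xffffffff, v, offset);
        }
        return v;                       // sum is valid in lane 0 of the warp
    }

    __device__ double block_reduce_sum(double v)
    {
        __shared__ double s[NWARPS];
        const int lane = threadIdx.x % 32;
        const int warp = threadIdx.x / 32;

        v = warp_sum(v);                // step 1: intra-warp reduction
        if (lane == 0)
        {
            s[warp] = v;                // step 2: publish per-warp partial sums
        }
        __syncthreads();

        if (warp == 0)                  // step 3: first warp combines partials
        {
            v = (lane < NWARPS) ? s[lane] : 0.0;
            v = warp_sum(v);
        }
        return v;                       // total is valid in thread 0 of the block
    }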