From 80fd4710d83c83ede368457aa5aae85e62524658 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 8 Dec 2025 22:59:33 +0800 Subject: [PATCH 01/13] feat(tddft): Add CUDA acceleration for snap_psibeta with constant memory grids - Implement GPU-accelerated snap_psibeta_neighbor_batch_kernel - Use constant memory for Lebedev and Gauss-Legendre integration grids - Add multi-GPU support via set_device_by_rank - Initialize/finalize GPU resources in each calculate_HR call - Remove static global variables for cleaner resource management - CPU fallback when GPU processing fails --- .../module_operator_lcao/td_nonlocal_lcao.cpp | 116 +++-- source/source_lcao/module_rt/CMakeLists.txt | 7 + .../kernels/cuda/snap_psibeta_gpu.cu | 351 +++++++++++++ .../kernels/cuda/snap_psibeta_kernel.cu | 460 ++++++++++++++++++ .../kernels/cuda/snap_psibeta_kernel.cuh | 188 +++++++ .../module_rt/kernels/snap_psibeta_gpu.h | 73 +++ 6 files changed, 1161 insertions(+), 34 deletions(-) create mode 100644 source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu create mode 100644 source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu create mode 100644 source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh create mode 100644 source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp index a5eba57688..cefe006150 100644 --- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp @@ -1,16 +1,21 @@ #include "td_nonlocal_lcao.h" -#include "source_io/module_parameter/parameter.h" #include "source_base/timer.h" #include "source_base/tool_title.h" #include "source_cell/module_neighbor/sltk_grid_driver.h" -#include "source_lcao/module_operator_lcao/operator_lcao.h" +#include "source_io/module_parameter/parameter.h" #include "source_lcao/module_hcontainer/hcontainer_funcs.h" +#include "source_lcao/module_operator_lcao/operator_lcao.h" #include "source_lcao/module_rt/snap_psibeta_half_tddft.h" +#ifdef __CUDA +#include "source_base/module_device/device.h" +#include "source_lcao/module_rt/kernels/snap_psibeta_gpu.h" +#endif + #include "source_pw/module_pwdft/global.h" #ifdef _OPENMP -#include #include +#include #endif template @@ -127,6 +132,16 @@ void hamilt::TDNonlocal>::calculate_HR() ModuleBase::TITLE("TDNonlocal", "calculate_HR"); ModuleBase::timer::tick("TDNonlocal", "calculate_HR"); +#ifdef __CUDA + // Initialize GPU resources before entering the computation loop + // Use set_device_by_rank for multi-GPU support + int dev_id = 0; +#ifdef __MPI + dev_id = base_device::information::set_device_by_rank(MPI_COMM_WORLD); +#endif + module_rt::gpu::initialize_gpu_resources(); +#endif + const Parallel_Orbitals* paraV = this->hR_tmp->get_atom_pair(0).get_paraV(); const int npol = this->ucell->get_npol(); const int nlm_dim = TD_info::out_current ? 
4 : 1; @@ -145,9 +160,12 @@ void hamilt::TDNonlocal>::calculate_HR() nlm_tot[i].resize(nlm_dim); } - #pragma omp parallel +#pragma omp parallel { - #pragma omp for schedule(dynamic) +#ifdef __CUDA + cudaSetDevice(dev_id); +#endif +#pragma omp for schedule(dynamic) for (int ad = 0; ad < adjs.adj_num + 1; ++ad) { const int T1 = adjs.ntype[ad]; @@ -160,31 +178,57 @@ void hamilt::TDNonlocal>::calculate_HR() all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); std::sort(all_indexes.begin(), all_indexes.end()); all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); - for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) + +#ifdef __CUDA + // GPU path: batch process all orbitals for this neighbor + // Use GPU when there are enough orbitals to benefit from parallelism + constexpr int GPU_THRESHOLD = 4; + bool gpu_success = false; + if (all_indexes.size() / npol >= GPU_THRESHOLD) + { + gpu_success = module_rt::gpu::snap_psibeta_neighbor_batch_gpu(orb_, + this->ucell->infoNL, + T1, + tau1 * this->ucell->lat0, + atom1, + all_indexes, + npol, + T0, + tau0 * this->ucell->lat0, + cart_At, + nlm_tot[ad], + TD_info::out_current); + } + if (!gpu_success) +#endif { - const int iw1 = all_indexes[iw1l] / npol; - std::vector>> nlm; - // nlm is a vector of vectors, but size of outer vector is only 1 when out_current is false - // and size of outer vector is 4 when out_current is true (3 for , 1 for - // ) inner loop : all projectors (L0,M0) - - // snap_psibeta_half_tddft() are used to calculate - // and as well if current are needed - module_rt::snap_psibeta_half_tddft(orb_, - this->ucell->infoNL, - nlm, - tau1 * this->ucell->lat0, - T1, - atom1->iw2l[iw1], - atom1->iw2m[iw1], - atom1->iw2n[iw1], - tau0 * this->ucell->lat0, - T0, - cart_At, - TD_info::out_current); - for (int dir = 0; dir < nlm_dim; dir++) + // CPU path: loop over orbitals + for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { - nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + const int iw1 = all_indexes[iw1l] / npol; + std::vector>> nlm; + // nlm is a vector of vectors, but size of outer vector is only 1 when out_current is false + // and size of outer vector is 4 when out_current is true (3 for , 1 + // for ) inner loop : all projectors (L0,M0) + + // snap_psibeta_half_tddft() are used to calculate + // and as well if current are needed + module_rt::snap_psibeta_half_tddft(orb_, + this->ucell->infoNL, + nlm, + tau1 * this->ucell->lat0, + T1, + atom1->iw2l[iw1], + atom1->iw2m[iw1], + atom1->iw2n[iw1], + tau0 * this->ucell->lat0, + T0, + cart_At, + TD_info::out_current); + for (int dir = 0; dir < nlm_dim; dir++) + { + nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + } } } } @@ -205,7 +249,7 @@ void hamilt::TDNonlocal>::calculate_HR() const int thread_id = omp_get_thread_num(); std::set ad_atom_set_thread; int i = 0; - for(const auto iat1 : ad_atom_set) + for (const auto iat1: ad_atom_set) { if (i % num_threads == thread_id) { @@ -228,7 +272,7 @@ void hamilt::TDNonlocal>::calculate_HR() continue; } #endif - + const ModuleBase::Vector3& R_index1 = adjs.box[ad1]; for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2) { @@ -250,8 +294,8 @@ void hamilt::TDNonlocal>::calculate_HR() for (int i = 0; i < 3; i++) { tmp_c[i] = TD_info::td_vel_op->get_current_term_pointer(i) - ->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]) - ->get_pointer(); + ->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]) + ->get_pointer(); } this->cal_HR_IJR(iat1, 
iat2, @@ -279,10 +323,16 @@ void hamilt::TDNonlocal>::calculate_HR() } } +#ifdef __CUDA + // Release GPU resources at end of calculation + module_rt::gpu::finalize_gpu_resources(); +#endif + ModuleBase::timer::tick("TDNonlocal", "calculate_HR"); } // cal_HR_IJR() + template void hamilt::TDNonlocal>::cal_HR_IJR( const int& iat1, @@ -396,7 +446,6 @@ void hamilt::TDNonlocal>::set_HR_fixed(void* hR_tmp this->allocated = false; } - // contributeHR() template void hamilt::TDNonlocal>::contributeHR() @@ -436,7 +485,6 @@ void hamilt::TDNonlocal>::contributeHR() return; } - template void hamilt::TDNonlocal>::contributeHk(int ik) { diff --git a/source/source_lcao/module_rt/CMakeLists.txt b/source/source_lcao/module_rt/CMakeLists.txt index cca79af03a..35a65a5aa7 100644 --- a/source/source_lcao/module_rt/CMakeLists.txt +++ b/source/source_lcao/module_rt/CMakeLists.txt @@ -18,6 +18,13 @@ if(ENABLE_LCAO) boundary_fix.cpp ) + if(USE_CUDA) + list(APPEND objects + kernels/cuda/snap_psibeta_kernel.cu + kernels/cuda/snap_psibeta_gpu.cu + ) + endif() + add_library( tddft OBJECT diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu new file mode 100644 index 0000000000..c9eafe1e71 --- /dev/null +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -0,0 +1,351 @@ +#include "../snap_psibeta_gpu.h" +#include "snap_psibeta_kernel.cuh" + +#include +#include +#include + +namespace module_rt +{ +namespace gpu +{ + +//============================================================================= +// CUDA Error Checking Macro +//============================================================================= + +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) \ + { \ + fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + return false; \ + } \ + } while (0) + +//============================================================================= +// Resource Management - Simplified (using constant memory for grids) +//============================================================================= + +void initialize_gpu_resources() +{ + // Check for CUDA device + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) + { + fprintf(stderr, "[CUDA] No CUDA devices found or error getting device count!\n"); + return; + } + + // Copy integration grids (Lebedev and Gauss-Legendre) to constant memory + copy_grids_to_device(); + + // Sync to ensure all initialization is complete + cudaDeviceSynchronize(); +} + +void finalize_gpu_resources() +{ + // No explicit cleanup needed - constant memory is managed by CUDA runtime + // Just reset any CUDA errors that may have accumulated + cudaGetLastError(); +} + +//============================================================================= +// Main GPU Interface Function +//============================================================================= + +bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, + const InfoNonlocal& infoNL_, + const int T1, + const ModuleBase::Vector3& R1, + const Atom* atom1, + const std::vector& all_indexes, + const int npol, + const int T0, + const ModuleBase::Vector3& R0, + const ModuleBase::Vector3& A, + std::vector>>>& nlm_all, + const bool calc_r) +{ + + // Get projector info + const int nproj = infoNL_.nproj[T0]; + if (nproj == 0) + { + return true; + } + + // Calculate number of orbitals + int 
num_orbitals = all_indexes.size() / npol; + if (num_orbitals == 0) + { + return true; + } + + // Calculate natomwfc (total projector components) + int natomwfc = 0; + std::vector proj_m0_offset_h(nproj); + for (int ip = 0; ip < nproj; ip++) + { + proj_m0_offset_h[ip] = natomwfc; + int L0 = infoNL_.Beta[T0].Proj[ip].getL(); + + // Check L0 against MAX_L + if (L0 > MAX_L) + { + fprintf(stderr, "[CUDA] Warning: L0=%d exceeds MAX_L=%d, falling back to CPU\n", L0, MAX_L); + return false; + } + + natomwfc += 2 * L0 + 1; + } + + int nlm_dim = calc_r ? 4 : 1; + + // Resize output + if (nlm_all.size() != nlm_dim) + { + nlm_all.resize(nlm_dim); + } + + //========================================================================= + // Prepare Host Data + //========================================================================= + + // Orbital data + std::vector orbitals_h(num_orbitals); + + // Calculate total psi size and offsets + std::vector psi_offsets_h(num_orbitals); + int total_psi_size = 0; + + for (int i = 0; i < num_orbitals; i++) + { + int iw1l = i * npol; + int iw1 = all_indexes[iw1l] / npol; + + int L1 = atom1->iw2l[iw1]; + int m1 = atom1->iw2m[iw1]; + int N1 = atom1->iw2n[iw1]; + + // Check L1 against MAX_L + if (L1 > MAX_L) + { + fprintf(stderr, "[CUDA] Warning: L1=%d exceeds MAX_L=%d, falling back to CPU\n", L1, MAX_L); + return false; + } + + const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1); + + orbitals_h[i].L1 = L1; + orbitals_h[i].m1 = m1; + orbitals_h[i].N1 = N1; + orbitals_h[i].iw_index = all_indexes[iw1l]; + orbitals_h[i].psi_offset = total_psi_size; + orbitals_h[i].psi_mesh = phi_ln.getNr(); + orbitals_h[i].psi_dk = phi_ln.getDk(); + orbitals_h[i].psi_rcut = phi_ln.getRcut(); + + psi_offsets_h[i] = total_psi_size; + total_psi_size += phi_ln.getNr(); + } + + // Copy psi radial data + std::vector psi_radial_h(total_psi_size); + for (int i = 0; i < num_orbitals; i++) + { + int iw1l = i * npol; + int iw1 = all_indexes[iw1l] / npol; + + int L1 = atom1->iw2l[iw1]; + int N1 = atom1->iw2n[iw1]; + + const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1); + int mesh = phi_ln.getNr(); + const double* psi = phi_ln.getPsi(); + + for (int j = 0; j < mesh; j++) + { + psi_radial_h[orbitals_h[i].psi_offset + j] = psi[j]; + } + } + + // Projector data + std::vector projectors_h(nproj); + std::vector beta_offsets_h(nproj); + int total_beta_size = 0; + + for (int ip = 0; ip < nproj; ip++) + { + const auto& proj = infoNL_.Beta[T0].Proj[ip]; + + projectors_h[ip].L0 = proj.getL(); + projectors_h[ip].beta_offset = total_beta_size; + projectors_h[ip].beta_mesh = proj.getNr(); + projectors_h[ip].beta_dk = proj.getDk(); + projectors_h[ip].beta_rcut = proj.getRcut(); + + // Get r_min and r_max from radial grid + const double* radial = proj.getRadial(); + int mesh = proj.getNr(); + projectors_h[ip].r_min = radial[0]; + projectors_h[ip].r_max = radial[mesh - 1]; + + beta_offsets_h[ip] = total_beta_size; + total_beta_size += mesh; + } + + // Copy beta radial data + std::vector beta_radial_h(total_beta_size); + for (int ip = 0; ip < nproj; ip++) + { + const auto& proj = infoNL_.Beta[T0].Proj[ip]; + int mesh = proj.getNr(); + const double* beta_r = proj.getBeta_r(); + + for (int j = 0; j < mesh; j++) + { + beta_radial_h[projectors_h[ip].beta_offset + j] = beta_r[j]; + } + } + + //========================================================================= + // Allocate Device Memory + //========================================================================= + + OrbitalData* orbitals_d = nullptr; + ProjectorData* 
projectors_d = nullptr; + double* psi_radial_d = nullptr; + double* beta_radial_d = nullptr; + int* proj_m0_offset_d = nullptr; + cuDoubleComplex* nlm_out_d = nullptr; + + size_t output_size = num_orbitals * nlm_dim * natomwfc; + + CUDA_CHECK(cudaMalloc(&orbitals_d, num_orbitals * sizeof(OrbitalData))); + CUDA_CHECK(cudaMalloc(&projectors_d, nproj * sizeof(ProjectorData))); + CUDA_CHECK(cudaMalloc(&psi_radial_d, total_psi_size * sizeof(double))); + CUDA_CHECK(cudaMalloc(&beta_radial_d, total_beta_size * sizeof(double))); + CUDA_CHECK(cudaMalloc(&proj_m0_offset_d, nproj * sizeof(int))); + CUDA_CHECK(cudaMalloc(&nlm_out_d, output_size * sizeof(cuDoubleComplex))); + + //========================================================================= + // Copy Data to Device + //========================================================================= + + CUDA_CHECK(cudaMemcpy(orbitals_d, orbitals_h.data(), num_orbitals * sizeof(OrbitalData), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(projectors_d, projectors_h.data(), nproj * sizeof(ProjectorData), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(psi_radial_d, psi_radial_h.data(), total_psi_size * sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(beta_radial_d, beta_radial_h.data(), total_beta_size * sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(proj_m0_offset_d, proj_m0_offset_h.data(), nproj * sizeof(int), cudaMemcpyHostToDevice)); + + // Initialize output to zero + CUDA_CHECK(cudaMemset(nlm_out_d, 0, output_size * sizeof(cuDoubleComplex))); + + //========================================================================= + // Launch Kernel + //========================================================================= + + double3 R1_d3 = make_double3(R1.x, R1.y, R1.z); + double3 R0_d3 = make_double3(R0.x, R0.y, R0.z); + double3 A_d3 = make_double3(A.x, A.y, A.z); + + dim3 grid(num_orbitals, nproj, 1); + dim3 block(BLOCK_SIZE, 1, 1); + + snap_psibeta_neighbor_batch_kernel<<>>(R1_d3, + R0_d3, + A_d3, + orbitals_d, + projectors_d, + psi_radial_d, + beta_radial_d, + proj_m0_offset_d, + num_orbitals, + nproj, + natomwfc, + nlm_dim, + nlm_out_d); + + // Check for kernel errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + fprintf(stderr, "[CUDA] Kernel launch error: %s\n", cudaGetErrorString(err)); + // Cleanup and return + cudaFree(orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + return false; + } + + CUDA_CHECK(cudaDeviceSynchronize()); + + // Check for errors after synchronization + err = cudaGetLastError(); + if (err != cudaSuccess) + { + fprintf(stderr, "[CUDA] Kernel execution error: %s\n", cudaGetErrorString(err)); + cudaFree(orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + return false; + } + + //========================================================================= + // Copy Results Back + //========================================================================= + + std::vector nlm_out_h(output_size); + CUDA_CHECK(cudaMemcpy(nlm_out_h.data(), nlm_out_d, output_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost)); + + // Reconstruct output into nlm_all format + for (int i = 0; i < num_orbitals; i++) + { + int iw_index = orbitals_h[i].iw_index; + + // Create nlm vectors for this orbital + std::vector>> nlm(nlm_dim); + for (int d = 0; d < nlm_dim; d++) + { + 
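// ---- [editor's sketch, not part of the patch] -------------------------------
// The device buffer nlm_out is flattened orbital-major, then direction, then
// projector component; the idx computation a few lines below follows this layout.
// A hypothetical host-side helper expressing the same indexing (name
// flat_nlm_index is not in the patch):
static inline size_t flat_nlm_index(int orb, int dir, int comp, int nlm_dim, int natomwfc)
{
    // orbital-major, then direction (1 or 4 of them), then the natomwfc projector components
    return static_cast<size_t>(orb) * nlm_dim * natomwfc
         + static_cast<size_t>(dir) * natomwfc
         + static_cast<size_t>(comp);
}
// e.g. nlm_out_h[flat_nlm_index(i, d, k, nlm_dim, natomwfc)] is the (d, k) entry of orbital i.
// ---- [end of editor's sketch] ------------------------------------------------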
nlm[d].resize(natomwfc); + for (int k = 0; k < natomwfc; k++) + { + size_t idx = i * nlm_dim * natomwfc + d * natomwfc + k; + nlm[d][k] = std::complex(nlm_out_h[idx].x, nlm_out_h[idx].y); + } + } + + // Insert into output maps + for (int dir = 0; dir < nlm_dim; dir++) + { + nlm_all[dir].insert({iw_index, nlm[dir]}); + } + } + + //========================================================================= + // Cleanup + //========================================================================= + + cudaFree(orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + return true; +} + +} // namespace gpu +} // namespace module_rt diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu new file mode 100644 index 0000000000..1e752d6d86 --- /dev/null +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -0,0 +1,460 @@ +#include "snap_psibeta_kernel.cuh" +#include "source_base/math_integral.h" + +#include +#include + +namespace module_rt +{ +namespace gpu +{ + +//============================================================================= +// Constant Memory for Integration Grids +//============================================================================= + +// Lebedev-Laikov angular grid (110 points) +__constant__ double d_lebedev_x[ANGULAR_GRID_NUM]; +__constant__ double d_lebedev_y[ANGULAR_GRID_NUM]; +__constant__ double d_lebedev_z[ANGULAR_GRID_NUM]; +__constant__ double d_lebedev_w[ANGULAR_GRID_NUM]; + +// Gauss-Legendre radial grid (140 points) +__constant__ double d_gl_x[RADIAL_GRID_NUM]; +__constant__ double d_gl_w[RADIAL_GRID_NUM]; + +//============================================================================= +// Spherical Harmonics Implementation +//============================================================================= + +/** + * @brief Compute real spherical harmonics Y_lm + * Based on the recursive formula used in ModuleBase::Ylm + */ +__device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm) +{ + const double PI = 3.14159265358979323846; + const double FOUR_PI = 4.0 * PI; + const double SQRT2 = 1.41421356237309504880; + + // Y_00 + ylm[0] = 0.5 * sqrt(1.0 / PI); + + if (L == 0) + { + return; + } + + // Compute theta and phi + double rxy = sqrt(x * x + y * y); + double r = sqrt(x * x + y * y + z * z); + + double cost, phi; + if (r < 1e-10) + { + cost = 1.0; + phi = 0.0; + } + else + { + cost = z / r; + if (rxy < 1e-10) + { + phi = 0.0; + } + else if (x > 1e-10) + { + phi = atan(y / x); + } + else if (x < -1e-10) + { + phi = atan(y / x) + PI; + } + else + { + phi = (y >= 0.0) ? 
PI / 2.0 : -PI / 2.0; + } + } + + double sint = sqrt(1.0 - cost * cost); + if (sint < 0.0) + { + sint = 0.0; + } + + // Associated Legendre polynomials P_l^m + double p[MAX_L + 1][MAX_L + 1]; + + for (int l = 0; l <= L; l++) + { + for (int m = 0; m <= L; m++) + { + p[l][m] = 0.0; + } + } + + p[0][0] = 1.0; + + if (L >= 1) + { + p[1][0] = cost; + p[1][1] = -sint; + } + + for (int l = 2; l <= L; l++) + { + for (int m = 0; m <= l - 2; m++) + { + p[l][m] = ((2 * l - 1) * cost * p[l - 1][m] - (l - 1 + m) * p[l - 2][m]) / (double)(l - m); + } + p[l][l - 1] = (2 * l - 1) * cost * p[l - 1][l - 1]; + double fact = 1.0; + for (int i = 1; i <= 2 * l - 1; i += 2) + { + fact *= i; + } + double sint_pow = 1.0; + for (int i = 0; i < l; i++) + { + sint_pow *= sint; + } + p[l][l] = fact * sint_pow; + if (l % 2 == 1) + { + p[l][l] = -p[l][l]; + } + } + + // Compute Y_lm + int lm = 0; + for (int l = 0; l <= L; l++) + { + double c = sqrt((2.0 * l + 1.0) / FOUR_PI); + + ylm[lm] = c * p[l][0]; + lm++; + + for (int m = 1; m <= l; m++) + { + double fact_ratio = 1.0; + for (int i = l - m + 1; i <= l + m; i++) + { + fact_ratio *= i; + } + double same = c * sqrt(1.0 / fact_ratio) * SQRT2; + + ylm[lm] = same * p[l][m] * cos(m * phi); + lm++; + + ylm[lm] = same * p[l][m] * sin(m * phi); + lm++; + } + } +} + +//============================================================================= +// Warp Reduction for double +//============================================================================= + +__device__ __forceinline__ double warp_reduce_sum(double val) +{ + for (int offset = 16; offset > 0; offset /= 2) + { + val += __shfl_down_sync(0xffffffff, val, offset); + } + return val; +} + +//============================================================================= +// Main Kernel Implementation - Memory Efficient Version +//============================================================================= + +__global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, + double3 R0, + double3 A, + const OrbitalData* __restrict__ orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, + int num_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out) +{ + + int orb_idx = blockIdx.x; + int proj_idx = blockIdx.y; + int tid = threadIdx.x; + + if (orb_idx >= num_orbitals || proj_idx >= nproj) + { + return; + } + + // Load orbital and projector data + OrbitalData orb = orbitals[orb_idx]; + ProjectorData proj = projectors[proj_idx]; + + int L1 = orb.L1; + int m1 = orb.m1; + int L0 = proj.L0; + int num_m0 = 2 * L0 + 1; + + // Distance check + double3 dRa = make_double3(R0.x - R1.x, R0.y - R1.y, R0.z - R1.z); + double distance = sqrt(dRa.x * dRa.x + dRa.y * dRa.y + dRa.z * dRa.z); + + // Check for NaN + if (isnan(distance)) + { + return; + } + + double Rcut_sum = orb.psi_rcut + proj.beta_rcut; + + if (distance > Rcut_sum) + { + // Zero output + if (tid == 0) + { + int m0_offset = proj_m0_offset[proj_idx]; + int out_base = orb_idx * nlm_dim * natomwfc; + for (int m0 = 0; m0 < num_m0; m0++) + { + for (int d = 0; d < nlm_dim; d++) + { + nlm_out[out_base + d * natomwfc + m0_offset + m0] = make_cuDoubleComplex(0.0, 0.0); + } + } + } + return; + } + + // Reduced shared memory: only for reduction (2 doubles per thread * BLOCK_SIZE) + __shared__ double s_temp_re[BLOCK_SIZE]; + __shared__ double s_temp_im[BLOCK_SIZE]; + + // Radial grid mapping + double xl = (proj.r_max - 
proj.r_min) * 0.5; + double xmean = (proj.r_max + proj.r_min) * 0.5; + + // Precompute A dot R0 + double A_phase_R0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; + cuDoubleComplex exp_iAR0 = cu_exp_i(A_phase_R0); + + // Result accumulators in registers - one per m0 component + double result_re[MAX_YLM_SIZE]; + double result_im[MAX_YLM_SIZE]; + double result_r_re[3][MAX_YLM_SIZE]; + double result_r_im[3][MAX_YLM_SIZE]; + + // Initialize accumulators + for (int m0 = 0; m0 < num_m0; m0++) + { + result_re[m0] = 0.0; + result_im[m0] = 0.0; + if (nlm_dim == 4) + { + for (int d = 0; d < 3; d++) + { + result_r_re[d][m0] = 0.0; + result_r_im[d][m0] = 0.0; + } + } + } + + // Radial loop - using constant memory for Gauss-Legendre grid + for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) + { + double r_val = xmean + xl * d_gl_x[ir]; + double w_rad = xl * d_gl_w[ir]; + + // Angular loop - each thread handles multiple angular points + for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) + { + double leb_x = d_lebedev_x[ian]; + double leb_y = d_lebedev_y[ian]; + double leb_z = d_lebedev_z[ian]; + double w_ang = d_lebedev_w[ian]; + + double rx = r_val * leb_x; + double ry = r_val * leb_y; + double rz = r_val * leb_z; + + double tx = rx + dRa.x; + double ty = ry + dRa.y; + double tz = rz + dRa.z; + double tnorm = sqrt(tx * tx + ty * ty + tz * tz); + + if (tnorm <= orb.psi_rcut) + { + // Compute Y_L1 + double ylm1[MAX_YLM_SIZE]; + if (tnorm > 1e-10) + { + double inv_tnorm = 1.0 / tnorm; + compute_ylm_gpu(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); + } + else + { + compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); + } + + // Compute Y_L0 + double ylm0[MAX_YLM_SIZE]; + compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); + + // Interpolate psi + double psi_val + = interpolate_radial_gpu(psi_radial + orb.psi_offset, orb.psi_mesh, 1.0 / orb.psi_dk, tnorm); + + // Phase factor + double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + double phase = r_val * A_dot_leb; + cuDoubleComplex exp_iAr = cu_exp_i(phase); + + // Interpolate beta + double beta_val + = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); + + // Y_L1m1 + int offset_L1 = L1 * L1 + m1; + double ylm_L1_val = ylm1[offset_L1]; + + // Combined factor + double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; + cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); + + // Accumulate for all m0 + int offset_L0 = L0 * L0; + for (int m0 = 0; m0 < num_m0; m0++) + { + double ylm0_val = ylm0[offset_L0 + m0]; + + result_re[m0] += common_factor.x * ylm0_val; + result_im[m0] += common_factor.y * ylm0_val; + + if (nlm_dim == 4) + { + double r_op_x = rx + R0.x; + double r_op_y = ry + R0.y; + double r_op_z = rz + R0.z; + + result_r_re[0][m0] += common_factor.x * ylm0_val * r_op_x; + result_r_im[0][m0] += common_factor.y * ylm0_val * r_op_x; + result_r_re[1][m0] += common_factor.x * ylm0_val * r_op_y; + result_r_im[1][m0] += common_factor.y * ylm0_val * r_op_y; + result_r_re[2][m0] += common_factor.x * ylm0_val * r_op_z; + result_r_im[2][m0] += common_factor.y * ylm0_val * r_op_z; + } + } + } + } + } + + // Reduce across threads for each m0 + int m0_offset = proj_m0_offset[proj_idx]; + int out_base = orb_idx * nlm_dim * natomwfc; + + for (int m0 = 0; m0 < num_m0; m0++) + { + // Reduce real part + s_temp_re[tid] = result_re[m0]; + s_temp_im[tid] = result_im[m0]; + __syncthreads(); + + // Standard block reduction + for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + { + if (tid < s) + { + 
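// ---- [editor's sketch, not part of the patch] -------------------------------
// The surrounding loop is the standard shared-memory tree reduction. Written as a
// standalone device helper (assuming blockDim.x is a power of two and s_buf points
// to __shared__ storage of blockDim.x doubles; the name block_reduce_sum is hypothetical):
__device__ double block_reduce_sum(double val, double* s_buf)
{
    const int t = threadIdx.x;
    s_buf[t] = val;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (t < stride)
        {
            s_buf[t] += s_buf[t + stride]; // fold the upper half onto the lower half
        }
        __syncthreads();
    }
    return s_buf[0]; // after the final sync every thread sees the block-wide sum
}
// ---- [end of editor's sketch] ------------------------------------------------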
s_temp_re[tid] += s_temp_re[tid + s]; + s_temp_im[tid] += s_temp_im[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) + { + cuDoubleComplex result = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); + result = cu_mul(result, exp_iAR0); + result = cu_conj(result); + nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; + } + __syncthreads(); + + // Position operator terms + if (nlm_dim == 4) + { + for (int d = 0; d < 3; d++) + { + s_temp_re[tid] = result_r_re[d][m0]; + s_temp_im[tid] = result_r_im[d][m0]; + __syncthreads(); + + for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + { + if (tid < s) + { + s_temp_re[tid] += s_temp_re[tid + s]; + s_temp_im[tid] += s_temp_im[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) + { + cuDoubleComplex result_r = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); + result_r = cu_mul(result_r, exp_iAR0); + result_r = cu_conj(result_r); + nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; + } + __syncthreads(); + } + } + } +} + +//============================================================================= +// Host-side Helper Function Implementations +//============================================================================= + +#define CUDA_CHECK_KERNEL(call) \ + do \ + { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) \ + { \ + fprintf(stderr, "[CUDA Kernel] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + } \ + } while (0) + +void copy_grids_to_device() +{ + // Copy Lebedev-Laikov angular grid to constant memory + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_x, + ModuleBase::Integral::Lebedev_Laikov_grid110_x, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_y, + ModuleBase::Integral::Lebedev_Laikov_grid110_y, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_z, + ModuleBase::Integral::Lebedev_Laikov_grid110_z, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_w, + ModuleBase::Integral::Lebedev_Laikov_grid110_w, + ANGULAR_GRID_NUM * sizeof(double))); + + // Prepare and copy Gauss-Legendre radial grid to constant memory + std::vector h_gl_x(RADIAL_GRID_NUM); + std::vector h_gl_w(RADIAL_GRID_NUM); + ModuleBase::Integral::Gauss_Legendre_grid_and_weight(RADIAL_GRID_NUM, h_gl_x.data(), h_gl_w.data()); + + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_gl_x, h_gl_x.data(), RADIAL_GRID_NUM * sizeof(double))); + CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_gl_w, h_gl_w.data(), RADIAL_GRID_NUM * sizeof(double))); +} + +} // namespace gpu +} // namespace module_rt diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh new file mode 100644 index 0000000000..085361e81e --- /dev/null +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -0,0 +1,188 @@ +#ifndef SNAP_PSIBETA_KERNEL_CUH +#define SNAP_PSIBETA_KERNEL_CUH + +#include +#include + +namespace module_rt +{ +namespace gpu +{ + +// Grid constants +constexpr int RADIAL_GRID_NUM = 140; +constexpr int ANGULAR_GRID_NUM = 110; +constexpr int BLOCK_SIZE = 128; // >= ANGULAR_GRID_NUM for efficient reduction +constexpr int MAX_L = 4; // Maximum angular momentum supported +constexpr int MAX_YLM_SIZE = (MAX_L + 1) * (MAX_L + 1); // = 25 + +//============================================================================= +// Device Helper Functions +//============================================================================= + +/** + * @brief 
GPU version of cubic interpolation for radial functions + */ +__device__ __forceinline__ double interpolate_radial_gpu(const double* __restrict__ psi, + int mesh, + double inv_dk, + double distance) +{ + double position = distance * inv_dk; + int iq = __double2int_rd(position); // floor + + if (iq > mesh - 4) + return 0.0; + if (iq < 0) + return 0.0; + + double x0 = position - static_cast(iq); + double x1 = 1.0 - x0; + double x2 = 2.0 - x0; + double x3 = 3.0 - x0; + + return x1 * x2 * (psi[iq] * x3 + psi[iq + 3] * x0) / 6.0 + x0 * x3 * (psi[iq + 1] * x2 - psi[iq + 2] * x1) / 2.0; +} + +/** + * @brief Compute exp(i * theta) = cos(theta) + i * sin(theta) + */ +__device__ __forceinline__ cuDoubleComplex cu_exp_i(double theta) +{ + double s, c; + sincos(theta, &s, &c); + return make_cuDoubleComplex(c, s); +} + +/** + * @brief Complex multiplication + */ +__device__ __forceinline__ cuDoubleComplex cu_mul(cuDoubleComplex a, cuDoubleComplex b) +{ + return make_cuDoubleComplex(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x); +} + +/** + * @brief Complex addition + */ +__device__ __forceinline__ cuDoubleComplex cu_add(cuDoubleComplex a, cuDoubleComplex b) +{ + return make_cuDoubleComplex(a.x + b.x, a.y + b.y); +} + +/** + * @brief Complex conjugate + */ +__device__ __forceinline__ cuDoubleComplex cu_conj(cuDoubleComplex a) +{ + return make_cuDoubleComplex(a.x, -a.y); +} + +/** + * @brief Complex * real + */ +__device__ __forceinline__ cuDoubleComplex cu_mul_real(cuDoubleComplex a, double r) +{ + return make_cuDoubleComplex(a.x * r, a.y * r); +} + +/** + * @brief Compute real spherical harmonics Y_lm at given direction (x, y, z) + * This is a simplified version for small L values + * + * @param L Angular momentum (0 <= L <= MAX_L) + * @param x, y, z Direction (should be normalized) + * @param ylm Output array of size (L+1)^2 + */ +__device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm); + +//============================================================================= +// Kernel Input/Output Structures +//============================================================================= + +/** + * @brief Information about a single orbital for batch processing + */ +struct OrbitalData +{ + int L1; // Angular momentum + int m1; // Magnetic quantum number + int N1; // Radial quantum number + int iw_index; // Original orbital index for output mapping + int psi_offset; // Offset in flattened psi array + int psi_mesh; // Number of mesh points + double psi_dk; // Grid spacing + double psi_rcut; // Cutoff radius +}; + +/** + * @brief Information about a single projector + */ +struct ProjectorData +{ + int L0; // Angular momentum + int beta_offset; // Offset in flattened beta array + int beta_mesh; // Number of mesh points + double beta_dk; // Grid spacing + double beta_rcut; // Cutoff radius + double r_min; // Minimum radial value + double r_max; // Maximum radial value +}; + +//============================================================================= +// Main Kernel +//============================================================================= + +/** + * @brief Main kernel for neighbor-level batch processing + * + * Grid: (num_orbitals, nproj, 1) + * Block: (BLOCK_SIZE, 1, 1) + * + * Each block processes one (orbital, projector) pair. + * Threads within a block parallelize over angular integration points. 
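 *
 * Editor's note (illustrative only, not part of the patch): a typical launch from
 * the host side, using the device-pointer names of the accompanying
 * snap_psibeta_gpu.cu, reads
 *
 *     dim3 grid(num_orbitals, nproj, 1);
 *     dim3 block(BLOCK_SIZE, 1, 1);
 *     snap_psibeta_neighbor_batch_kernel<<<grid, block>>>(R1_d3, R0_d3, A_d3,
 *         orbitals_d, projectors_d, psi_radial_d, beta_radial_d, proj_m0_offset_d,
 *         num_orbitals, nproj, natomwfc, nlm_dim, nlm_out_d);
 *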
+ * + * @param R1 Neighbor atom position + * @param R0 Projector atom position + * @param A Vector potential + * @param orbitals Array of orbital data [num_orbitals] + * @param projectors Array of projector data [nproj] + * @param psi_radial Flattened orbital radial functions + * @param beta_radial Flattened projector radial functions + * @param num_orbitals Number of orbitals in batch + * @param nproj Number of projectors + * @param natomwfc Total number of projector components (sum of 2*L0+1) + * @param calc_r Whether to compute position operator elements + * @param nlm_out Output array [num_orbitals * nlm_dim * natomwfc] + * + * Note: Gauss-Legendre grids (gl_x, gl_w) and Lebedev grids are stored in constant memory + */ +__global__ void snap_psibeta_neighbor_batch_kernel( + double3 R1, + double3 R0, + double3 A, + const OrbitalData* __restrict__ orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, // Offset for each projector in output + int num_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out); + +//============================================================================= +// Host-side Helper Functions +//============================================================================= + +/** + * @brief Copy integration grids (Lebedev and Gauss-Legendre) to constant memory + * Should be called once before kernel execution in each calculate_HR call + */ +void copy_grids_to_device(); + +} // namespace gpu +} // namespace module_rt + +#endif // SNAP_PSIBETA_KERNEL_CUH diff --git a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h new file mode 100644 index 0000000000..ab6f55a617 --- /dev/null +++ b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h @@ -0,0 +1,73 @@ +#ifndef SNAP_PSIBETA_GPU_H +#define SNAP_PSIBETA_GPU_H + +#include "source_base/vector3.h" +#include "source_basis/module_ao/ORB_read.h" +#include "source_cell/setup_nonlocal.h" +#include "source_cell/unitcell.h" + +#include +#include +#include + +#ifdef __CUDA +#include +#endif + +namespace module_rt +{ +namespace gpu +{ + +/** + * @brief Initialize GPU resources (copy grids to constant memory) + * Should be called at the start of each calculate_HR + */ +void initialize_gpu_resources(); + +/** + * @brief Release GPU resources (clear any error states) + * Should be called at the end of each calculate_HR + */ +void finalize_gpu_resources(); + +/** + * @brief Neighbor-level GPU batch processing interface + * + * Computes matrix elements for all orbitals on a neighbor atom with projectors + * Replaces the original orbital loop: + * for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { + * snap_psibeta_half_tddft(...); + * } + * + * @param orb Orbital information + * @param infoNL_ Non-local pseudopotential information + * @param T1 Neighbor atom type + * @param R1 Neighbor atom position (already multiplied by lat0) + * @param atom1 Neighbor atom pointer (for iw2l, iw2m, iw2n) + * @param all_indexes All orbital indices on this neighbor + * @param npol Polarization number + * @param T0 Projector atom type + * @param R0 Projector atom position (already multiplied by lat0) + * @param A Vector potential + * @param nlm_all Output: nlm_all[dir][iw_index] = nlm_vector + * @param calc_r Whether to calculate position operator matrix elements + * @return true if GPU processing succeeded, false 
if caller should use CPU path + */ +bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, + const InfoNonlocal& infoNL_, + const int T1, + const ModuleBase::Vector3& R1, + const Atom* atom1, + const std::vector& all_indexes, + const int npol, + const int T0, + const ModuleBase::Vector3& R0, + const ModuleBase::Vector3& A, + std::vector>>>& nlm_all, + const bool calc_r); + +} // namespace gpu +} // namespace module_rt + +#endif // SNAP_PSIBETA_GPU_H From e879f6328306aa6e088bf54e883fc89328fe1cb0 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 9 Dec 2025 10:57:36 +0800 Subject: [PATCH 02/13] refactor(snap_psibeta_atom_batch_gpu): add timer and simplify code structure - Add ModuleBase::timer for snap_psibeta_atom_batch_gpu function - Remove GPU fallback to CPU design (return true/false in void function) - Replace fallback returns with error messages and proper early exits - Ensure timer is properly called on all exit paths - Simplify code structure for better readability --- .../module_operator_lcao/td_nonlocal_lcao.cpp | 132 ++++---- .../kernels/cuda/snap_psibeta_gpu.cu | 281 +++++++++++++++++- .../kernels/cuda/snap_psibeta_kernel.cu | 232 +++++++++++++++ .../kernels/cuda/snap_psibeta_kernel.cuh | 90 ++++-- .../module_rt/kernels/snap_psibeta_gpu.h | 55 ++-- 5 files changed, 667 insertions(+), 123 deletions(-) diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp index cefe006150..6d2972b92c 100644 --- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp @@ -160,79 +160,64 @@ void hamilt::TDNonlocal>::calculate_HR() nlm_tot[i].resize(nlm_dim); } -#pragma omp parallel - { #ifdef __CUDA - cudaSetDevice(dev_id); -#endif -#pragma omp for schedule(dynamic) - for (int ad = 0; ad < adjs.adj_num + 1; ++ad) + // GPU path: Atom-level GPU batch processing + module_rt::gpu::snap_psibeta_atom_batch_gpu(orb_, + this->ucell->infoNL, + T0, + tau0 * this->ucell->lat0, + cart_At, + adjs, + this->ucell, + paraV, + npol, + nlm_dim, + nlm_tot); +#else + // CPU path: OpenMP parallel over neighbors to compute nlm_tot +#pragma omp parallel for schedule(dynamic) + for (int ad = 0; ad < adjs.adj_num + 1; ++ad) + { + const int T1 = adjs.ntype[ad]; + const int I1 = adjs.natom[ad]; + const int iat1 = ucell->itia2iat(T1, I1); + const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; + const Atom* atom1 = &ucell->atoms[T1]; + auto all_indexes = paraV->get_indexes_row(iat1); + auto col_indexes = paraV->get_indexes_col(iat1); + all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); + std::sort(all_indexes.begin(), all_indexes.end()); + all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); + + // CPU path: loop over orbitals + for (size_t iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { - const int T1 = adjs.ntype[ad]; - const int I1 = adjs.natom[ad]; - const int iat1 = ucell->itia2iat(T1, I1); - const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; - const Atom* atom1 = &ucell->atoms[T1]; - auto all_indexes = paraV->get_indexes_row(iat1); - auto col_indexes = paraV->get_indexes_col(iat1); - all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); - std::sort(all_indexes.begin(), all_indexes.end()); - all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); - -#ifdef __CUDA - // GPU path: batch process all orbitals for this neighbor - 
// Use GPU when there are enough orbitals to benefit from parallelism - constexpr int GPU_THRESHOLD = 4; - bool gpu_success = false; - if (all_indexes.size() / npol >= GPU_THRESHOLD) - { - gpu_success = module_rt::gpu::snap_psibeta_neighbor_batch_gpu(orb_, - this->ucell->infoNL, - T1, - tau1 * this->ucell->lat0, - atom1, - all_indexes, - npol, - T0, - tau0 * this->ucell->lat0, - cart_At, - nlm_tot[ad], - TD_info::out_current); - } - if (!gpu_success) -#endif + const int iw1 = all_indexes[iw1l] / npol; + std::vector>> nlm; + module_rt::snap_psibeta_half_tddft(orb_, + this->ucell->infoNL, + nlm, + tau1 * this->ucell->lat0, + T1, + atom1->iw2l[iw1], + atom1->iw2m[iw1], + atom1->iw2n[iw1], + tau0 * this->ucell->lat0, + T0, + cart_At, + TD_info::out_current); + for (int dir = 0; dir < nlm_dim; dir++) { - // CPU path: loop over orbitals - for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) - { - const int iw1 = all_indexes[iw1l] / npol; - std::vector>> nlm; - // nlm is a vector of vectors, but size of outer vector is only 1 when out_current is false - // and size of outer vector is 4 when out_current is true (3 for , 1 - // for ) inner loop : all projectors (L0,M0) - - // snap_psibeta_half_tddft() are used to calculate - // and as well if current are needed - module_rt::snap_psibeta_half_tddft(orb_, - this->ucell->infoNL, - nlm, - tau1 * this->ucell->lat0, - T1, - atom1->iw2l[iw1], - atom1->iw2m[iw1], - atom1->iw2n[iw1], - tau0 * this->ucell->lat0, - T0, - cart_At, - TD_info::out_current); - for (int dir = 0; dir < nlm_dim; dir++) - { - nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); - } - } + nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); } } + } +#endif + // 2. calculate D for each pair of atoms + // This runs for BOTH GPU and CPU paths +#pragma omp parallel + { #ifdef _OPENMP // record the iat number of the adjacent atoms std::set ad_atom_set; @@ -259,7 +244,6 @@ void hamilt::TDNonlocal>::calculate_HR() } #endif - // 2. 
calculate D for each pair of atoms for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1) { const int T1 = adjs.ntype[ad1]; @@ -291,11 +275,11 @@ void hamilt::TDNonlocal>::calculate_HR() if (TD_info::out_current) { std::complex* tmp_c[3] = {nullptr, nullptr, nullptr}; - for (int i = 0; i < 3; i++) + for (int ii = 0; ii < 3; ii++) { - tmp_c[i] = TD_info::td_vel_op->get_current_term_pointer(i) - ->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]) - ->get_pointer(); + tmp_c[ii] = TD_info::td_vel_op->get_current_term_pointer(ii) + ->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]) + ->get_pointer(); } this->cal_HR_IJR(iat1, iat2, @@ -320,8 +304,8 @@ void hamilt::TDNonlocal>::calculate_HR() } } } - } - } + } // end omp parallel for matrix assembly + } // end for iat0 #ifdef __CUDA // Release GPU resources at end of calculation diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index c9eafe1e71..519b522c8f 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -1,5 +1,6 @@ #include "../snap_psibeta_gpu.h" #include "snap_psibeta_kernel.cuh" +#include "source_base/timer.h" #include #include @@ -21,7 +22,6 @@ namespace gpu if (err != cudaSuccess) \ { \ fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ - return false; \ } \ } while (0) @@ -347,5 +347,284 @@ bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, return true; } +//============================================================================= +// Atom-Level GPU Batch Interface - Processes ALL neighbors in single kernel +//============================================================================= + +void snap_psibeta_atom_batch_gpu( + const LCAO_Orbitals& orb, + const InfoNonlocal& infoNL_, + const int T0, + const ModuleBase::Vector3& R0, + const ModuleBase::Vector3& A, + const AdjacentAtomInfo& adjs, + const UnitCell* ucell, + const Parallel_Orbitals* paraV, + const int npol, + const int nlm_dim, + std::vector>>>>& nlm_tot) +{ + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); + + // Get projector info + const int nproj = infoNL_.nproj[T0]; + if (nproj == 0) + { + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); + return; + } + + // Calculate natomwfc + int natomwfc = 0; + std::vector proj_m0_offset_h(nproj); + for (int ip = 0; ip < nproj; ip++) + { + proj_m0_offset_h[ip] = natomwfc; + int L0 = infoNL_.Beta[T0].Proj[ip].getL(); + if (L0 > MAX_L) + { + fprintf(stderr, + "[CUDA] Error: L0=%d exceeds MAX_L=%d, GPU kernel may produce incorrect results\n", + L0, + MAX_L); + } + natomwfc += 2 * L0 + 1; + } + + //========================================================================= + // Collect all neighbor-orbital pairs + //========================================================================= + + std::vector neighbor_orbitals_h; + std::vector psi_radial_h; + + // Track neighbor index for each orbital for output reconstruction + struct OrbitalMapping + { + int neighbor_idx; + int iw_index; + }; + std::vector orbital_mappings; + + for (int ad = 0; ad < adjs.adj_num + 1; ++ad) + { + const int T1 = adjs.ntype[ad]; + const int I1 = adjs.natom[ad]; + const int iat1 = ucell->itia2iat(T1, I1); + const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; + const Atom* atom1 = &ucell->atoms[T1]; + + // Get orbital indices + auto all_indexes = paraV->get_indexes_row(iat1); + 
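// ---- [editor's sketch, not part of the patch] -------------------------------
// The following lines repeat the merge / sort / unique-erase idiom also used in
// calculate_HR to collect the distinct orbital indices of one neighbor atom.
// As a standalone helper (name merge_unique_indexes is hypothetical):
#include <algorithm> // std::sort, std::unique
#include <vector>

std::vector<int> merge_unique_indexes(std::vector<int> rows, const std::vector<int>& cols)
{
    rows.insert(rows.end(), cols.begin(), cols.end());              // concatenate row and column indices
    std::sort(rows.begin(), rows.end());                            // bring duplicates next to each other
    rows.erase(std::unique(rows.begin(), rows.end()), rows.end());  // drop the now-adjacent duplicates
    return rows;
}
// ---- [end of editor's sketch] ------------------------------------------------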
auto col_indexes = paraV->get_indexes_col(iat1); + all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); + std::sort(all_indexes.begin(), all_indexes.end()); + all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); + + // Process each orbital + for (size_t iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) + { + const int iw1 = all_indexes[iw1l] / npol; + const int L1 = atom1->iw2l[iw1]; + const int m1 = atom1->iw2m[iw1]; + const int N1 = atom1->iw2n[iw1]; + + if (L1 > MAX_L) + { + continue; // Skip orbitals with L > MAX_L + } + + // Get orbital radial data + // Use getPsi() to match CPU implementation (not getPsi_r() which returns psi*r) + const double* phi_psi = orb.Phi[T1].PhiLN(L1, N1).getPsi(); + int mesh = orb.Phi[T1].PhiLN(L1, N1).getNr(); + double dk = orb.Phi[T1].PhiLN(L1, N1).getDk(); + double rcut = orb.Phi[T1].getRcut(); + + // Add to flattened psi array + size_t psi_offset = psi_radial_h.size(); + psi_radial_h.insert(psi_radial_h.end(), phi_psi, phi_psi + mesh); + + // Create neighbor-orbital data + NeighborOrbitalData norb; + norb.neighbor_idx = ad; + norb.R1 = make_double3(tau1.x * ucell->lat0, tau1.y * ucell->lat0, tau1.z * ucell->lat0); + norb.L1 = L1; + norb.m1 = m1; + norb.N1 = N1; + norb.iw_index = all_indexes[iw1l]; + norb.psi_offset = static_cast(psi_offset); + norb.psi_mesh = mesh; + norb.psi_dk = dk; + norb.psi_rcut = rcut; + + neighbor_orbitals_h.push_back(norb); + + OrbitalMapping mapping; + mapping.neighbor_idx = ad; + mapping.iw_index = all_indexes[iw1l]; + orbital_mappings.push_back(mapping); + } + } + + int total_neighbor_orbitals = static_cast(neighbor_orbitals_h.size()); + if (total_neighbor_orbitals == 0) + { + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); + return; + } + + //========================================================================= + // Prepare projector data + //========================================================================= + + std::vector projectors_h(nproj); + std::vector beta_radial_h; + + for (int ip = 0; ip < nproj; ip++) + { + const auto& proj = infoNL_.Beta[T0].Proj[ip]; + int L0 = proj.getL(); + int mesh = proj.getNr(); + double dk = proj.getDk(); + double rcut = proj.getRcut(); + const double* beta_r = proj.getBeta_r(); + const double* radial = proj.getRadial(); + + projectors_h[ip].L0 = L0; + projectors_h[ip].beta_offset = static_cast(beta_radial_h.size()); + projectors_h[ip].beta_mesh = mesh; + projectors_h[ip].beta_dk = dk; + projectors_h[ip].beta_rcut = rcut; + // Use actual radial grid range + projectors_h[ip].r_min = radial[0]; + projectors_h[ip].r_max = radial[mesh - 1]; + + beta_radial_h.insert(beta_radial_h.end(), beta_r, beta_r + mesh); + } + + //========================================================================= + // Allocate Device Memory + //========================================================================= + + NeighborOrbitalData* neighbor_orbitals_d = nullptr; + ProjectorData* projectors_d = nullptr; + double* psi_radial_d = nullptr; + double* beta_radial_d = nullptr; + int* proj_m0_offset_d = nullptr; + cuDoubleComplex* nlm_out_d = nullptr; + + size_t output_size = total_neighbor_orbitals * nlm_dim * natomwfc; + + CUDA_CHECK(cudaMalloc(&neighbor_orbitals_d, total_neighbor_orbitals * sizeof(NeighborOrbitalData))); + CUDA_CHECK(cudaMalloc(&projectors_d, nproj * sizeof(ProjectorData))); + CUDA_CHECK(cudaMalloc(&psi_radial_d, psi_radial_h.size() * sizeof(double))); + CUDA_CHECK(cudaMalloc(&beta_radial_d, 
beta_radial_h.size() * sizeof(double))); + CUDA_CHECK(cudaMalloc(&proj_m0_offset_d, nproj * sizeof(int))); + CUDA_CHECK(cudaMalloc(&nlm_out_d, output_size * sizeof(cuDoubleComplex))); + + //========================================================================= + // Copy Data to Device + //========================================================================= + + CUDA_CHECK(cudaMemcpy(neighbor_orbitals_d, + neighbor_orbitals_h.data(), + total_neighbor_orbitals * sizeof(NeighborOrbitalData), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(projectors_d, projectors_h.data(), nproj * sizeof(ProjectorData), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(psi_radial_d, psi_radial_h.data(), psi_radial_h.size() * sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(beta_radial_d, beta_radial_h.data(), beta_radial_h.size() * sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(proj_m0_offset_d, proj_m0_offset_h.data(), nproj * sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemset(nlm_out_d, 0, output_size * sizeof(cuDoubleComplex))); + + //========================================================================= + // Launch Kernel - SINGLE launch for ALL neighbors! + //========================================================================= + + double3 R0_d3 = make_double3(R0.x, R0.y, R0.z); + double3 A_d3 = make_double3(A.x, A.y, A.z); + + dim3 grid(total_neighbor_orbitals, nproj, 1); + dim3 block(BLOCK_SIZE, 1, 1); + + snap_psibeta_atom_batch_kernel<<>>(R0_d3, + A_d3, + neighbor_orbitals_d, + projectors_d, + psi_radial_d, + beta_radial_d, + proj_m0_offset_d, + total_neighbor_orbitals, + nproj, + natomwfc, + nlm_dim, + nlm_out_d); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + fprintf(stderr, "[CUDA] Atom batch kernel launch error: %s\n", cudaGetErrorString(err)); + cudaFree(neighbor_orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); + return; + } + + CUDA_CHECK(cudaDeviceSynchronize()); + + //========================================================================= + // Copy Results Back + //========================================================================= + + std::vector nlm_out_h(output_size); + CUDA_CHECK(cudaMemcpy(nlm_out_h.data(), nlm_out_d, output_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost)); + + //========================================================================= + // Reconstruct nlm_tot structure + //========================================================================= + + for (int i = 0; i < total_neighbor_orbitals; i++) + { + int ad = orbital_mappings[i].neighbor_idx; + int iw_index = orbital_mappings[i].iw_index; + + std::vector>> nlm(nlm_dim); + for (int d = 0; d < nlm_dim; d++) + { + nlm[d].resize(natomwfc); + for (int k = 0; k < natomwfc; k++) + { + size_t idx = i * nlm_dim * natomwfc + d * natomwfc + k; + nlm[d][k] = std::complex(nlm_out_h[idx].x, nlm_out_h[idx].y); + } + } + + // Insert into nlm_tot[ad][dir][iw_index] + for (int dir = 0; dir < nlm_dim; dir++) + { + nlm_tot[ad][dir].insert({iw_index, nlm[dir]}); + } + } + + //========================================================================= + // Cleanup + //========================================================================= + + cudaFree(neighbor_orbitals_d); + cudaFree(projectors_d); + cudaFree(psi_radial_d); + cudaFree(beta_radial_d); + 
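// ---- [editor's sketch, not part of the patch] -------------------------------
// The same cudaFree sequence appears here and on the error path above; a minimal
// RAII guard (hypothetical, requires <cuda_runtime.h>) would centralize that cleanup:
template <typename T>
struct DeviceBuffer
{
    T* ptr = nullptr;
    explicit DeviceBuffer(size_t n)
    {
        cudaMalloc(reinterpret_cast<void**>(&ptr), n * sizeof(T)); // caller checks cudaGetLastError()
    }
    ~DeviceBuffer()
    {
        if (ptr != nullptr)
        {
            cudaFree(ptr); // released automatically on every return path
        }
    }
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
};
// ---- [end of editor's sketch] ------------------------------------------------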
cudaFree(proj_m0_offset_d); + cudaFree(nlm_out_d); + + ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); +} + } // namespace gpu } // namespace module_rt diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 1e752d6d86..7feae5907c 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -417,6 +417,238 @@ __global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, } } +//============================================================================= +// Atom-Level Batch Kernel Implementation +// Processes ALL neighbors for a center atom in a single kernel launch +//============================================================================= + +__global__ void snap_psibeta_atom_batch_kernel(double3 R0, + double3 A, + const NeighborOrbitalData* __restrict__ neighbor_orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, + int total_neighbor_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out) +{ + int norb_idx = blockIdx.x; // Which (neighbor, orbital) pair + int proj_idx = blockIdx.y; // Which projector + int tid = threadIdx.x; + + if (norb_idx >= total_neighbor_orbitals || proj_idx >= nproj) + { + return; + } + + // Load neighbor-orbital data (includes R1) + const NeighborOrbitalData& norb = neighbor_orbitals[norb_idx]; + const ProjectorData& proj = projectors[proj_idx]; + + double3 R1 = norb.R1; + int L1 = norb.L1; + int m1 = norb.m1; + int L0 = proj.L0; + int m0_offset = proj_m0_offset[proj_idx]; + + // Skip if L values exceed our precomputed limit + if (L1 > MAX_L || L0 > MAX_L) + { + return; + } + + // Compute geometry + double3 dR = make_double3(R1.x - R0.x, R1.y - R0.y, R1.z - R0.z); + double distance01 = sqrt(dR.x * dR.x + dR.y * dR.y + dR.z * dR.z); + + double r1_max = norb.psi_rcut; + // Integration range: use projector's radial grid range [r_min, r_max] + // This matches the CPU implementation in snap_psibeta_half_tddft.cpp + double r_min = proj.r_min; + double r_max = proj.r_max; + double xl = 0.5 * (r_max - r_min); + double xmean = 0.5 * (r_max + r_min); + + // Phase factor exp(i A · R0) + double AdotR0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; + cuDoubleComplex exp_iAR0 = cu_exp_i(AdotR0); + + // Shared memory for reduction + __shared__ double s_temp_re[BLOCK_SIZE]; + __shared__ double s_temp_im[BLOCK_SIZE]; + + // Initialize accumulators in registers + double result_re[MAX_YLM_SIZE]; + double result_im[MAX_YLM_SIZE]; + double result_r_re[3][MAX_YLM_SIZE]; + double result_r_im[3][MAX_YLM_SIZE]; + + int num_m0 = 2 * L0 + 1; + for (int m0 = 0; m0 < num_m0; m0++) + { + result_re[m0] = 0.0; + result_im[m0] = 0.0; + for (int d = 0; d < 3; d++) + { + result_r_re[d][m0] = 0.0; + result_r_im[d][m0] = 0.0; + } + } + + // Main integration loop + for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) + { + double r_val = xmean + xl * d_gl_x[ir]; + double w_rad = xl * d_gl_w[ir]; + + for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) + { + double leb_x = d_lebedev_x[ian]; + double leb_y = d_lebedev_y[ian]; + double leb_z = d_lebedev_z[ian]; + double w_ang = d_lebedev_w[ian]; + + // Local position relative to R0 + double rx = r_val * leb_x; + double ry = r_val * leb_y; + double rz = r_val * leb_z; + + // Vector from R1 
(orbital atom) to integration point (R0 + r_local) + // This matches neighbor_batch: tx = rx + dRa.x where dRa = R0 - R1 + double tx = rx + R0.x - R1.x; + double ty = ry + R0.y - R1.y; + double tz = rz + R0.z - R1.z; + double tnorm = sqrt(tx * tx + ty * ty + tz * tz); + + // Check psi cutoff - matches neighbor_batch logic + if (tnorm <= r1_max) + { + // Compute Y_L1 - match neighbor_batch handling of small tnorm + double ylm1[MAX_YLM_SIZE]; + if (tnorm > 1e-10) + { + double inv_tnorm = 1.0 / tnorm; + compute_ylm_gpu(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); + } + else + { + compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); + } + + // Compute Y_L0 + double ylm0[MAX_YLM_SIZE]; + compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); + + // Interpolate psi at tnorm (distance to orbital atom) + double psi_val + = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); + + // Phase factor - match neighbor_batch: exp(+i A · r_local) + double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + double phase = r_val * A_dot_leb; + cuDoubleComplex exp_iAr = cu_exp_i(phase); + + // Interpolate beta + double beta_val + = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); + + // Y_L1m1 + int offset_L1 = L1 * L1 + m1; + double ylm_L1_val = ylm1[offset_L1]; + + // Combined factor - match neighbor_batch exactly + double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; + cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); + + // Accumulate for all m0 - match neighbor_batch structure + int offset_L0 = L0 * L0; + for (int m0 = 0; m0 < num_m0; m0++) + { + double ylm0_val = ylm0[offset_L0 + m0]; + + result_re[m0] += common_factor.x * ylm0_val; + result_im[m0] += common_factor.y * ylm0_val; + + if (nlm_dim == 4) + { + double r_op_x = rx + R0.x; + double r_op_y = ry + R0.y; + double r_op_z = rz + R0.z; + + result_r_re[0][m0] += common_factor.x * ylm0_val * r_op_x; + result_r_im[0][m0] += common_factor.y * ylm0_val * r_op_x; + result_r_re[1][m0] += common_factor.x * ylm0_val * r_op_y; + result_r_im[1][m0] += common_factor.y * ylm0_val * r_op_y; + result_r_re[2][m0] += common_factor.x * ylm0_val * r_op_z; + result_r_im[2][m0] += common_factor.y * ylm0_val * r_op_z; + } + } + } + } + } + + // Reduction and output - OUTSIDE radial loop + int out_base = norb_idx * nlm_dim * natomwfc; + + for (int m0 = 0; m0 < num_m0; m0++) + { + s_temp_re[tid] = result_re[m0]; + s_temp_im[tid] = result_im[m0]; + __syncthreads(); + + for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + { + if (tid < s) + { + s_temp_re[tid] += s_temp_re[tid + s]; + s_temp_im[tid] += s_temp_im[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) + { + cuDoubleComplex result = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); + result = cu_mul(result, exp_iAR0); + result = cu_conj(result); + nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; + } + __syncthreads(); + + if (nlm_dim == 4) + { + for (int d = 0; d < 3; d++) + { + s_temp_re[tid] = result_r_re[d][m0]; + s_temp_im[tid] = result_r_im[d][m0]; + __syncthreads(); + + for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + { + if (tid < s) + { + s_temp_re[tid] += s_temp_re[tid + s]; + s_temp_im[tid] += s_temp_im[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) + { + cuDoubleComplex result_r = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); + result_r = cu_mul(result_r, exp_iAR0); + result_r = cu_conj(result_r); + nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; 
+ } + __syncthreads(); + } + } + } +} + //============================================================================= // Host-side Helper Function Implementations //============================================================================= diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index 085361e81e..f9a620ff5f 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -129,48 +129,84 @@ struct ProjectorData double r_max; // Maximum radial value }; +/** + * @brief Information about a neighbor-orbital pair for atom-level batching + * Used to batch ALL neighbors for a center atom in a single kernel launch + */ +struct NeighborOrbitalData +{ + int neighbor_idx; // Which neighbor (ad index) this orbital belongs to + double3 R1; // Neighbor atom position (tau1 * lat0) + + // Orbital info (same as OrbitalData) + int L1; + int m1; + int N1; + int iw_index; + int psi_offset; + int psi_mesh; + double psi_dk; + double psi_rcut; +}; + //============================================================================= -// Main Kernel +// Main Kernels //============================================================================= /** - * @brief Main kernel for neighbor-level batch processing + * @brief Neighbor-level batch kernel (original version) * * Grid: (num_orbitals, nproj, 1) * Block: (BLOCK_SIZE, 1, 1) + */ +__global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, + double3 R0, + double3 A, + const OrbitalData* __restrict__ orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, + int num_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out); + +/** + * @brief Atom-level batch kernel - processes ALL neighbors for a center atom * - * Each block processes one (orbital, projector) pair. - * Threads within a block parallelize over angular integration points. + * Grid: (total_neighbor_orbitals, nproj, 1) + * Block: (BLOCK_SIZE, 1, 1) * - * @param R1 Neighbor atom position - * @param R0 Projector atom position + * This kernel processes all orbitals from all neighbors in a single launch, + * reducing kernel launch overhead significantly. 
+ * + * @param R0 Center atom position (projector location) * @param A Vector potential - * @param orbitals Array of orbital data [num_orbitals] + * @param neighbor_orbitals Array of neighbor-orbital pairs [total_neighbor_orbitals] * @param projectors Array of projector data [nproj] * @param psi_radial Flattened orbital radial functions * @param beta_radial Flattened projector radial functions - * @param num_orbitals Number of orbitals in batch + * @param proj_m0_offset Offset for each projector's m=0 in output + * @param total_neighbor_orbitals Total number of (neighbor, orbital) pairs * @param nproj Number of projectors - * @param natomwfc Total number of projector components (sum of 2*L0+1) - * @param calc_r Whether to compute position operator elements - * @param nlm_out Output array [num_orbitals * nlm_dim * natomwfc] - * - * Note: Gauss-Legendre grids (gl_x, gl_w) and Lebedev grids are stored in constant memory + * @param natomwfc Total projector components (sum of 2*L0+1) + * @param nlm_dim 1 for no current, 4 for current + * @param nlm_out Output [total_neighbor_orbitals * nlm_dim * natomwfc] */ -__global__ void snap_psibeta_neighbor_batch_kernel( - double3 R1, - double3 R0, - double3 A, - const OrbitalData* __restrict__ orbitals, - const ProjectorData* __restrict__ projectors, - const double* __restrict__ psi_radial, - const double* __restrict__ beta_radial, - const int* __restrict__ proj_m0_offset, // Offset for each projector in output - int num_orbitals, - int nproj, - int natomwfc, - int nlm_dim, - cuDoubleComplex* __restrict__ nlm_out); +__global__ void snap_psibeta_atom_batch_kernel(double3 R0, + double3 A, + const NeighborOrbitalData* __restrict__ neighbor_orbitals, + const ProjectorData* __restrict__ projectors, + const double* __restrict__ psi_radial, + const double* __restrict__ beta_radial, + const int* __restrict__ proj_m0_offset, + int total_neighbor_orbitals, + int nproj, + int natomwfc, + int nlm_dim, + cuDoubleComplex* __restrict__ nlm_out); //============================================================================= // Host-side Helper Functions diff --git a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h index ab6f55a617..93e7d483d4 100644 --- a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h +++ b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h @@ -3,6 +3,8 @@ #include "source_base/vector3.h" #include "source_basis/module_ao/ORB_read.h" +#include "source_basis/module_ao/parallel_orbitals.h" +#include "source_cell/module_neighbor/sltk_grid_driver.h" #include "source_cell/setup_nonlocal.h" #include "source_cell/unitcell.h" @@ -32,27 +34,7 @@ void initialize_gpu_resources(); void finalize_gpu_resources(); /** - * @brief Neighbor-level GPU batch processing interface - * - * Computes matrix elements for all orbitals on a neighbor atom with projectors - * Replaces the original orbital loop: - * for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { - * snap_psibeta_half_tddft(...); - * } - * - * @param orb Orbital information - * @param infoNL_ Non-local pseudopotential information - * @param T1 Neighbor atom type - * @param R1 Neighbor atom position (already multiplied by lat0) - * @param atom1 Neighbor atom pointer (for iw2l, iw2m, iw2n) - * @param all_indexes All orbital indices on this neighbor - * @param npol Polarization number - * @param T0 Projector atom type - * @param R0 Projector atom position (already multiplied by lat0) - * @param A Vector potential - * @param 
nlm_all Output: nlm_all[dir][iw_index] = nlm_vector - * @param calc_r Whether to calculate position operator matrix elements - * @return true if GPU processing succeeded, false if caller should use CPU path + * @brief Neighbor-level GPU batch processing interface (legacy) */ bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, const InfoNonlocal& infoNL_, @@ -67,6 +49,37 @@ bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, std::vector>>>& nlm_all, const bool calc_r); +/** + * @brief Atom-level GPU batch processing interface + * + * Processes ALL neighbors for a center atom in a SINGLE kernel launch. + * This significantly reduces kernel launch overhead compared to neighbor-level batching. + * + * @param orb Orbital information + * @param infoNL_ Non-local pseudopotential information + * @param T0 Center atom type (projector location) + * @param R0 Center atom position (already multiplied by lat0) + * @param A Vector potential + * @param adjs Adjacent atom information for this center atom + * @param ucell Unit cell pointer + * @param paraV Parallel orbitals information + * @param npol Polarization number + * @param nlm_dim 1 for no current, 4 for current calculation + * @param nlm_tot Output: nlm_tot[ad][dir][iw_index] = nlm_vector + */ +void snap_psibeta_atom_batch_gpu( + const LCAO_Orbitals& orb, + const InfoNonlocal& infoNL_, + const int T0, + const ModuleBase::Vector3& R0, + const ModuleBase::Vector3& A, + const AdjacentAtomInfo& adjs, + const UnitCell* ucell, + const Parallel_Orbitals* paraV, + const int npol, + const int nlm_dim, + std::vector>>>>& nlm_tot); + } // namespace gpu } // namespace module_rt From 6014c905aa6bb3ae925e2c54a9de1c83d9c3273d Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 9 Dec 2025 11:17:10 +0800 Subject: [PATCH 03/13] remove snap_psibeta_neighbor_batch_gpu --- .../kernels/cuda/snap_psibeta_gpu.cu | 293 ------------------ .../kernels/cuda/snap_psibeta_kernel.cu | 252 --------------- .../kernels/cuda/snap_psibeta_kernel.cuh | 53 +--- .../module_rt/kernels/snap_psibeta_gpu.h | 16 - 4 files changed, 9 insertions(+), 605 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index 519b522c8f..f630aec8c2 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -54,299 +54,6 @@ void finalize_gpu_resources() cudaGetLastError(); } -//============================================================================= -// Main GPU Interface Function -//============================================================================= - -bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, - const InfoNonlocal& infoNL_, - const int T1, - const ModuleBase::Vector3& R1, - const Atom* atom1, - const std::vector& all_indexes, - const int npol, - const int T0, - const ModuleBase::Vector3& R0, - const ModuleBase::Vector3& A, - std::vector>>>& nlm_all, - const bool calc_r) -{ - - // Get projector info - const int nproj = infoNL_.nproj[T0]; - if (nproj == 0) - { - return true; - } - - // Calculate number of orbitals - int num_orbitals = all_indexes.size() / npol; - if (num_orbitals == 0) - { - return true; - } - - // Calculate natomwfc (total projector components) - int natomwfc = 0; - std::vector proj_m0_offset_h(nproj); - for (int ip = 0; ip < nproj; ip++) - { - proj_m0_offset_h[ip] = natomwfc; - int L0 = infoNL_.Beta[T0].Proj[ip].getL(); - - // Check L0 against MAX_L - 
if (L0 > MAX_L) - { - fprintf(stderr, "[CUDA] Warning: L0=%d exceeds MAX_L=%d, falling back to CPU\n", L0, MAX_L); - return false; - } - - natomwfc += 2 * L0 + 1; - } - - int nlm_dim = calc_r ? 4 : 1; - - // Resize output - if (nlm_all.size() != nlm_dim) - { - nlm_all.resize(nlm_dim); - } - - //========================================================================= - // Prepare Host Data - //========================================================================= - - // Orbital data - std::vector orbitals_h(num_orbitals); - - // Calculate total psi size and offsets - std::vector psi_offsets_h(num_orbitals); - int total_psi_size = 0; - - for (int i = 0; i < num_orbitals; i++) - { - int iw1l = i * npol; - int iw1 = all_indexes[iw1l] / npol; - - int L1 = atom1->iw2l[iw1]; - int m1 = atom1->iw2m[iw1]; - int N1 = atom1->iw2n[iw1]; - - // Check L1 against MAX_L - if (L1 > MAX_L) - { - fprintf(stderr, "[CUDA] Warning: L1=%d exceeds MAX_L=%d, falling back to CPU\n", L1, MAX_L); - return false; - } - - const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1); - - orbitals_h[i].L1 = L1; - orbitals_h[i].m1 = m1; - orbitals_h[i].N1 = N1; - orbitals_h[i].iw_index = all_indexes[iw1l]; - orbitals_h[i].psi_offset = total_psi_size; - orbitals_h[i].psi_mesh = phi_ln.getNr(); - orbitals_h[i].psi_dk = phi_ln.getDk(); - orbitals_h[i].psi_rcut = phi_ln.getRcut(); - - psi_offsets_h[i] = total_psi_size; - total_psi_size += phi_ln.getNr(); - } - - // Copy psi radial data - std::vector psi_radial_h(total_psi_size); - for (int i = 0; i < num_orbitals; i++) - { - int iw1l = i * npol; - int iw1 = all_indexes[iw1l] / npol; - - int L1 = atom1->iw2l[iw1]; - int N1 = atom1->iw2n[iw1]; - - const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1); - int mesh = phi_ln.getNr(); - const double* psi = phi_ln.getPsi(); - - for (int j = 0; j < mesh; j++) - { - psi_radial_h[orbitals_h[i].psi_offset + j] = psi[j]; - } - } - - // Projector data - std::vector projectors_h(nproj); - std::vector beta_offsets_h(nproj); - int total_beta_size = 0; - - for (int ip = 0; ip < nproj; ip++) - { - const auto& proj = infoNL_.Beta[T0].Proj[ip]; - - projectors_h[ip].L0 = proj.getL(); - projectors_h[ip].beta_offset = total_beta_size; - projectors_h[ip].beta_mesh = proj.getNr(); - projectors_h[ip].beta_dk = proj.getDk(); - projectors_h[ip].beta_rcut = proj.getRcut(); - - // Get r_min and r_max from radial grid - const double* radial = proj.getRadial(); - int mesh = proj.getNr(); - projectors_h[ip].r_min = radial[0]; - projectors_h[ip].r_max = radial[mesh - 1]; - - beta_offsets_h[ip] = total_beta_size; - total_beta_size += mesh; - } - - // Copy beta radial data - std::vector beta_radial_h(total_beta_size); - for (int ip = 0; ip < nproj; ip++) - { - const auto& proj = infoNL_.Beta[T0].Proj[ip]; - int mesh = proj.getNr(); - const double* beta_r = proj.getBeta_r(); - - for (int j = 0; j < mesh; j++) - { - beta_radial_h[projectors_h[ip].beta_offset + j] = beta_r[j]; - } - } - - //========================================================================= - // Allocate Device Memory - //========================================================================= - - OrbitalData* orbitals_d = nullptr; - ProjectorData* projectors_d = nullptr; - double* psi_radial_d = nullptr; - double* beta_radial_d = nullptr; - int* proj_m0_offset_d = nullptr; - cuDoubleComplex* nlm_out_d = nullptr; - - size_t output_size = num_orbitals * nlm_dim * natomwfc; - - CUDA_CHECK(cudaMalloc(&orbitals_d, num_orbitals * sizeof(OrbitalData))); - CUDA_CHECK(cudaMalloc(&projectors_d, nproj * 
sizeof(ProjectorData))); - CUDA_CHECK(cudaMalloc(&psi_radial_d, total_psi_size * sizeof(double))); - CUDA_CHECK(cudaMalloc(&beta_radial_d, total_beta_size * sizeof(double))); - CUDA_CHECK(cudaMalloc(&proj_m0_offset_d, nproj * sizeof(int))); - CUDA_CHECK(cudaMalloc(&nlm_out_d, output_size * sizeof(cuDoubleComplex))); - - //========================================================================= - // Copy Data to Device - //========================================================================= - - CUDA_CHECK(cudaMemcpy(orbitals_d, orbitals_h.data(), num_orbitals * sizeof(OrbitalData), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(projectors_d, projectors_h.data(), nproj * sizeof(ProjectorData), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(psi_radial_d, psi_radial_h.data(), total_psi_size * sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK( - cudaMemcpy(beta_radial_d, beta_radial_h.data(), total_beta_size * sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(proj_m0_offset_d, proj_m0_offset_h.data(), nproj * sizeof(int), cudaMemcpyHostToDevice)); - - // Initialize output to zero - CUDA_CHECK(cudaMemset(nlm_out_d, 0, output_size * sizeof(cuDoubleComplex))); - - //========================================================================= - // Launch Kernel - //========================================================================= - - double3 R1_d3 = make_double3(R1.x, R1.y, R1.z); - double3 R0_d3 = make_double3(R0.x, R0.y, R0.z); - double3 A_d3 = make_double3(A.x, A.y, A.z); - - dim3 grid(num_orbitals, nproj, 1); - dim3 block(BLOCK_SIZE, 1, 1); - - snap_psibeta_neighbor_batch_kernel<<>>(R1_d3, - R0_d3, - A_d3, - orbitals_d, - projectors_d, - psi_radial_d, - beta_radial_d, - proj_m0_offset_d, - num_orbitals, - nproj, - natomwfc, - nlm_dim, - nlm_out_d); - - // Check for kernel errors - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - fprintf(stderr, "[CUDA] Kernel launch error: %s\n", cudaGetErrorString(err)); - // Cleanup and return - cudaFree(orbitals_d); - cudaFree(projectors_d); - cudaFree(psi_radial_d); - cudaFree(beta_radial_d); - cudaFree(proj_m0_offset_d); - cudaFree(nlm_out_d); - return false; - } - - CUDA_CHECK(cudaDeviceSynchronize()); - - // Check for errors after synchronization - err = cudaGetLastError(); - if (err != cudaSuccess) - { - fprintf(stderr, "[CUDA] Kernel execution error: %s\n", cudaGetErrorString(err)); - cudaFree(orbitals_d); - cudaFree(projectors_d); - cudaFree(psi_radial_d); - cudaFree(beta_radial_d); - cudaFree(proj_m0_offset_d); - cudaFree(nlm_out_d); - return false; - } - - //========================================================================= - // Copy Results Back - //========================================================================= - - std::vector nlm_out_h(output_size); - CUDA_CHECK(cudaMemcpy(nlm_out_h.data(), nlm_out_d, output_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost)); - - // Reconstruct output into nlm_all format - for (int i = 0; i < num_orbitals; i++) - { - int iw_index = orbitals_h[i].iw_index; - - // Create nlm vectors for this orbital - std::vector>> nlm(nlm_dim); - for (int d = 0; d < nlm_dim; d++) - { - nlm[d].resize(natomwfc); - for (int k = 0; k < natomwfc; k++) - { - size_t idx = i * nlm_dim * natomwfc + d * natomwfc + k; - nlm[d][k] = std::complex(nlm_out_h[idx].x, nlm_out_h[idx].y); - } - } - - // Insert into output maps - for (int dir = 0; dir < nlm_dim; dir++) - { - nlm_all[dir].insert({iw_index, nlm[dir]}); - } - } - - 
//========================================================================= - // Cleanup - //========================================================================= - - cudaFree(orbitals_d); - cudaFree(projectors_d); - cudaFree(psi_radial_d); - cudaFree(beta_radial_d); - cudaFree(proj_m0_offset_d); - cudaFree(nlm_out_d); - return true; -} - //============================================================================= // Atom-Level GPU Batch Interface - Processes ALL neighbors in single kernel //============================================================================= diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 7feae5907c..8a2c914ca7 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -165,258 +165,6 @@ __device__ __forceinline__ double warp_reduce_sum(double val) return val; } -//============================================================================= -// Main Kernel Implementation - Memory Efficient Version -//============================================================================= - -__global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, - double3 R0, - double3 A, - const OrbitalData* __restrict__ orbitals, - const ProjectorData* __restrict__ projectors, - const double* __restrict__ psi_radial, - const double* __restrict__ beta_radial, - const int* __restrict__ proj_m0_offset, - int num_orbitals, - int nproj, - int natomwfc, - int nlm_dim, - cuDoubleComplex* __restrict__ nlm_out) -{ - - int orb_idx = blockIdx.x; - int proj_idx = blockIdx.y; - int tid = threadIdx.x; - - if (orb_idx >= num_orbitals || proj_idx >= nproj) - { - return; - } - - // Load orbital and projector data - OrbitalData orb = orbitals[orb_idx]; - ProjectorData proj = projectors[proj_idx]; - - int L1 = orb.L1; - int m1 = orb.m1; - int L0 = proj.L0; - int num_m0 = 2 * L0 + 1; - - // Distance check - double3 dRa = make_double3(R0.x - R1.x, R0.y - R1.y, R0.z - R1.z); - double distance = sqrt(dRa.x * dRa.x + dRa.y * dRa.y + dRa.z * dRa.z); - - // Check for NaN - if (isnan(distance)) - { - return; - } - - double Rcut_sum = orb.psi_rcut + proj.beta_rcut; - - if (distance > Rcut_sum) - { - // Zero output - if (tid == 0) - { - int m0_offset = proj_m0_offset[proj_idx]; - int out_base = orb_idx * nlm_dim * natomwfc; - for (int m0 = 0; m0 < num_m0; m0++) - { - for (int d = 0; d < nlm_dim; d++) - { - nlm_out[out_base + d * natomwfc + m0_offset + m0] = make_cuDoubleComplex(0.0, 0.0); - } - } - } - return; - } - - // Reduced shared memory: only for reduction (2 doubles per thread * BLOCK_SIZE) - __shared__ double s_temp_re[BLOCK_SIZE]; - __shared__ double s_temp_im[BLOCK_SIZE]; - - // Radial grid mapping - double xl = (proj.r_max - proj.r_min) * 0.5; - double xmean = (proj.r_max + proj.r_min) * 0.5; - - // Precompute A dot R0 - double A_phase_R0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; - cuDoubleComplex exp_iAR0 = cu_exp_i(A_phase_R0); - - // Result accumulators in registers - one per m0 component - double result_re[MAX_YLM_SIZE]; - double result_im[MAX_YLM_SIZE]; - double result_r_re[3][MAX_YLM_SIZE]; - double result_r_im[3][MAX_YLM_SIZE]; - - // Initialize accumulators - for (int m0 = 0; m0 < num_m0; m0++) - { - result_re[m0] = 0.0; - result_im[m0] = 0.0; - if (nlm_dim == 4) - { - for (int d = 0; d < 3; d++) - { - result_r_re[d][m0] = 0.0; - result_r_im[d][m0] = 0.0; - } - } - } - - // Radial loop - using 
constant memory for Gauss-Legendre grid - for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) - { - double r_val = xmean + xl * d_gl_x[ir]; - double w_rad = xl * d_gl_w[ir]; - - // Angular loop - each thread handles multiple angular points - for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) - { - double leb_x = d_lebedev_x[ian]; - double leb_y = d_lebedev_y[ian]; - double leb_z = d_lebedev_z[ian]; - double w_ang = d_lebedev_w[ian]; - - double rx = r_val * leb_x; - double ry = r_val * leb_y; - double rz = r_val * leb_z; - - double tx = rx + dRa.x; - double ty = ry + dRa.y; - double tz = rz + dRa.z; - double tnorm = sqrt(tx * tx + ty * ty + tz * tz); - - if (tnorm <= orb.psi_rcut) - { - // Compute Y_L1 - double ylm1[MAX_YLM_SIZE]; - if (tnorm > 1e-10) - { - double inv_tnorm = 1.0 / tnorm; - compute_ylm_gpu(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); - } - else - { - compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); - } - - // Compute Y_L0 - double ylm0[MAX_YLM_SIZE]; - compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); - - // Interpolate psi - double psi_val - = interpolate_radial_gpu(psi_radial + orb.psi_offset, orb.psi_mesh, 1.0 / orb.psi_dk, tnorm); - - // Phase factor - double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; - double phase = r_val * A_dot_leb; - cuDoubleComplex exp_iAr = cu_exp_i(phase); - - // Interpolate beta - double beta_val - = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); - - // Y_L1m1 - int offset_L1 = L1 * L1 + m1; - double ylm_L1_val = ylm1[offset_L1]; - - // Combined factor - double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; - cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); - - // Accumulate for all m0 - int offset_L0 = L0 * L0; - for (int m0 = 0; m0 < num_m0; m0++) - { - double ylm0_val = ylm0[offset_L0 + m0]; - - result_re[m0] += common_factor.x * ylm0_val; - result_im[m0] += common_factor.y * ylm0_val; - - if (nlm_dim == 4) - { - double r_op_x = rx + R0.x; - double r_op_y = ry + R0.y; - double r_op_z = rz + R0.z; - - result_r_re[0][m0] += common_factor.x * ylm0_val * r_op_x; - result_r_im[0][m0] += common_factor.y * ylm0_val * r_op_x; - result_r_re[1][m0] += common_factor.x * ylm0_val * r_op_y; - result_r_im[1][m0] += common_factor.y * ylm0_val * r_op_y; - result_r_re[2][m0] += common_factor.x * ylm0_val * r_op_z; - result_r_im[2][m0] += common_factor.y * ylm0_val * r_op_z; - } - } - } - } - } - - // Reduce across threads for each m0 - int m0_offset = proj_m0_offset[proj_idx]; - int out_base = orb_idx * nlm_dim * natomwfc; - - for (int m0 = 0; m0 < num_m0; m0++) - { - // Reduce real part - s_temp_re[tid] = result_re[m0]; - s_temp_im[tid] = result_im[m0]; - __syncthreads(); - - // Standard block reduction - for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) - { - if (tid < s) - { - s_temp_re[tid] += s_temp_re[tid + s]; - s_temp_im[tid] += s_temp_im[tid + s]; - } - __syncthreads(); - } - - if (tid == 0) - { - cuDoubleComplex result = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); - result = cu_mul(result, exp_iAR0); - result = cu_conj(result); - nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; - } - __syncthreads(); - - // Position operator terms - if (nlm_dim == 4) - { - for (int d = 0; d < 3; d++) - { - s_temp_re[tid] = result_r_re[d][m0]; - s_temp_im[tid] = result_r_im[d][m0]; - __syncthreads(); - - for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) - { - if (tid < s) - { - s_temp_re[tid] += s_temp_re[tid + s]; - s_temp_im[tid] += s_temp_im[tid + s]; - 
} - __syncthreads(); - } - - if (tid == 0) - { - cuDoubleComplex result_r = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); - result_r = cu_mul(result_r, exp_iAR0); - result_r = cu_conj(result_r); - nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; - } - __syncthreads(); - } - } - } -} - //============================================================================= // Atom-Level Batch Kernel Implementation // Processes ALL neighbors for a center atom in a single kernel launch diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index f9a620ff5f..c3191653eb 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -100,21 +100,6 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm // Kernel Input/Output Structures //============================================================================= -/** - * @brief Information about a single orbital for batch processing - */ -struct OrbitalData -{ - int L1; // Angular momentum - int m1; // Magnetic quantum number - int N1; // Radial quantum number - int iw_index; // Original orbital index for output mapping - int psi_offset; // Offset in flattened psi array - int psi_mesh; // Number of mesh points - double psi_dk; // Grid spacing - double psi_rcut; // Cutoff radius -}; - /** * @brief Information about a single projector */ @@ -138,41 +123,21 @@ struct NeighborOrbitalData int neighbor_idx; // Which neighbor (ad index) this orbital belongs to double3 R1; // Neighbor atom position (tau1 * lat0) - // Orbital info (same as OrbitalData) - int L1; - int m1; - int N1; - int iw_index; - int psi_offset; - int psi_mesh; - double psi_dk; - double psi_rcut; + // Orbital info + int L1; // Angular momentum + int m1; // Magnetic quantum number + int N1; // Radial quantum number + int iw_index; // Original orbital index for output mapping + int psi_offset; // Offset in flattened psi array + int psi_mesh; // Number of mesh points + double psi_dk; // Grid spacing + double psi_rcut; // Cutoff radius }; //============================================================================= // Main Kernels //============================================================================= -/** - * @brief Neighbor-level batch kernel (original version) - * - * Grid: (num_orbitals, nproj, 1) - * Block: (BLOCK_SIZE, 1, 1) - */ -__global__ void snap_psibeta_neighbor_batch_kernel(double3 R1, - double3 R0, - double3 A, - const OrbitalData* __restrict__ orbitals, - const ProjectorData* __restrict__ projectors, - const double* __restrict__ psi_radial, - const double* __restrict__ beta_radial, - const int* __restrict__ proj_m0_offset, - int num_orbitals, - int nproj, - int natomwfc, - int nlm_dim, - cuDoubleComplex* __restrict__ nlm_out); - /** * @brief Atom-level batch kernel - processes ALL neighbors for a center atom * diff --git a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h index 93e7d483d4..43b617e188 100644 --- a/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h +++ b/source/source_lcao/module_rt/kernels/snap_psibeta_gpu.h @@ -33,22 +33,6 @@ void initialize_gpu_resources(); */ void finalize_gpu_resources(); -/** - * @brief Neighbor-level GPU batch processing interface (legacy) - */ -bool snap_psibeta_neighbor_batch_gpu(const LCAO_Orbitals& orb, - const InfoNonlocal& 
infoNL_, - const int T1, - const ModuleBase::Vector3& R1, - const Atom* atom1, - const std::vector& all_indexes, - const int npol, - const int T0, - const ModuleBase::Vector3& R0, - const ModuleBase::Vector3& A, - std::vector>>>& nlm_all, - const bool calc_r); - /** * @brief Atom-level GPU batch processing interface * From 9047e1b41212a243387b6754a4988f1224f1ab0a Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 19 Dec 2025 16:18:40 +0800 Subject: [PATCH 04/13] perf(gpu): optimize snap_psibeta_atom_batch_kernel with loop restructuring - Move ylm0 computation outside radial loop (saves 140x redundant calculations) - Hoist A_dot_leb and dR calculations outside inner loop - Add #pragma unroll hints for radial and m0 loops Achieves 23.3% speedup on snap_psibeta_gpu (19.27s -> 14.78s). Numerical correctness verified: energy matches baseline (-756.053 Ry). --- .../kernels/cuda/snap_psibeta_kernel.cu | 73 ++++++++++--------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 8a2c914ca7..9d8c57a2ef 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -246,35 +246,50 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } } - // Main integration loop - for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) + // Main integration loop - RESTRUCTURED: angular loop outside, radial inside + // ylm0 only depends on angular direction (leb_x,y,z), not radial distance + // This saves 140x redundant ylm0 computations per angular point + for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) { - double r_val = xmean + xl * d_gl_x[ir]; - double w_rad = xl * d_gl_w[ir]; - - for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) + double leb_x = d_lebedev_x[ian]; + double leb_y = d_lebedev_y[ian]; + double leb_z = d_lebedev_z[ian]; + double w_ang = d_lebedev_w[ian]; + + // Precompute ylm0 ONCE per angular point (doesn't depend on r_val) + double ylm0[MAX_YLM_SIZE]; + compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); + int offset_L0 = L0 * L0; + + // Precompute A dot direction (doesn't depend on r_val) + double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + + // Precompute dR components + double dRx = R0.x - R1.x; + double dRy = R0.y - R1.y; + double dRz = R0.z - R1.z; + + #pragma unroll 4 + for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) { - double leb_x = d_lebedev_x[ian]; - double leb_y = d_lebedev_y[ian]; - double leb_z = d_lebedev_z[ian]; - double w_ang = d_lebedev_w[ian]; + double r_val = xmean + xl * d_gl_x[ir]; + double w_rad = xl * d_gl_w[ir]; // Local position relative to R0 double rx = r_val * leb_x; double ry = r_val * leb_y; double rz = r_val * leb_z; - // Vector from R1 (orbital atom) to integration point (R0 + r_local) - // This matches neighbor_batch: tx = rx + dRa.x where dRa = R0 - R1 - double tx = rx + R0.x - R1.x; - double ty = ry + R0.y - R1.y; - double tz = rz + R0.z - R1.z; + // Vector from R1 to integration point + double tx = rx + dRx; + double ty = ry + dRy; + double tz = rz + dRz; double tnorm = sqrt(tx * tx + ty * ty + tz * tz); - // Check psi cutoff - matches neighbor_batch logic + // Check psi cutoff if (tnorm <= r1_max) { - // Compute Y_L1 - match neighbor_batch handling of small tnorm + // Compute Y_L1 (depends on direction to R1, varies with r_val) double ylm1[MAX_YLM_SIZE]; if (tnorm > 1e-10) { @@ -286,33 +301,25 @@ 
__global__ void snap_psibeta_atom_batch_kernel(double3 R0, compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); } - // Compute Y_L0 - double ylm0[MAX_YLM_SIZE]; - compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); - - // Interpolate psi at tnorm (distance to orbital atom) - double psi_val - = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); + // Interpolate psi + double psi_val = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); - // Phase factor - match neighbor_batch: exp(+i A · r_local) - double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + // Phase factor double phase = r_val * A_dot_leb; cuDoubleComplex exp_iAr = cu_exp_i(phase); // Interpolate beta - double beta_val - = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); + double beta_val = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); // Y_L1m1 - int offset_L1 = L1 * L1 + m1; - double ylm_L1_val = ylm1[offset_L1]; + double ylm_L1_val = ylm1[L1 * L1 + m1]; - // Combined factor - match neighbor_batch exactly + // Combined factor double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); - // Accumulate for all m0 - match neighbor_batch structure - int offset_L0 = L0 * L0; + // Accumulate for all m0 + #pragma unroll for (int m0 = 0; m0 < num_m0; m0++) { double ylm0_val = ylm0[offset_L0 + m0]; From 97b26d6410a38d549cd0c9239f4522e090fbdfff Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 19 Dec 2025 17:13:31 +0800 Subject: [PATCH 05/13] perf(gpu): optimize compute_ylm_gpu with atan2 and sincos - Replace conditional atan branches with single atan2 call - Use sincos() instead of separate sin/cos calls Achieves 8.4% additional speedup (14.78s -> 13.56s) Combined with loop restructuring: 29.6% total from baseline Numerical correctness verified: -756.053 Ry --- .../kernels/cuda/snap_psibeta_kernel.cu | 51 +++++++++---------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 9d8c57a2ef..e706d513b8 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -30,6 +30,7 @@ __constant__ double d_gl_w[RADIAL_GRID_NUM]; /** * @brief Compute real spherical harmonics Y_lm * Based on the recursive formula used in ModuleBase::Ylm + * OPTIMIZED: Use atan2, sincos for better performance */ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm) { @@ -45,38 +46,26 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm return; } - // Compute theta and phi - double rxy = sqrt(x * x + y * y); - double r = sqrt(x * x + y * y + z * z); + // Compute r and cost + double r2 = x * x + y * y + z * z; + double r = sqrt(r2); - double cost, phi; + double cost, sint, phi; if (r < 1e-10) { cost = 1.0; + sint = 0.0; phi = 0.0; } else { cost = z / r; - if (rxy < 1e-10) - { - phi = 0.0; - } - else if (x > 1e-10) - { - phi = atan(y / x); - } - else if (x < -1e-10) - { - phi = atan(y / x) + PI; - } - else - { - phi = (y >= 0.0) ? 
PI / 2.0 : -PI / 2.0; - } + sint = sqrt(1.0 - cost * cost); + // Use atan2 for robust phi computation (replaces multiple conditionals) + phi = atan2(y, x); } - double sint = sqrt(1.0 - cost * cost); + // Ensure sint is non-negative (numerical safety) if (sint < 0.0) { sint = 0.0; @@ -143,10 +132,14 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm } double same = c * sqrt(1.0 / fact_ratio) * SQRT2; - ylm[lm] = same * p[l][m] * cos(m * phi); + // Use sincos for efficiency (computes both sin and cos together) + double sin_mphi, cos_mphi; + sincos(m * phi, &sin_mphi, &cos_mphi); + + ylm[lm] = same * p[l][m] * cos_mphi; lm++; - ylm[lm] = same * p[l][m] * sin(m * phi); + ylm[lm] = same * p[l][m] * sin_mphi; lm++; } } @@ -269,7 +262,7 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double dRy = R0.y - R1.y; double dRz = R0.z - R1.z; - #pragma unroll 4 +#pragma unroll 4 for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) { double r_val = xmean + xl * d_gl_x[ir]; @@ -302,14 +295,16 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } // Interpolate psi - double psi_val = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); + double psi_val + = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); // Phase factor double phase = r_val * A_dot_leb; cuDoubleComplex exp_iAr = cu_exp_i(phase); // Interpolate beta - double beta_val = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); + double beta_val + = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); // Y_L1m1 double ylm_L1_val = ylm1[L1 * L1 + m1]; @@ -318,8 +313,8 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); - // Accumulate for all m0 - #pragma unroll +// Accumulate for all m0 +#pragma unroll for (int m0 = 0; m0 < num_m0; m0++) { double ylm0_val = ylm0[offset_L0 + m0]; From 9bdfdb7aee30839585d0890543cee930c003acda Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 19 Dec 2025 20:19:07 +0800 Subject: [PATCH 06/13] make the code more concise --- .../module_rt/kernels/cuda/snap_psibeta_kernel.cu | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index e706d513b8..25a1cef499 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -72,15 +72,7 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm } // Associated Legendre polynomials P_l^m - double p[MAX_L + 1][MAX_L + 1]; - - for (int l = 0; l <= L; l++) - { - for (int m = 0; m <= L; m++) - { - p[l][m] = 0.0; - } - } + double p[MAX_L + 1][MAX_L + 1] = {0}; p[0][0] = 1.0; From 926fcb6c836a1fb2f60fe84cdff3f4d761cbef69 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 20 Dec 2025 02:35:03 +0800 Subject: [PATCH 07/13] perf(cuda): optimize compute_ylm_gpu with template parameters - Convert compute_ylm_gpu to templated version with L as template param - Use linear array for Legendre polynomials (reduces from 25 to 15 doubles) - Add DISPATCH_YLM macro for runtime-to-template dispatch - Add MAX_M0_SIZE constant for result array sizing - Replace C++17 
constexpr if with regular if for C++14 compatibility - Enable compiler loop unrolling with #pragma unroll Performance: snap_psibeta_gpu improved from 13.27s to 9.83s (1.35x speedup) --- .../kernels/cuda/snap_psibeta_kernel.cu | 100 +++++++++++++----- .../kernels/cuda/snap_psibeta_kernel.cuh | 38 ++++++- 2 files changed, 107 insertions(+), 31 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 25a1cef499..de0d6b5dcd 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -32,11 +32,36 @@ __constant__ double d_gl_w[RADIAL_GRID_NUM]; * Based on the recursive formula used in ModuleBase::Ylm * OPTIMIZED: Use atan2, sincos for better performance */ -__device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm) +/** + * @brief Helper to access linear-stored P_l^m value + * Linear index for (l,m): sum_{i=0}^{l-1}(i+1) + m = l*(l+1)/2 + m + */ +__device__ __forceinline__ double& p_access(double* p, int l, int m) +{ + return p[l * (l + 1) / 2 + m]; +} + +__device__ __forceinline__ double p_get(const double* p, int l, int m) { - const double PI = 3.14159265358979323846; - const double FOUR_PI = 4.0 * PI; - const double SQRT2 = 1.41421356237309504880; + return p[l * (l + 1) / 2 + m]; +} + +/** + * @brief Compute real spherical harmonics Y_lm (TEMPLATED VERSION) + * L is now a compile-time constant for better optimization + * Uses linear array for Legendre polynomials to reduce register pressure + * + * @tparam L Maximum angular momentum (compile-time constant) + * @param x, y, z Direction components (should be normalized) + * @param ylm Output array of size (L+1)^2 + */ +template +__device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) +{ + constexpr double PI = 3.14159265358979323846; + constexpr double FOUR_PI = 4.0 * PI; + constexpr double SQRT2 = 1.41421356237309504880; + constexpr int P_SIZE = (L + 1) * (L + 2) / 2; // Lower triangular storage // Y_00 ylm[0] = 0.5 * sqrt(1.0 / PI); @@ -46,7 +71,7 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm return; } - // Compute r and cost + // Compute cost, sint, phi double r2 = x * x + y * y + z * z; double r = sqrt(r2); @@ -61,82 +86,101 @@ __device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm { cost = z / r; sint = sqrt(1.0 - cost * cost); - // Use atan2 for robust phi computation (replaces multiple conditionals) phi = atan2(y, x); } - // Ensure sint is non-negative (numerical safety) if (sint < 0.0) { sint = 0.0; } - // Associated Legendre polynomials P_l^m - double p[MAX_L + 1][MAX_L + 1] = {0}; + // Associated Legendre polynomials P_l^m in linear storage + double p[P_SIZE]; - p[0][0] = 1.0; + // Initialize + p_access(p, 0, 0) = 1.0; if (L >= 1) { - p[1][0] = cost; - p[1][1] = -sint; + p_access(p, 1, 0) = cost; + p_access(p, 1, 1) = -sint; } +// Recurrence for l >= 2 +#pragma unroll for (int l = 2; l <= L; l++) { +#pragma unroll for (int m = 0; m <= l - 2; m++) { - p[l][m] = ((2 * l - 1) * cost * p[l - 1][m] - (l - 1 + m) * p[l - 2][m]) / (double)(l - m); + p_access(p, l, m) + = ((2 * l - 1) * cost * p_get(p, l - 1, m) - (l - 1 + m) * p_get(p, l - 2, m)) / (double)(l - m); } - p[l][l - 1] = (2 * l - 1) * cost * p[l - 1][l - 1]; + p_access(p, l, l - 1) = (2 * l - 1) * cost * p_get(p, l - 1, l - 1); + + // Compute (2l-1)!! 
= 1*3*5*...*(2l-1) double fact = 1.0; +#pragma unroll for (int i = 1; i <= 2 * l - 1; i += 2) { fact *= i; } + + // Compute sint^l double sint_pow = 1.0; +#pragma unroll for (int i = 0; i < l; i++) { sint_pow *= sint; } - p[l][l] = fact * sint_pow; + + p_access(p, l, l) = fact * sint_pow; if (l % 2 == 1) { - p[l][l] = -p[l][l]; + p_access(p, l, l) = -p_access(p, l, l); } } - // Compute Y_lm + // Compute Y_lm from P_l^m int lm = 0; +#pragma unroll for (int l = 0; l <= L; l++) { double c = sqrt((2.0 * l + 1.0) / FOUR_PI); - ylm[lm] = c * p[l][0]; + ylm[lm] = c * p_get(p, l, 0); lm++; +#pragma unroll for (int m = 1; m <= l; m++) { double fact_ratio = 1.0; +#pragma unroll for (int i = l - m + 1; i <= l + m; i++) { fact_ratio *= i; } double same = c * sqrt(1.0 / fact_ratio) * SQRT2; - // Use sincos for efficiency (computes both sin and cos together) double sin_mphi, cos_mphi; sincos(m * phi, &sin_mphi, &cos_mphi); - ylm[lm] = same * p[l][m] * cos_mphi; + ylm[lm] = same * p_get(p, l, m) * cos_mphi; lm++; - ylm[lm] = same * p[l][m] * sin_mphi; + ylm[lm] = same * p_get(p, l, m) * sin_mphi; lm++; } } } +// Explicit template instantiations for L = 0, 1, 2, 3, 4 +template __device__ void compute_ylm_gpu<0>(double x, double y, double z, double* ylm); +template __device__ void compute_ylm_gpu<1>(double x, double y, double z, double* ylm); +template __device__ void compute_ylm_gpu<2>(double x, double y, double z, double* ylm); +template __device__ void compute_ylm_gpu<3>(double x, double y, double z, double* ylm); +template __device__ void compute_ylm_gpu<4>(double x, double y, double z, double* ylm); + //============================================================================= // Warp Reduction for double //============================================================================= @@ -214,10 +258,10 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, __shared__ double s_temp_im[BLOCK_SIZE]; // Initialize accumulators in registers - double result_re[MAX_YLM_SIZE]; - double result_im[MAX_YLM_SIZE]; - double result_r_re[3][MAX_YLM_SIZE]; - double result_r_im[3][MAX_YLM_SIZE]; + double result_re[MAX_M0_SIZE]; + double result_im[MAX_M0_SIZE]; + double result_r_re[3][MAX_M0_SIZE]; + double result_r_im[3][MAX_M0_SIZE]; int num_m0 = 2 * L0 + 1; for (int m0 = 0; m0 < num_m0; m0++) @@ -243,7 +287,7 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, // Precompute ylm0 ONCE per angular point (doesn't depend on r_val) double ylm0[MAX_YLM_SIZE]; - compute_ylm_gpu(L0, leb_x, leb_y, leb_z, ylm0); + DISPATCH_YLM(L0, leb_x, leb_y, leb_z, ylm0); int offset_L0 = L0 * L0; // Precompute A dot direction (doesn't depend on r_val) @@ -279,11 +323,11 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, if (tnorm > 1e-10) { double inv_tnorm = 1.0 / tnorm; - compute_ylm_gpu(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); + DISPATCH_YLM(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); } else { - compute_ylm_gpu(L1, 0.0, 0.0, 1.0, ylm1); + DISPATCH_YLM(L1, 0.0, 0.0, 1.0, ylm1); } // Interpolate psi diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index c3191653eb..8c00e4cc5a 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -15,6 +15,7 @@ constexpr int ANGULAR_GRID_NUM = 110; constexpr int BLOCK_SIZE = 128; // >= ANGULAR_GRID_NUM for efficient reduction constexpr int MAX_L = 
4; // Maximum angular momentum supported constexpr int MAX_YLM_SIZE = (MAX_L + 1) * (MAX_L + 1); // = 25 +constexpr int MAX_M0_SIZE = 2 * MAX_L + 1; // = 9 //============================================================================= // Device Helper Functions @@ -88,13 +89,44 @@ __device__ __forceinline__ cuDoubleComplex cu_mul_real(cuDoubleComplex a, double /** * @brief Compute real spherical harmonics Y_lm at given direction (x, y, z) - * This is a simplified version for small L values + * TEMPLATED VERSION - L is compile-time constant for optimization * - * @param L Angular momentum (0 <= L <= MAX_L) + * @tparam L Maximum angular momentum (0 <= L <= MAX_L) * @param x, y, z Direction (should be normalized) * @param ylm Output array of size (L+1)^2 */ -__device__ void compute_ylm_gpu(int L, double x, double y, double z, double* ylm); +template +__device__ void compute_ylm_gpu(double x, double y, double z, double* ylm); + +/** + * @brief Runtime dispatch macro for templated compute_ylm_gpu + * Converts runtime L to compile-time template call + */ +#define DISPATCH_YLM(L_val, x, y, z, ylm) \ + do \ + { \ + switch (L_val) \ + { \ + case 0: \ + compute_ylm_gpu<0>(x, y, z, ylm); \ + break; \ + case 1: \ + compute_ylm_gpu<1>(x, y, z, ylm); \ + break; \ + case 2: \ + compute_ylm_gpu<2>(x, y, z, ylm); \ + break; \ + case 3: \ + compute_ylm_gpu<3>(x, y, z, ylm); \ + break; \ + case 4: \ + compute_ylm_gpu<4>(x, y, z, ylm); \ + break; \ + default: \ + compute_ylm_gpu<4>(x, y, z, ylm); \ + break; \ + } \ + } while (0) //============================================================================= // Kernel Input/Output Structures From 7ccabc5a04fa628992581ea2c838076f72fcab86 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 20 Dec 2025 02:36:20 +0800 Subject: [PATCH 08/13] perf(cuda): use warp shuffle for reduction in snap_psibeta kernel - Replace shared memory tree reduction with warp shuffle reduction - Use warp_reduce_sum for intra-warp reduction (faster shuffle ops) - Reduce shared memory from BLOCK_SIZE (2KB) to NUM_WARPS (64 bytes) - Cross-warp reduction done by first warp reading from shared memory Reduces register usage from 94 to 88, shared memory from 2KB to 64 bytes. 
--- .../kernels/cuda/snap_psibeta_kernel.cu | 85 +++++++++++-------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index de0d6b5dcd..9f9fd4c099 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -253,9 +253,10 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double AdotR0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; cuDoubleComplex exp_iAR0 = cu_exp_i(AdotR0); - // Shared memory for reduction - __shared__ double s_temp_re[BLOCK_SIZE]; - __shared__ double s_temp_im[BLOCK_SIZE]; + // Shared memory for warp-level reduction (only need space for warp sums) + constexpr int NUM_WARPS = BLOCK_SIZE / 32; // 128 / 32 = 4 warps + __shared__ double s_temp_re[NUM_WARPS]; + __shared__ double s_temp_im[NUM_WARPS]; // Initialize accumulators in registers double result_re[MAX_M0_SIZE]; @@ -376,31 +377,40 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } } - // Reduction and output - OUTSIDE radial loop + // Reduction and output using warp shuffle - more efficient than shared memory int out_base = norb_idx * nlm_dim * natomwfc; + int warp_id = tid / 32; + int lane_id = tid % 32; for (int m0 = 0; m0 < num_m0; m0++) { - s_temp_re[tid] = result_re[m0]; - s_temp_im[tid] = result_im[m0]; - __syncthreads(); + // Step 1: Warp-level reduction using shuffle + double sum_re = warp_reduce_sum(result_re[m0]); + double sum_im = warp_reduce_sum(result_im[m0]); - for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + // Step 2: First lane of each warp writes to shared memory + if (lane_id == 0) { - if (tid < s) - { - s_temp_re[tid] += s_temp_re[tid + s]; - s_temp_im[tid] += s_temp_im[tid + s]; - } - __syncthreads(); + s_temp_re[warp_id] = sum_re; + s_temp_im[warp_id] = sum_im; } + __syncthreads(); - if (tid == 0) + // Step 3: First warp reduces across all warps + if (warp_id == 0) { - cuDoubleComplex result = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); - result = cu_mul(result, exp_iAR0); - result = cu_conj(result); - nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; + sum_re = (lane_id < NUM_WARPS) ? s_temp_re[lane_id] : 0.0; + sum_im = (lane_id < NUM_WARPS) ? 
s_temp_im[lane_id] : 0.0; + sum_re = warp_reduce_sum(sum_re); + sum_im = warp_reduce_sum(sum_im); + + if (lane_id == 0) + { + cuDoubleComplex result = make_cuDoubleComplex(sum_re, sum_im); + result = cu_mul(result, exp_iAR0); + result = cu_conj(result); + nlm_out[out_base + 0 * natomwfc + m0_offset + m0] = result; + } } __syncthreads(); @@ -408,26 +418,33 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, { for (int d = 0; d < 3; d++) { - s_temp_re[tid] = result_r_re[d][m0]; - s_temp_im[tid] = result_r_im[d][m0]; - __syncthreads(); + // Step 1: Warp-level reduction + double sum_r_re = warp_reduce_sum(result_r_re[d][m0]); + double sum_r_im = warp_reduce_sum(result_r_im[d][m0]); - for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) + // Step 2: First lane writes to shared memory + if (lane_id == 0) { - if (tid < s) - { - s_temp_re[tid] += s_temp_re[tid + s]; - s_temp_im[tid] += s_temp_im[tid + s]; - } - __syncthreads(); + s_temp_re[warp_id] = sum_r_re; + s_temp_im[warp_id] = sum_r_im; } + __syncthreads(); - if (tid == 0) + // Step 3: First warp reduces across warps + if (warp_id == 0) { - cuDoubleComplex result_r = make_cuDoubleComplex(s_temp_re[0], s_temp_im[0]); - result_r = cu_mul(result_r, exp_iAR0); - result_r = cu_conj(result_r); - nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; + sum_r_re = (lane_id < NUM_WARPS) ? s_temp_re[lane_id] : 0.0; + sum_r_im = (lane_id < NUM_WARPS) ? s_temp_im[lane_id] : 0.0; + sum_r_re = warp_reduce_sum(sum_r_re); + sum_r_im = warp_reduce_sum(sum_r_im); + + if (lane_id == 0) + { + cuDoubleComplex result_r = make_cuDoubleComplex(sum_r_re, sum_r_im); + result_r = cu_mul(result_r, exp_iAR0); + result_r = cu_conj(result_r); + nlm_out[out_base + (d + 1) * natomwfc + m0_offset + m0] = result_r; + } } __syncthreads(); } From 0afca47efd36c0d9e77d445a586cd83fb69b9190 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 20 Dec 2025 13:35:50 +0800 Subject: [PATCH 09/13] refactor(cuda): improve snap_psibeta_kernel code organization and documentation - Add comprehensive file headers explaining purpose and key features - Organize code into logical sections with clear separators - Add doxygen-style documentation for all functions, structs, and constants - Fix inaccurate comments (BLOCK_SIZE requirement, direction vector normalization) - Remove unused variables (dR, distance01) - Remove finalize_gpu_resources() as it's not needed for constant memory - Improve inline comments explaining algorithms and optimizations --- .../module_operator_lcao/td_nonlocal_lcao.cpp | 6 - .../kernels/cuda/snap_psibeta_gpu.cu | 132 +++++-- .../kernels/cuda/snap_psibeta_kernel.cu | 359 +++++++++++------- .../kernels/cuda/snap_psibeta_kernel.cuh | 244 ++++++++---- 4 files changed, 484 insertions(+), 257 deletions(-) diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp index 6d2972b92c..63f4590056 100644 --- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp @@ -306,12 +306,6 @@ void hamilt::TDNonlocal>::calculate_HR() } } // end omp parallel for matrix assembly } // end for iat0 - -#ifdef __CUDA - // Release GPU resources at end of calculation - module_rt::gpu::finalize_gpu_resources(); -#endif - ModuleBase::timer::tick("TDNonlocal", "calculate_HR"); } diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index 
f630aec8c2..026af2d1d0 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -1,3 +1,16 @@ +/** + * @file snap_psibeta_gpu.cu + * @brief Host-side GPU interface for overlap computation + * + * This file provides the high-level interface for GPU-accelerated computation + * of overlap integrals between atomic orbitals (psi) and non-local projectors + * (beta). It handles: + * - GPU resource initialization and cleanup + * - Data marshalling from ABACUS structures to GPU-friendly formats + * - Kernel launch configuration + * - Result unpacking back to ABACUS data structures + */ + #include "../snap_psibeta_gpu.h" #include "snap_psibeta_kernel.cuh" #include "source_base/timer.h" @@ -12,9 +25,12 @@ namespace gpu { //============================================================================= -// CUDA Error Checking Macro +// CUDA Error Handling //============================================================================= +/** + * @brief CUDA error checking macro with file/line information + */ #define CUDA_CHECK(call) \ do \ { \ @@ -26,12 +42,21 @@ namespace gpu } while (0) //============================================================================= -// Resource Management - Simplified (using constant memory for grids) +// GPU Resource Management //============================================================================= +/** + * @brief Initialize GPU resources for snap_psibeta computation + * + * Checks for available CUDA devices and copies integration grids + * (Lebedev-Laikov angular and Gauss-Legendre radial) to constant memory. + * + * @note Call this once at the start of a calculation session before any + * snap_psibeta_atom_batch_gpu calls. + */ void initialize_gpu_resources() { - // Check for CUDA device + // Verify CUDA device availability int device_count = 0; cudaError_t err = cudaGetDeviceCount(&device_count); if (err != cudaSuccess || device_count == 0) @@ -40,24 +65,58 @@ void initialize_gpu_resources() return; } - // Copy integration grids (Lebedev and Gauss-Legendre) to constant memory + // Initialize integration grids in constant memory copy_grids_to_device(); - // Sync to ensure all initialization is complete + // Synchronize to ensure initialization is complete cudaDeviceSynchronize(); } -void finalize_gpu_resources() +//============================================================================= +// Internal Helper Structures +//============================================================================= + +/** + * @brief Mapping structure for reconstructing output data + * + * Associates each orbital in the flattened GPU array with its original + * neighbor and orbital indices for proper result placement. 
+ */ +struct OrbitalMapping { - // No explicit cleanup needed - constant memory is managed by CUDA runtime - // Just reset any CUDA errors that may have accumulated - cudaGetLastError(); -} + int neighbor_idx; ///< Index of neighbor atom in adjacency list + int iw_index; ///< Global orbital index for output mapping +}; //============================================================================= -// Atom-Level GPU Batch Interface - Processes ALL neighbors in single kernel +// Main GPU Interface Function //============================================================================= +/** + * @brief Compute overlap integrals on GPU + * + * This function processes ALL neighbor atoms for a single center atom (where + * the projectors are located) in a single kernel launch, providing significant + * performance improvement over per-neighbor processing. + * + * Workflow: + * 1. Collect all (neighbor, orbital) pairs into flattened arrays + * 2. Prepare projector data for the center atom + * 3. Transfer data to GPU and launch kernel + * 4. Retrieve results and reconstruct nlm_tot structure + * + * @param orb LCAO orbital information + * @param infoNL_ Non-local projector information + * @param T0 Atom type of center atom (projector location) + * @param R0 Position of center atom + * @param A Vector potential for phase factor + * @param adjs Adjacent atom information + * @param ucell Unit cell information + * @param paraV Parallel orbital distribution + * @param npol Number of spin polarizations + * @param nlm_dim Output dimension (1 for overlap only, 4 for overlap + current) + * @param nlm_tot Output: overlap integrals indexed as [neighbor][direction][orbital] + */ void snap_psibeta_atom_batch_gpu( const LCAO_Orbitals& orb, const InfoNonlocal& infoNL_, @@ -73,7 +132,10 @@ void snap_psibeta_atom_batch_gpu( { ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); - // Get projector info + //========================================================================= + // Early exit if no projectors on center atom + //========================================================================= + const int nproj = infoNL_.nproj[T0]; if (nproj == 0) { @@ -81,13 +143,19 @@ void snap_psibeta_atom_batch_gpu( return; } - // Calculate natomwfc - int natomwfc = 0; + //========================================================================= + // Compute projector output indices + //========================================================================= + + int natomwfc = 0; // Total number of projector components std::vector proj_m0_offset_h(nproj); + for (int ip = 0; ip < nproj; ip++) { proj_m0_offset_h[ip] = natomwfc; int L0 = infoNL_.Beta[T0].Proj[ip].getL(); + + // Validate angular momentum if (L0 > MAX_L) { fprintf(stderr, @@ -99,18 +167,11 @@ void snap_psibeta_atom_batch_gpu( } //========================================================================= - // Collect all neighbor-orbital pairs + // Collect all (neighbor, orbital) pairs //========================================================================= std::vector neighbor_orbitals_h; std::vector psi_radial_h; - - // Track neighbor index for each orbital for output reconstruction - struct OrbitalMapping - { - int neighbor_idx; - int iw_index; - }; std::vector orbital_mappings; for (int ad = 0; ad < adjs.adj_num + 1; ++ad) @@ -121,7 +182,7 @@ void snap_psibeta_atom_batch_gpu( const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; const Atom* atom1 = &ucell->atoms[T1]; - // Get orbital indices + // Get unique orbital indices (union of row and 
column indices) auto all_indexes = paraV->get_indexes_row(iat1); auto col_indexes = paraV->get_indexes_col(iat1); all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); @@ -136,19 +197,19 @@ void snap_psibeta_atom_batch_gpu( const int m1 = atom1->iw2m[iw1]; const int N1 = atom1->iw2n[iw1]; + // Skip orbitals with angular momentum beyond supported limit if (L1 > MAX_L) { - continue; // Skip orbitals with L > MAX_L + continue; } - // Get orbital radial data - // Use getPsi() to match CPU implementation (not getPsi_r() which returns psi*r) + // Get orbital radial function (use getPsi(), not getPsi_r()) const double* phi_psi = orb.Phi[T1].PhiLN(L1, N1).getPsi(); int mesh = orb.Phi[T1].PhiLN(L1, N1).getNr(); double dk = orb.Phi[T1].PhiLN(L1, N1).getDk(); double rcut = orb.Phi[T1].getRcut(); - // Add to flattened psi array + // Append to flattened psi array size_t psi_offset = psi_radial_h.size(); psi_radial_h.insert(psi_radial_h.end(), phi_psi, phi_psi + mesh); @@ -167,6 +228,7 @@ void snap_psibeta_atom_batch_gpu( neighbor_orbitals_h.push_back(norb); + // Track mapping for result reconstruction OrbitalMapping mapping; mapping.neighbor_idx = ad; mapping.iw_index = all_indexes[iw1l]; @@ -203,7 +265,6 @@ void snap_psibeta_atom_batch_gpu( projectors_h[ip].beta_mesh = mesh; projectors_h[ip].beta_dk = dk; projectors_h[ip].beta_rcut = rcut; - // Use actual radial grid range projectors_h[ip].r_min = radial[0]; projectors_h[ip].r_max = radial[mesh - 1]; @@ -211,7 +272,7 @@ void snap_psibeta_atom_batch_gpu( } //========================================================================= - // Allocate Device Memory + // Allocate GPU memory //========================================================================= NeighborOrbitalData* neighbor_orbitals_d = nullptr; @@ -231,7 +292,7 @@ void snap_psibeta_atom_batch_gpu( CUDA_CHECK(cudaMalloc(&nlm_out_d, output_size * sizeof(cuDoubleComplex))); //========================================================================= - // Copy Data to Device + // Transfer data to GPU //========================================================================= CUDA_CHECK(cudaMemcpy(neighbor_orbitals_d, @@ -247,7 +308,7 @@ void snap_psibeta_atom_batch_gpu( CUDA_CHECK(cudaMemset(nlm_out_d, 0, output_size * sizeof(cuDoubleComplex))); //========================================================================= - // Launch Kernel - SINGLE launch for ALL neighbors! 
+ // Launch kernel //========================================================================= double3 R0_d3 = make_double3(R0.x, R0.y, R0.z); @@ -269,6 +330,7 @@ void snap_psibeta_atom_batch_gpu( nlm_dim, nlm_out_d); + // Check for launch errors cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -286,14 +348,14 @@ void snap_psibeta_atom_batch_gpu( CUDA_CHECK(cudaDeviceSynchronize()); //========================================================================= - // Copy Results Back + // Retrieve results //========================================================================= std::vector nlm_out_h(output_size); CUDA_CHECK(cudaMemcpy(nlm_out_h.data(), nlm_out_d, output_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost)); //========================================================================= - // Reconstruct nlm_tot structure + // Reconstruct output structure //========================================================================= for (int i = 0; i < total_neighbor_orbitals; i++) @@ -312,7 +374,7 @@ void snap_psibeta_atom_batch_gpu( } } - // Insert into nlm_tot[ad][dir][iw_index] + // Insert into nlm_tot[neighbor][direction][orbital] for (int dir = 0; dir < nlm_dim; dir++) { nlm_tot[ad][dir].insert({iw_index, nlm[dir]}); @@ -320,7 +382,7 @@ void snap_psibeta_atom_batch_gpu( } //========================================================================= - // Cleanup + // Cleanup GPU memory //========================================================================= cudaFree(neighbor_orbitals_d); diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 9f9fd4c099..8d29015add 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -1,3 +1,16 @@ +/** + * @file snap_psibeta_kernel.cu + * @brief CUDA kernel implementation for overlap integrals + * + * This file implements the GPU-accelerated numerical integration for computing + * overlap integrals between atomic orbitals (psi) and non-local projectors (beta). 
+ * The implementation uses: + * - Lebedev-Laikov quadrature (110 points) for angular integration + * - Gauss-Legendre quadrature (140 points) for radial integration + * - Templated spherical harmonics with compile-time L for optimization + * - Warp-level shuffle reduction for efficient parallel summation + */ + #include "snap_psibeta_kernel.cuh" #include "source_base/math_integral.h" @@ -10,60 +23,75 @@ namespace gpu { //============================================================================= -// Constant Memory for Integration Grids +// Constant Memory - Integration Grids //============================================================================= -// Lebedev-Laikov angular grid (110 points) -__constant__ double d_lebedev_x[ANGULAR_GRID_NUM]; -__constant__ double d_lebedev_y[ANGULAR_GRID_NUM]; -__constant__ double d_lebedev_z[ANGULAR_GRID_NUM]; -__constant__ double d_lebedev_w[ANGULAR_GRID_NUM]; +// Lebedev-Laikov angular quadrature grid (110 points) +__constant__ double d_lebedev_x[ANGULAR_GRID_NUM]; ///< x-direction cosines +__constant__ double d_lebedev_y[ANGULAR_GRID_NUM]; ///< y-direction cosines +__constant__ double d_lebedev_z[ANGULAR_GRID_NUM]; ///< z-direction cosines +__constant__ double d_lebedev_w[ANGULAR_GRID_NUM]; ///< Angular integration weights -// Gauss-Legendre radial grid (140 points) -__constant__ double d_gl_x[RADIAL_GRID_NUM]; -__constant__ double d_gl_w[RADIAL_GRID_NUM]; +// Gauss-Legendre radial quadrature grid (140 points) +__constant__ double d_gl_x[RADIAL_GRID_NUM]; ///< Quadrature abscissae on [-1, 1] +__constant__ double d_gl_w[RADIAL_GRID_NUM]; ///< Quadrature weights //============================================================================= -// Spherical Harmonics Implementation +// Spherical Harmonics - Helper Functions //============================================================================= /** - * @brief Compute real spherical harmonics Y_lm - * Based on the recursive formula used in ModuleBase::Ylm - * OPTIMIZED: Use atan2, sincos for better performance - */ -/** - * @brief Helper to access linear-stored P_l^m value - * Linear index for (l,m): sum_{i=0}^{l-1}(i+1) + m = l*(l+1)/2 + m + * @brief Access element in lower-triangular stored Legendre polynomial array + * + * For associated Legendre polynomials P_l^m, we only need 0 <= m <= l. + * Storage layout: P_0^0, P_1^0, P_1^1, P_2^0, P_2^1, P_2^2, ... + * Linear index: l*(l+1)/2 + m */ __device__ __forceinline__ double& p_access(double* p, int l, int m) { return p[l * (l + 1) / 2 + m]; } +/** + * @brief Read-only access to Legendre polynomial array + */ __device__ __forceinline__ double p_get(const double* p, int l, int m) { return p[l * (l + 1) / 2 + m]; } +//============================================================================= +// Spherical Harmonics - Main Implementation +//============================================================================= + /** - * @brief Compute real spherical harmonics Y_lm (TEMPLATED VERSION) - * L is now a compile-time constant for better optimization - * Uses linear array for Legendre polynomials to reduce register pressure + * @brief Compute real spherical harmonics Y_lm (templated version) + * + * Uses the recursive computation of associated Legendre polynomials: + * P_l^m = ((2l-1)*cos(theta)*P_{l-1}^m - (l-1+m)*P_{l-2}^m) / (l-m) + * P_l^{l-1} = (2l-1)*cos(theta)*P_{l-1}^{l-1} + * P_l^l = (-1)^l * (2l-1)!! 
* sin^l(theta) + * + * Real spherical harmonics are defined as: + * Y_{lm} = c_l * P_l^0 for m = 0 + * Y_{l,2m-1} = c_l * sqrt(2/(l-m)!/(l+m)!) * P_l^m * cos(m*phi) for m > 0 + * Y_{l,2m} = c_l * sqrt(2/(l-m)!/(l+m)!) * P_l^m * sin(m*phi) for m > 0 + * where c_l = sqrt((2l+1)/(4*pi)) * * @tparam L Maximum angular momentum (compile-time constant) - * @param x, y, z Direction components (should be normalized) - * @param ylm Output array of size (L+1)^2 + * @param x, y, z Direction vector components (need not be normalized) + * @param ylm Output array storing Y_lm values in order: Y_00, Y_10, Y_11c, Y_11s, ... */ template __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) { + // Mathematical constants constexpr double PI = 3.14159265358979323846; constexpr double FOUR_PI = 4.0 * PI; constexpr double SQRT2 = 1.41421356237309504880; - constexpr int P_SIZE = (L + 1) * (L + 2) / 2; // Lower triangular storage + constexpr int P_SIZE = (L + 1) * (L + 2) / 2; // Lower triangular storage size - // Y_00 + // Y_00 = 1/(2*sqrt(pi)) ylm[0] = 0.5 * sqrt(1.0 / PI); if (L == 0) @@ -71,13 +99,14 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) return; } - // Compute cost, sint, phi + // Compute spherical angles double r2 = x * x + y * y + z * z; double r = sqrt(r2); double cost, sint, phi; if (r < 1e-10) { + // At origin, default to z-axis direction cost = 1.0; sint = 0.0; phi = 0.0; @@ -89,86 +118,92 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) phi = atan2(y, x); } + // Ensure sint is non-negative (numerical safety) if (sint < 0.0) { sint = 0.0; } - // Associated Legendre polynomials P_l^m in linear storage + // Associated Legendre polynomials P_l^m in lower-triangular storage double p[P_SIZE]; - // Initialize + // Base cases p_access(p, 0, 0) = 1.0; if (L >= 1) { - p_access(p, 1, 0) = cost; - p_access(p, 1, 1) = -sint; + p_access(p, 1, 0) = cost; // P_1^0 = cos(theta) + p_access(p, 1, 1) = -sint; // P_1^1 = -sin(theta) } -// Recurrence for l >= 2 + // Recurrence relations for l >= 2 #pragma unroll for (int l = 2; l <= L; l++) { + // P_l^m for m = 0 to l-2: standard recurrence #pragma unroll for (int m = 0; m <= l - 2; m++) { - p_access(p, l, m) - = ((2 * l - 1) * cost * p_get(p, l - 1, m) - (l - 1 + m) * p_get(p, l - 2, m)) / (double)(l - m); + p_access(p, l, m) = ((2 * l - 1) * cost * p_get(p, l - 1, m) - (l - 1 + m) * p_get(p, l - 2, m)) + / static_cast(l - m); } + + // P_l^{l-1} = (2l-1) * cos(theta) * P_{l-1}^{l-1} p_access(p, l, l - 1) = (2 * l - 1) * cost * p_get(p, l - 1, l - 1); - // Compute (2l-1)!! = 1*3*5*...*(2l-1) - double fact = 1.0; + // P_l^l = (-1)^l * (2l-1)!! 
* sin^l(theta) + double double_factorial = 1.0; #pragma unroll for (int i = 1; i <= 2 * l - 1; i += 2) { - fact *= i; + double_factorial *= i; } - // Compute sint^l - double sint_pow = 1.0; + double sint_power = 1.0; #pragma unroll for (int i = 0; i < l; i++) { - sint_pow *= sint; + sint_power *= sint; } - p_access(p, l, l) = fact * sint_pow; + p_access(p, l, l) = double_factorial * sint_power; if (l % 2 == 1) { p_access(p, l, l) = -p_access(p, l, l); } } - // Compute Y_lm from P_l^m + // Transform Legendre polynomials to real spherical harmonics int lm = 0; #pragma unroll for (int l = 0; l <= L; l++) { double c = sqrt((2.0 * l + 1.0) / FOUR_PI); + // m = 0 component ylm[lm] = c * p_get(p, l, 0); lm++; + // m > 0 components (cosine and sine parts) #pragma unroll for (int m = 1; m <= l; m++) { - double fact_ratio = 1.0; + // Compute normalization factor: sqrt(2 * (l-m)! / (l+m)!) + double factorial_ratio = 1.0; #pragma unroll for (int i = l - m + 1; i <= l + m; i++) { - fact_ratio *= i; + factorial_ratio *= i; } - double same = c * sqrt(1.0 / fact_ratio) * SQRT2; + double norm = c * sqrt(1.0 / factorial_ratio) * SQRT2; double sin_mphi, cos_mphi; sincos(m * phi, &sin_mphi, &cos_mphi); - ylm[lm] = same * p_get(p, l, m) * cos_mphi; + ylm[lm] = norm * p_get(p, l, m) * cos_mphi; // Y_{l,m} cosine part lm++; - ylm[lm] = same * p_get(p, l, m) * sin_mphi; + ylm[lm] = norm * p_get(p, l, m) * sin_mphi; // Y_{l,m} sine part lm++; } } @@ -182,9 +217,18 @@ template __device__ void compute_ylm_gpu<3>(double x, double y, double z, double template __device__ void compute_ylm_gpu<4>(double x, double y, double z, double* ylm); //============================================================================= -// Warp Reduction for double +// Warp-Level Reduction //============================================================================= +/** + * @brief Warp-level sum reduction using shuffle instructions + * + * Performs a parallel reduction within a warp (32 threads) using __shfl_down_sync. + * After this function, lane 0 contains the sum of all input values in the warp. + * + * @param val Input value from each thread + * @return Sum across all threads in the warp (valid only in lane 0) + */ __device__ __forceinline__ double warp_reduce_sum(double val) { for (int offset = 16; offset > 0; offset /= 2) @@ -195,10 +239,20 @@ __device__ __forceinline__ double warp_reduce_sum(double val) } //============================================================================= -// Atom-Level Batch Kernel Implementation -// Processes ALL neighbors for a center atom in a single kernel launch +// Main Kernel Implementation //============================================================================= +/** + * @brief Atom-level batch kernel for overlap integrals + * + * Integration is performed using restructured loops for efficiency: + * - Outer loop: angular points (each thread handles different angles) + * - Inner loop: radial points (each thread accumulates all radii) + * + * This structure exploits the fact that Y_lm for the projector (ylm0) only + * depends on the angular direction, not the radial distance, saving + * RADIAL_GRID_NUM redundant ylm0 computations per angular point. 
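+ *
+ * Schematically, the quantity accumulated in the loops below for each
+ * (orbital, projector, m0) combination is
+ *   nlm ≈ conj( exp(i A·R0) * sum_{ang,rad} w_ang * w_rad * r * beta(r)
+ *               * Y_{L0,m0}(u) * psi(|t|) * Y_{L1,m1}(t/|t|) * exp(i A·(r u)) )
+ * where u is the Lebedev direction, r the mapped Gauss-Legendre radius and
+ * t = r*u + R0 - R1; the current-operator components (nlm_dim == 4) carry an
+ * extra factor of the Cartesian components of (r*u + R0).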
+ */ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double3 A, const NeighborOrbitalData* __restrict__ neighbor_orbitals, @@ -212,59 +266,74 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, int nlm_dim, cuDoubleComplex* __restrict__ nlm_out) { - int norb_idx = blockIdx.x; // Which (neighbor, orbital) pair - int proj_idx = blockIdx.y; // Which projector - int tid = threadIdx.x; + // Thread/block indices + const int norb_idx = blockIdx.x; // Which (neighbor, orbital) pair + const int proj_idx = blockIdx.y; // Which projector + const int tid = threadIdx.x; + // Early exit for out-of-bounds blocks if (norb_idx >= total_neighbor_orbitals || proj_idx >= nproj) { return; } - // Load neighbor-orbital data (includes R1) + //------------------------------------------------------------------------- + // Load input data + //------------------------------------------------------------------------- + const NeighborOrbitalData& norb = neighbor_orbitals[norb_idx]; const ProjectorData& proj = projectors[proj_idx]; - double3 R1 = norb.R1; - int L1 = norb.L1; - int m1 = norb.m1; - int L0 = proj.L0; - int m0_offset = proj_m0_offset[proj_idx]; + const double3 R1 = norb.R1; + const int L1 = norb.L1; + const int m1 = norb.m1; + const int L0 = proj.L0; + const int m0_offset = proj_m0_offset[proj_idx]; - // Skip if L values exceed our precomputed limit + // Skip if angular momentum exceeds supported limit if (L1 > MAX_L || L0 > MAX_L) { return; } + //------------------------------------------------------------------------- // Compute geometry - double3 dR = make_double3(R1.x - R0.x, R1.y - R0.y, R1.z - R0.z); - double distance01 = sqrt(dR.x * dR.x + dR.y * dR.y + dR.z * dR.z); - - double r1_max = norb.psi_rcut; - // Integration range: use projector's radial grid range [r_min, r_max] - // This matches the CPU implementation in snap_psibeta_half_tddft.cpp - double r_min = proj.r_min; - double r_max = proj.r_max; - double xl = 0.5 * (r_max - r_min); - double xmean = 0.5 * (r_max + r_min); - - // Phase factor exp(i A · R0) - double AdotR0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; - cuDoubleComplex exp_iAR0 = cu_exp_i(AdotR0); - - // Shared memory for warp-level reduction (only need space for warp sums) + //------------------------------------------------------------------------- + + // Note: dR (R1 - R0) is computed inline as dRx/dRy/dRz in the integration loop + + // Orbital cutoff + const double r1_max = norb.psi_rcut; + + // Integration range from projector radial grid + const double r_min = proj.r_min; + const double r_max = proj.r_max; + const double xl = 0.5 * (r_max - r_min); // Half-range for Gauss-Legendre + const double xmean = 0.5 * (r_max + r_min); // Midpoint + + // Phase factor exp(i * A · R0) + const double AdotR0 = A.x * R0.x + A.y * R0.y + A.z * R0.z; + const cuDoubleComplex exp_iAR0 = cu_exp_i(AdotR0); + + //------------------------------------------------------------------------- + // Shared memory for warp reduction + //------------------------------------------------------------------------- + constexpr int NUM_WARPS = BLOCK_SIZE / 32; // 128 / 32 = 4 warps __shared__ double s_temp_re[NUM_WARPS]; __shared__ double s_temp_im[NUM_WARPS]; - // Initialize accumulators in registers + //------------------------------------------------------------------------- + // Initialize accumulators (per-thread registers) + //------------------------------------------------------------------------- + + const int num_m0 = 2 * L0 + 1; + double result_re[MAX_M0_SIZE]; double 
result_im[MAX_M0_SIZE]; - double result_r_re[3][MAX_M0_SIZE]; + double result_r_re[3][MAX_M0_SIZE]; // For current operator: x, y, z components double result_r_im[3][MAX_M0_SIZE]; - int num_m0 = 2 * L0 + 1; for (int m0 = 0; m0 < num_m0; m0++) { result_re[m0] = 0.0; @@ -276,54 +345,60 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } } - // Main integration loop - RESTRUCTURED: angular loop outside, radial inside - // ylm0 only depends on angular direction (leb_x,y,z), not radial distance - // This saves 140x redundant ylm0 computations per angular point + //------------------------------------------------------------------------- + // Main integration loop + // Outer: angular points (parallelized across threads) + // Inner: radial points (accumulated per thread) + //------------------------------------------------------------------------- + for (int ian = tid; ian < ANGULAR_GRID_NUM; ian += BLOCK_SIZE) { - double leb_x = d_lebedev_x[ian]; - double leb_y = d_lebedev_y[ian]; - double leb_z = d_lebedev_z[ian]; - double w_ang = d_lebedev_w[ian]; + // Load angular grid point + const double leb_x = d_lebedev_x[ian]; + const double leb_y = d_lebedev_y[ian]; + const double leb_z = d_lebedev_z[ian]; + const double w_ang = d_lebedev_w[ian]; - // Precompute ylm0 ONCE per angular point (doesn't depend on r_val) + // Precompute Y_lm for projector (independent of radial distance) double ylm0[MAX_YLM_SIZE]; DISPATCH_YLM(L0, leb_x, leb_y, leb_z, ylm0); - int offset_L0 = L0 * L0; + const int offset_L0 = L0 * L0; - // Precompute A dot direction (doesn't depend on r_val) - double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; + // Precompute A · direction (for phase factor) + const double A_dot_leb = A.x * leb_x + A.y * leb_y + A.z * leb_z; - // Precompute dR components - double dRx = R0.x - R1.x; - double dRy = R0.y - R1.y; - double dRz = R0.z - R1.z; + // Vector from R1 to R0 (for computing distance to orbital center) + const double dRx = R0.x - R1.x; + const double dRy = R0.y - R1.y; + const double dRz = R0.z - R1.z; + // Radial integration #pragma unroll 4 for (int ir = 0; ir < RADIAL_GRID_NUM; ir++) { - double r_val = xmean + xl * d_gl_x[ir]; - double w_rad = xl * d_gl_w[ir]; + // Transform Gauss-Legendre point from [-1,1] to [r_min, r_max] + const double r_val = xmean + xl * d_gl_x[ir]; + const double w_rad = xl * d_gl_w[ir]; - // Local position relative to R0 - double rx = r_val * leb_x; - double ry = r_val * leb_y; - double rz = r_val * leb_z; + // Integration point position relative to R0 + const double rx = r_val * leb_x; + const double ry = r_val * leb_y; + const double rz = r_val * leb_z; // Vector from R1 to integration point - double tx = rx + dRx; - double ty = ry + dRy; - double tz = rz + dRz; - double tnorm = sqrt(tx * tx + ty * ty + tz * tz); + const double tx = rx + dRx; + const double ty = ry + dRy; + const double tz = rz + dRz; + const double tnorm = sqrt(tx * tx + ty * ty + tz * tz); - // Check psi cutoff + // Check if within orbital cutoff if (tnorm <= r1_max) { - // Compute Y_L1 (depends on direction to R1, varies with r_val) + // Compute Y_lm for orbital (depends on direction from R1) double ylm1[MAX_YLM_SIZE]; if (tnorm > 1e-10) { - double inv_tnorm = 1.0 / tnorm; + const double inv_tnorm = 1.0 / tnorm; DISPATCH_YLM(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, ylm1); } else @@ -331,39 +406,40 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, DISPATCH_YLM(L1, 0.0, 0.0, 1.0, ylm1); } - // Interpolate psi - double psi_val + // Interpolate orbital 
radial function + const double psi_val = interpolate_radial_gpu(psi_radial + norb.psi_offset, norb.psi_mesh, 1.0 / norb.psi_dk, tnorm); - // Phase factor - double phase = r_val * A_dot_leb; - cuDoubleComplex exp_iAr = cu_exp_i(phase); - - // Interpolate beta - double beta_val + // Interpolate projector radial function + const double beta_val = interpolate_radial_gpu(beta_radial + proj.beta_offset, proj.beta_mesh, 1.0 / proj.beta_dk, r_val); - // Y_L1m1 - double ylm_L1_val = ylm1[L1 * L1 + m1]; + // Phase factor exp(i * A · r) + const double phase = r_val * A_dot_leb; + const cuDoubleComplex exp_iAr = cu_exp_i(phase); + + // Orbital Y_lm value + const double ylm_L1_val = ylm1[L1 * L1 + m1]; - // Combined factor - double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; - cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); + // Combined integration factor: Y_L1m1 * psi * beta * r * dr * dOmega + const double factor = ylm_L1_val * psi_val * beta_val * r_val * w_rad * w_ang; + const cuDoubleComplex common_factor = cu_mul_real(exp_iAr, factor); -// Accumulate for all m0 + // Accumulate for all m0 components of projector #pragma unroll for (int m0 = 0; m0 < num_m0; m0++) { - double ylm0_val = ylm0[offset_L0 + m0]; + const double ylm0_val = ylm0[offset_L0 + m0]; result_re[m0] += common_factor.x * ylm0_val; result_im[m0] += common_factor.y * ylm0_val; + // Current operator contribution (if requested) if (nlm_dim == 4) { - double r_op_x = rx + R0.x; - double r_op_y = ry + R0.y; - double r_op_z = rz + R0.z; + const double r_op_x = rx + R0.x; + const double r_op_y = ry + R0.y; + const double r_op_z = rz + R0.z; result_r_re[0][m0] += common_factor.x * ylm0_val * r_op_x; result_r_im[0][m0] += common_factor.y * ylm0_val * r_op_x; @@ -374,13 +450,17 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } } } - } - } + } // End radial loop + } // End angular loop + + //------------------------------------------------------------------------- + // Parallel reduction and output + // Uses warp shuffle for efficiency, followed by cross-warp reduction + //------------------------------------------------------------------------- - // Reduction and output using warp shuffle - more efficient than shared memory - int out_base = norb_idx * nlm_dim * natomwfc; - int warp_id = tid / 32; - int lane_id = tid % 32; + const int out_base = norb_idx * nlm_dim * natomwfc; + const int warp_id = tid / 32; + const int lane_id = tid % 32; for (int m0 = 0; m0 < num_m0; m0++) { @@ -396,7 +476,7 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } __syncthreads(); - // Step 3: First warp reduces across all warps + // Step 3: First warp reduces across all warps and writes output if (warp_id == 0) { sum_re = (lane_id < NUM_WARPS) ? s_temp_re[lane_id] : 0.0; @@ -414,15 +494,14 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } __syncthreads(); + // Process current operator components (if nlm_dim == 4) if (nlm_dim == 4) { for (int d = 0; d < 3; d++) { - // Step 1: Warp-level reduction double sum_r_re = warp_reduce_sum(result_r_re[d][m0]); double sum_r_im = warp_reduce_sum(result_r_im[d][m0]); - // Step 2: First lane writes to shared memory if (lane_id == 0) { s_temp_re[warp_id] = sum_r_re; @@ -430,7 +509,6 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } __syncthreads(); - // Step 3: First warp reduces across warps if (warp_id == 0) { sum_r_re = (lane_id < NUM_WARPS) ? 
s_temp_re[lane_id] : 0.0; @@ -453,9 +531,12 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } //============================================================================= -// Host-side Helper Function Implementations +// Host-side Helper Functions //============================================================================= +/** + * @brief CUDA error checking macro for kernel-related operations + */ #define CUDA_CHECK_KERNEL(call) \ do \ { \ @@ -466,9 +547,15 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, } \ } while (0) +/** + * @brief Copy integration grids to GPU constant memory + * + * Initializes the constant memory arrays with Lebedev-Laikov angular grid + * and Gauss-Legendre radial grid for use in kernel integration. + */ void copy_grids_to_device() { - // Copy Lebedev-Laikov angular grid to constant memory + // Copy Lebedev-Laikov 110-point angular quadrature grid CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_x, ModuleBase::Integral::Lebedev_Laikov_grid110_x, ANGULAR_GRID_NUM * sizeof(double))); @@ -482,7 +569,7 @@ void copy_grids_to_device() ModuleBase::Integral::Lebedev_Laikov_grid110_w, ANGULAR_GRID_NUM * sizeof(double))); - // Prepare and copy Gauss-Legendre radial grid to constant memory + // Compute and copy Gauss-Legendre radial quadrature grid std::vector h_gl_x(RADIAL_GRID_NUM); std::vector h_gl_w(RADIAL_GRID_NUM); ModuleBase::Integral::Gauss_Legendre_grid_and_weight(RADIAL_GRID_NUM, h_gl_x.data(), h_gl_w.data()); diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index 8c00e4cc5a..42280b1498 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -1,3 +1,19 @@ +/** + * @file snap_psibeta_kernel.cuh + * @brief CUDA kernel declarations for computing overlap integrals + * + * This file provides GPU-accelerated computation of overlap integrals between + * atomic orbitals (psi) and non-local projectors (beta) for real-time TDDFT + * calculations. The implementation uses numerical integration on a combined + * radial (Gauss-Legendre) and angular (Lebedev-Laikov) grid. 
+ * + * Key Features: + * - Atom-level batching: processes all neighbors for a center atom in single kernel + * - Templated spherical harmonics for compile-time optimization + * - Efficient memory access via constant memory for integration grids + * - Warp-level reduction for high-performance summation + */ + #ifndef SNAP_PSIBETA_KERNEL_CUH #define SNAP_PSIBETA_KERNEL_CUH @@ -9,44 +25,36 @@ namespace module_rt namespace gpu { -// Grid constants +//============================================================================= +// Configuration Constants +//============================================================================= + +/// Number of points in radial Gauss-Legendre grid constexpr int RADIAL_GRID_NUM = 140; + +/// Number of points in angular Lebedev-Laikov grid (110-point rule) constexpr int ANGULAR_GRID_NUM = 110; -constexpr int BLOCK_SIZE = 128; // >= ANGULAR_GRID_NUM for efficient reduction -constexpr int MAX_L = 4; // Maximum angular momentum supported -constexpr int MAX_YLM_SIZE = (MAX_L + 1) * (MAX_L + 1); // = 25 -constexpr int MAX_M0_SIZE = 2 * MAX_L + 1; // = 9 -//============================================================================= -// Device Helper Functions -//============================================================================= +/// Thread block size for kernel execution +constexpr int BLOCK_SIZE = 128; -/** - * @brief GPU version of cubic interpolation for radial functions - */ -__device__ __forceinline__ double interpolate_radial_gpu(const double* __restrict__ psi, - int mesh, - double inv_dk, - double distance) -{ - double position = distance * inv_dk; - int iq = __double2int_rd(position); // floor +/// Maximum supported angular momentum quantum number L +constexpr int MAX_L = 4; - if (iq > mesh - 4) - return 0.0; - if (iq < 0) - return 0.0; +/// Size of spherical harmonics array: (MAX_L + 1)^2 = 25 +constexpr int MAX_YLM_SIZE = (MAX_L + 1) * (MAX_L + 1); - double x0 = position - static_cast(iq); - double x1 = 1.0 - x0; - double x2 = 2.0 - x0; - double x3 = 3.0 - x0; +/// Maximum number of magnetic quantum numbers for a single L: 2*MAX_L + 1 = 9 +constexpr int MAX_M0_SIZE = 2 * MAX_L + 1; - return x1 * x2 * (psi[iq] * x3 + psi[iq + 3] * x0) / 6.0 + x0 * x3 * (psi[iq + 1] * x2 - psi[iq + 2] * x1) / 2.0; -} +//============================================================================= +// Device Helper Functions - Complex Arithmetic +//============================================================================= /** * @brief Compute exp(i * theta) = cos(theta) + i * sin(theta) + * @param theta Phase angle in radians + * @return Complex exponential as cuDoubleComplex */ __device__ __forceinline__ cuDoubleComplex cu_exp_i(double theta) { @@ -56,7 +64,7 @@ __device__ __forceinline__ cuDoubleComplex cu_exp_i(double theta) } /** - * @brief Complex multiplication + * @brief Complex multiplication: a * b */ __device__ __forceinline__ cuDoubleComplex cu_mul(cuDoubleComplex a, cuDoubleComplex b) { @@ -64,7 +72,7 @@ __device__ __forceinline__ cuDoubleComplex cu_mul(cuDoubleComplex a, cuDoubleCom } /** - * @brief Complex addition + * @brief Complex addition: a + b */ __device__ __forceinline__ cuDoubleComplex cu_add(cuDoubleComplex a, cuDoubleComplex b) { @@ -72,7 +80,7 @@ __device__ __forceinline__ cuDoubleComplex cu_add(cuDoubleComplex a, cuDoubleCom } /** - * @brief Complex conjugate + * @brief Complex conjugate: conj(a) */ __device__ __forceinline__ cuDoubleComplex cu_conj(cuDoubleComplex a) { @@ -80,27 +88,83 @@ __device__ __forceinline__ 
cuDoubleComplex cu_conj(cuDoubleComplex a) } /** - * @brief Complex * real + * @brief Complex times real: a * r */ __device__ __forceinline__ cuDoubleComplex cu_mul_real(cuDoubleComplex a, double r) { return make_cuDoubleComplex(a.x * r, a.y * r); } +//============================================================================= +// Device Helper Functions - Radial Interpolation +//============================================================================= + /** - * @brief Compute real spherical harmonics Y_lm at given direction (x, y, z) - * TEMPLATED VERSION - L is compile-time constant for optimization + * @brief Cubic spline interpolation for radial functions + * + * Implements cubic polynomial interpolation using 4 consecutive grid points. + * This is the GPU equivalent of CPU-side PolyInt::Polynomial_Interpolation. + * + * @param psi Radial function values on uniform grid + * @param mesh Number of grid points + * @param inv_dk Inverse of grid spacing (1/dk) + * @param distance Radial distance r at which to interpolate + * @return Interpolated function value + */ +__device__ __forceinline__ double interpolate_radial_gpu(const double* __restrict__ psi, + int mesh, + double inv_dk, + double distance) +{ + double position = distance * inv_dk; + int iq = __double2int_rd(position); // floor(position) + + // Boundary checks + if (iq > mesh - 4 || iq < 0) + { + return 0.0; + } + + // Lagrange interpolation weights + double x0 = position - static_cast(iq); + double x1 = 1.0 - x0; + double x2 = 2.0 - x0; + double x3 = 3.0 - x0; + + // 4-point Lagrange interpolation formula + return x1 * x2 * (psi[iq] * x3 + psi[iq + 3] * x0) / 6.0 + x0 * x3 * (psi[iq + 1] * x2 - psi[iq + 2] * x1) / 2.0; +} + +//============================================================================= +// Device Helper Functions - Spherical Harmonics +//============================================================================= + +/** + * @brief Compute real spherical harmonics Y_lm at a given direction + * + * TEMPLATED VERSION: L is a compile-time constant enabling loop unrolling + * and register allocation optimizations by the compiler. + * + * Uses the recursive computation of associated Legendre polynomials + * followed by transformation to real spherical harmonics with proper + * normalization (same as ModuleBase::Ylm). * * @tparam L Maximum angular momentum (0 <= L <= MAX_L) - * @param x, y, z Direction (should be normalized) - * @param ylm Output array of size (L+1)^2 + * @param x, y, z Direction vector components (need not be normalized, normalization is done internally) + * @param ylm Output array of size (L+1)^2, indexed as ylm[l*l + l + m] */ template __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm); /** * @brief Runtime dispatch macro for templated compute_ylm_gpu - * Converts runtime L to compile-time template call + * + * Converts a runtime L value to the appropriate compile-time template + * instantiation for optimal performance. 
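+ *
+ * Typical use, as in the integration kernel (illustrative):
+ * @code
+ *   double ylm0[MAX_YLM_SIZE];
+ *   DISPATCH_YLM(L0, leb_x, leb_y, leb_z, ylm0);
+ * @endcode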
+ * + * @param L_val Runtime angular momentum value + * @param x, y, z Direction vector components + * @param ylm Output array for spherical harmonics */ #define DISPATCH_YLM(L_val, x, y, z, ylm) \ do \ @@ -129,68 +193,83 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm); } while (0) //============================================================================= -// Kernel Input/Output Structures +// Data Structures for Kernel Input //============================================================================= /** - * @brief Information about a single projector + * @brief Non-local projector (beta function) information + * + * Contains all data needed to evaluate a single projector during integration. */ struct ProjectorData { - int L0; // Angular momentum - int beta_offset; // Offset in flattened beta array - int beta_mesh; // Number of mesh points - double beta_dk; // Grid spacing - double beta_rcut; // Cutoff radius - double r_min; // Minimum radial value - double r_max; // Maximum radial value + int L0; ///< Angular momentum quantum number + int beta_offset; ///< Offset into flattened beta radial array + int beta_mesh; ///< Number of radial mesh points + double beta_dk; ///< Radial grid spacing + double beta_rcut; ///< Cutoff radius for projector + double r_min; ///< Minimum radial grid value (integration start) + double r_max; ///< Maximum radial grid value (integration end) }; /** - * @brief Information about a neighbor-orbital pair for atom-level batching - * Used to batch ALL neighbors for a center atom in a single kernel launch + * @brief Neighbor atom orbital information for atom-level batching + * + * Each structure represents one (neighbor_atom, orbital) pair that contributes + * to the overlap integral. This enables processing ALL neighbors for a center + * atom in a single kernel launch, minimizing launch overhead. 
*/ struct NeighborOrbitalData { - int neighbor_idx; // Which neighbor (ad index) this orbital belongs to - double3 R1; // Neighbor atom position (tau1 * lat0) - - // Orbital info - int L1; // Angular momentum - int m1; // Magnetic quantum number - int N1; // Radial quantum number - int iw_index; // Original orbital index for output mapping - int psi_offset; // Offset in flattened psi array - int psi_mesh; // Number of mesh points - double psi_dk; // Grid spacing - double psi_rcut; // Cutoff radius + int neighbor_idx; ///< Index of neighbor atom (ad index in adjacency list) + double3 R1; ///< Neighbor atom position in Cartesian coordinates (tau * lat0) + + // Orbital information + int L1; ///< Angular momentum of orbital + int m1; ///< Magnetic quantum number of orbital + int N1; ///< Radial quantum number of orbital + int iw_index; ///< Global orbital index for output mapping + int psi_offset; ///< Offset into flattened psi radial array + int psi_mesh; ///< Number of radial mesh points for orbital + double psi_dk; ///< Radial grid spacing for orbital + double psi_rcut; ///< Cutoff radius for orbital }; //============================================================================= -// Main Kernels +// Main CUDA Kernel Declaration //============================================================================= /** - * @brief Atom-level batch kernel - processes ALL neighbors for a center atom + * @brief Atom-level batch kernel for overlap computation * - * Grid: (total_neighbor_orbitals, nproj, 1) - * Block: (BLOCK_SIZE, 1, 1) + * This kernel processes ALL neighbor orbitals for a single center atom in one + * launch, significantly reducing kernel launch overhead. Each thread block + * handles the integration for one (neighbor_orbital, projector) pair. * - * This kernel processes all orbitals from all neighbors in a single launch, - * reducing kernel launch overhead significantly. 
+ * Grid Configuration: + * - gridDim.x = total_neighbor_orbitals (all orbitals from all neighbors) + * - gridDim.y = nproj (number of projectors on center atom) * - * @param R0 Center atom position (projector location) - * @param A Vector potential - * @param neighbor_orbitals Array of neighbor-orbital pairs [total_neighbor_orbitals] - * @param projectors Array of projector data [nproj] - * @param psi_radial Flattened orbital radial functions - * @param beta_radial Flattened projector radial functions - * @param proj_m0_offset Offset for each projector's m=0 in output + * Block Configuration: + * - blockDim.x = BLOCK_SIZE threads for parallel integration + * + * Integration Strategy: + * - Angular loop (outer): each thread processes different angular points + * - Radial loop (inner): each thread accumulates over all radial points + * - Warp shuffle reduction for efficient summation + * + * @param R0 Center atom position (projector location) + * @param A Vector potential for phase factor + * @param neighbor_orbitals Array of neighbor-orbital data [total_neighbor_orbitals] + * @param projectors Array of projector data [nproj] + * @param psi_radial Flattened array of orbital radial functions + * @param beta_radial Flattened array of projector radial functions + * @param proj_m0_offset Starting index of each projector's m=0 component in output * @param total_neighbor_orbitals Total number of (neighbor, orbital) pairs - * @param nproj Number of projectors - * @param natomwfc Total projector components (sum of 2*L0+1) - * @param nlm_dim 1 for no current, 4 for current - * @param nlm_out Output [total_neighbor_orbitals * nlm_dim * natomwfc] + * @param nproj Number of projectors on center atom + * @param natomwfc Total projector components: sum of (2*L0+1) for all projectors + * @param nlm_dim Output dimension: 1 for overlap only, 4 for overlap + current + * @param nlm_out Output array [total_neighbor_orbitals * nlm_dim * natomwfc] */ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, double3 A, @@ -206,12 +285,17 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, cuDoubleComplex* __restrict__ nlm_out); //============================================================================= -// Host-side Helper Functions +// Host-side Initialization //============================================================================= /** - * @brief Copy integration grids (Lebedev and Gauss-Legendre) to constant memory - * Should be called once before kernel execution in each calculate_HR call + * @brief Copy integration grids to GPU constant memory + * + * Copies the Lebedev-Laikov angular grid (110 points) and Gauss-Legendre + * radial grid (140 points) to CUDA constant memory for fast access during + * kernel execution. + * + * @note Must be called once before any kernel launches in a calculation session. 
*/ void copy_grids_to_device(); From 8f96081f9f921e0e1a04a4dea2d780f74065d8a5 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 26 Dec 2025 13:40:08 +0800 Subject: [PATCH 10/13] refactor(td_nonlocal_lcao): use runtime check for GPU/CPU branch selection - Add use_gpu runtime flag that checks both __CUDA macro and PARAM.inp.device - GPU path is now only enabled when __CUDA is defined AND device == "gpu" - Makes the conditional logic clearer with if/else instead of nested #ifdef --- .../module_operator_lcao/td_nonlocal_lcao.cpp | 118 ++++++++++-------- 1 file changed, 67 insertions(+), 51 deletions(-) diff --git a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp index 63f4590056..6f39ad039f 100644 --- a/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp +++ b/source/source_lcao/module_operator_lcao/td_nonlocal_lcao.cpp @@ -132,15 +132,26 @@ void hamilt::TDNonlocal>::calculate_HR() ModuleBase::TITLE("TDNonlocal", "calculate_HR"); ModuleBase::timer::tick("TDNonlocal", "calculate_HR"); + // Determine whether to use GPU path: + // GPU is only used when both __CUDA is defined AND device is set to "gpu" #ifdef __CUDA - // Initialize GPU resources before entering the computation loop - // Use set_device_by_rank for multi-GPU support - int dev_id = 0; + const bool use_gpu = (PARAM.inp.device == "gpu"); +#else + const bool use_gpu = false; +#endif + + // Initialize GPU resources if using GPU + if (use_gpu) + { +#ifdef __CUDA + // Use set_device_by_rank for multi-GPU support + int dev_id = 0; #ifdef __MPI - dev_id = base_device::information::set_device_by_rank(MPI_COMM_WORLD); + dev_id = base_device::information::set_device_by_rank(MPI_COMM_WORLD); #endif - module_rt::gpu::initialize_gpu_resources(); + module_rt::gpu::initialize_gpu_resources(); #endif + } const Parallel_Orbitals* paraV = this->hR_tmp->get_atom_pair(0).get_paraV(); const int npol = this->ucell->get_npol(); @@ -160,59 +171,64 @@ void hamilt::TDNonlocal>::calculate_HR() nlm_tot[i].resize(nlm_dim); } + if (use_gpu) + { #ifdef __CUDA - // GPU path: Atom-level GPU batch processing - module_rt::gpu::snap_psibeta_atom_batch_gpu(orb_, - this->ucell->infoNL, - T0, - tau0 * this->ucell->lat0, - cart_At, - adjs, - this->ucell, - paraV, - npol, - nlm_dim, - nlm_tot); -#else - // CPU path: OpenMP parallel over neighbors to compute nlm_tot -#pragma omp parallel for schedule(dynamic) - for (int ad = 0; ad < adjs.adj_num + 1; ++ad) + // GPU path: Atom-level GPU batch processing + module_rt::gpu::snap_psibeta_atom_batch_gpu(orb_, + this->ucell->infoNL, + T0, + tau0 * this->ucell->lat0, + cart_At, + adjs, + this->ucell, + paraV, + npol, + nlm_dim, + nlm_tot); +#endif + } + else { - const int T1 = adjs.ntype[ad]; - const int I1 = adjs.natom[ad]; - const int iat1 = ucell->itia2iat(T1, I1); - const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; - const Atom* atom1 = &ucell->atoms[T1]; - auto all_indexes = paraV->get_indexes_row(iat1); - auto col_indexes = paraV->get_indexes_col(iat1); - all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); - std::sort(all_indexes.begin(), all_indexes.end()); - all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); - - // CPU path: loop over orbitals - for (size_t iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) + // CPU path: OpenMP parallel over neighbors to compute nlm_tot +#pragma omp parallel for schedule(dynamic) + for (int ad = 0; ad < adjs.adj_num + 1; ++ad) { - 
const int iw1 = all_indexes[iw1l] / npol; - std::vector>> nlm; - module_rt::snap_psibeta_half_tddft(orb_, - this->ucell->infoNL, - nlm, - tau1 * this->ucell->lat0, - T1, - atom1->iw2l[iw1], - atom1->iw2m[iw1], - atom1->iw2n[iw1], - tau0 * this->ucell->lat0, - T0, - cart_At, - TD_info::out_current); - for (int dir = 0; dir < nlm_dim; dir++) + const int T1 = adjs.ntype[ad]; + const int I1 = adjs.natom[ad]; + const int iat1 = ucell->itia2iat(T1, I1); + const ModuleBase::Vector3& tau1 = adjs.adjacent_tau[ad]; + const Atom* atom1 = &ucell->atoms[T1]; + auto all_indexes = paraV->get_indexes_row(iat1); + auto col_indexes = paraV->get_indexes_col(iat1); + all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end()); + std::sort(all_indexes.begin(), all_indexes.end()); + all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end()); + + // CPU path: loop over orbitals + for (size_t iw1l = 0; iw1l < all_indexes.size(); iw1l += npol) { - nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + const int iw1 = all_indexes[iw1l] / npol; + std::vector>> nlm; + module_rt::snap_psibeta_half_tddft(orb_, + this->ucell->infoNL, + nlm, + tau1 * this->ucell->lat0, + T1, + atom1->iw2l[iw1], + atom1->iw2m[iw1], + atom1->iw2n[iw1], + tau0 * this->ucell->lat0, + T0, + cart_At, + TD_info::out_current); + for (int dir = 0; dir < nlm_dim; dir++) + { + nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]}); + } } } } -#endif // 2. calculate D for each pair of atoms // This runs for BOTH GPU and CPU paths From 97d8611e47ee30fe53b2e8a8af6884bf6d5798b6 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 26 Dec 2025 14:05:06 +0800 Subject: [PATCH 11/13] refactor(snap_psibeta): unify CUDA error checking macros - Move CUDA_CHECK macro to shared header snap_psibeta_kernel.cuh - Remove duplicate CUDA_CHECK definition from snap_psibeta_gpu.cu - Remove CUDA_CHECK_KERNEL macro and replace all usages with CUDA_CHECK - Reduces code duplication and improves consistency --- .../kernels/cuda/snap_psibeta_gpu.cu | 17 -------- .../kernels/cuda/snap_psibeta_kernel.cu | 41 +++++++------------ .../kernels/cuda/snap_psibeta_kernel.cuh | 21 ++++++++++ 3 files changed, 35 insertions(+), 44 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index 026af2d1d0..9cfab0cb87 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -24,23 +24,6 @@ namespace module_rt namespace gpu { -//============================================================================= -// CUDA Error Handling -//============================================================================= - -/** - * @brief CUDA error checking macro with file/line information - */ -#define CUDA_CHECK(call) \ - do \ - { \ - cudaError_t err = (call); \ - if (err != cudaSuccess) \ - { \ - fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ - } \ - } while (0) - //============================================================================= // GPU Resource Management //============================================================================= diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 8d29015add..073a5fbd7d 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ 
b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -534,19 +534,6 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, // Host-side Helper Functions //============================================================================= -/** - * @brief CUDA error checking macro for kernel-related operations - */ -#define CUDA_CHECK_KERNEL(call) \ - do \ - { \ - cudaError_t err = (call); \ - if (err != cudaSuccess) \ - { \ - fprintf(stderr, "[CUDA Kernel] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ - } \ - } while (0) - /** * @brief Copy integration grids to GPU constant memory * @@ -556,26 +543,26 @@ __global__ void snap_psibeta_atom_batch_kernel(double3 R0, void copy_grids_to_device() { // Copy Lebedev-Laikov 110-point angular quadrature grid - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_x, - ModuleBase::Integral::Lebedev_Laikov_grid110_x, - ANGULAR_GRID_NUM * sizeof(double))); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_y, - ModuleBase::Integral::Lebedev_Laikov_grid110_y, - ANGULAR_GRID_NUM * sizeof(double))); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_z, - ModuleBase::Integral::Lebedev_Laikov_grid110_z, - ANGULAR_GRID_NUM * sizeof(double))); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_lebedev_w, - ModuleBase::Integral::Lebedev_Laikov_grid110_w, - ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lebedev_x, + ModuleBase::Integral::Lebedev_Laikov_grid110_x, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lebedev_y, + ModuleBase::Integral::Lebedev_Laikov_grid110_y, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lebedev_z, + ModuleBase::Integral::Lebedev_Laikov_grid110_z, + ANGULAR_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lebedev_w, + ModuleBase::Integral::Lebedev_Laikov_grid110_w, + ANGULAR_GRID_NUM * sizeof(double))); // Compute and copy Gauss-Legendre radial quadrature grid std::vector h_gl_x(RADIAL_GRID_NUM); std::vector h_gl_w(RADIAL_GRID_NUM); ModuleBase::Integral::Gauss_Legendre_grid_and_weight(RADIAL_GRID_NUM, h_gl_x.data(), h_gl_w.data()); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_gl_x, h_gl_x.data(), RADIAL_GRID_NUM * sizeof(double))); - CUDA_CHECK_KERNEL(cudaMemcpyToSymbol(d_gl_w, h_gl_w.data(), RADIAL_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_gl_x, h_gl_x.data(), RADIAL_GRID_NUM * sizeof(double))); + CUDA_CHECK(cudaMemcpyToSymbol(d_gl_w, h_gl_w.data(), RADIAL_GRID_NUM * sizeof(double))); } } // namespace gpu diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index 42280b1498..02537c477e 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -17,9 +17,30 @@ #ifndef SNAP_PSIBETA_KERNEL_CUH #define SNAP_PSIBETA_KERNEL_CUH +#include #include #include +//============================================================================= +// CUDA Error Checking Macro +//============================================================================= + +/** + * @brief CUDA error checking macro with file/line information + * + * Checks the return value of CUDA API calls and prints an error message + * with file and line information if the call fails. 
+ */ +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) \ + { \ + fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + } \ + } while (0) + namespace module_rt { namespace gpu From b0f94276c10e7d95dfe6f5643e0fdd20f7b7c5eb Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 26 Dec 2025 14:07:26 +0800 Subject: [PATCH 12/13] refactor(snap_psibeta_kernel): use ModuleBase constants - Replace local PI, FOUR_PI, SQRT2 definitions with ModuleBase:: versions - Add include for source_base/constants.h --- .../module_rt/kernels/cuda/snap_psibeta_kernel.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu index 073a5fbd7d..60ea9ad26b 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cu @@ -12,6 +12,7 @@ */ #include "snap_psibeta_kernel.cuh" +#include "source_base/constants.h" #include "source_base/math_integral.h" #include @@ -85,14 +86,11 @@ __device__ __forceinline__ double p_get(const double* p, int l, int m) template __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) { - // Mathematical constants - constexpr double PI = 3.14159265358979323846; - constexpr double FOUR_PI = 4.0 * PI; - constexpr double SQRT2 = 1.41421356237309504880; + constexpr int P_SIZE = (L + 1) * (L + 2) / 2; // Lower triangular storage size // Y_00 = 1/(2*sqrt(pi)) - ylm[0] = 0.5 * sqrt(1.0 / PI); + ylm[0] = 0.5 * sqrt(1.0 / ModuleBase::PI); if (L == 0) { @@ -178,7 +176,7 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) #pragma unroll for (int l = 0; l <= L; l++) { - double c = sqrt((2.0 * l + 1.0) / FOUR_PI); + double c = sqrt((2.0 * l + 1.0) / ModuleBase::FOUR_PI); // m = 0 component ylm[lm] = c * p_get(p, l, 0); @@ -195,7 +193,7 @@ __device__ void compute_ylm_gpu(double x, double y, double z, double* ylm) { factorial_ratio *= i; } - double norm = c * sqrt(1.0 / factorial_ratio) * SQRT2; + double norm = c * sqrt(1.0 / factorial_ratio) * ModuleBase::SQRT2; double sin_mphi, cos_mphi; sincos(m * phi, &sin_mphi, &cos_mphi); From 9c60e9f5b1d752ae8a6531af2180be47a599f51f Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 26 Dec 2025 14:15:22 +0800 Subject: [PATCH 13/13] refactor(snap_psibeta): use ModuleBase::WARNING_QUIT for error handling - Replace fprintf(stderr, ...) 
with ModuleBase::WARNING_QUIT - Update CUDA_CHECK macro to use WARNING_QUIT instead of fprintf - Add includes for tool_quit.h and string header - Consistent error handling with ABACUS codebase conventions --- .../module_rt/kernels/cuda/snap_psibeta_gpu.cu | 16 +++++++--------- .../kernels/cuda/snap_psibeta_kernel.cuh | 11 ++++++++--- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu index 9cfab0cb87..cb381ece78 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_gpu.cu @@ -14,9 +14,11 @@ #include "../snap_psibeta_gpu.h" #include "snap_psibeta_kernel.cuh" #include "source_base/timer.h" +#include "source_base/tool_quit.h" #include #include +#include #include namespace module_rt @@ -44,8 +46,7 @@ void initialize_gpu_resources() cudaError_t err = cudaGetDeviceCount(&device_count); if (err != cudaSuccess || device_count == 0) { - fprintf(stderr, "[CUDA] No CUDA devices found or error getting device count!\n"); - return; + ModuleBase::WARNING_QUIT("snap_psibeta_gpu", "No CUDA devices found or error getting device count!"); } // Initialize integration grids in constant memory @@ -141,10 +142,8 @@ void snap_psibeta_atom_batch_gpu( // Validate angular momentum if (L0 > MAX_L) { - fprintf(stderr, - "[CUDA] Error: L0=%d exceeds MAX_L=%d, GPU kernel may produce incorrect results\n", - L0, - MAX_L); + ModuleBase::WARNING_QUIT("snap_psibeta_gpu", + "L0=" + std::to_string(L0) + " exceeds MAX_L=" + std::to_string(MAX_L)); } natomwfc += 2 * L0 + 1; } @@ -317,15 +316,14 @@ void snap_psibeta_atom_batch_gpu( cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - fprintf(stderr, "[CUDA] Atom batch kernel launch error: %s\n", cudaGetErrorString(err)); cudaFree(neighbor_orbitals_d); cudaFree(projectors_d); cudaFree(psi_radial_d); cudaFree(beta_radial_d); cudaFree(proj_m0_offset_d); cudaFree(nlm_out_d); - ModuleBase::timer::tick("module_rt", "snap_psibeta_gpu"); - return; + ModuleBase::WARNING_QUIT("snap_psibeta_gpu", + std::string("Atom batch kernel launch error: ") + cudaGetErrorString(err)); } CUDA_CHECK(cudaDeviceSynchronize()); diff --git a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh index 02537c477e..157224ea57 100644 --- a/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh +++ b/source/source_lcao/module_rt/kernels/cuda/snap_psibeta_kernel.cuh @@ -17,9 +17,12 @@ #ifndef SNAP_PSIBETA_KERNEL_CUH #define SNAP_PSIBETA_KERNEL_CUH +#include "source_base/tool_quit.h" + #include #include #include +#include //============================================================================= // CUDA Error Checking Macro @@ -28,8 +31,8 @@ /** * @brief CUDA error checking macro with file/line information * - * Checks the return value of CUDA API calls and prints an error message - * with file and line information if the call fails. + * Checks the return value of CUDA API calls and calls WARNING_QUIT + * with error information if the call fails. 
*/ #define CUDA_CHECK(call) \ do \ @@ -37,7 +40,9 @@ cudaError_t err = (call); \ if (err != cudaSuccess) \ { \ - fprintf(stderr, "[CUDA] Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + ModuleBase::WARNING_QUIT("CUDA_CHECK", \ + std::string("Error at ") + __FILE__ + ":" + std::to_string(__LINE__) + " - " \ + + cudaGetErrorString(err)); \ } \ } while (0)
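
For reference, the block-wide reduction used in snap_psibeta_atom_batch_kernel (a warp-shuffle
reduction followed by a cross-warp pass through shared memory) can be sketched in isolation as
below. This is only an illustrative standalone version: the helper names (warp_sum,
block_reduce_sum, BLOCK, NWARPS) are ours and are not part of the patch series, which keeps the
reduction inlined in the kernel.

    // Minimal sketch of the two-stage reduction pattern:
    //   step 1: shuffle reduction inside each 32-thread warp,
    //   step 2: lane 0 of each warp publishes its partial sum to shared memory,
    //   step 3: the first warp reduces the per-warp partial sums.
    #include <cuda_runtime.h>

    constexpr int BLOCK = 128;          // multiple of the 32-thread warp size
    constexpr int NWARPS = BLOCK / 32;

    __device__ __forceinline__ double warp_sum(double v)
    {
        for (int offset = 16; offset > 0; offset >>= 1)
        {
            v += __shfl_down_sync(0xffffffff, v, offset);
        }
        return v;                       // sum is valid in lane 0 of the warp
    }

    __device__ double block_reduce_sum(double v)
    {
        __shared__ double s[NWARPS];
        const int lane = threadIdx.x % 32;
        const int warp = threadIdx.x / 32;

        v = warp_sum(v);                // step 1: intra-warp reduction
        if (lane == 0)
        {
            s[warp] = v;                // step 2: publish per-warp partial sums
        }
        __syncthreads();

        if (warp == 0)                  // step 3: first warp combines partials
        {
            v = (lane < NWARPS) ? s[lane] : 0.0;
            v = warp_sum(v);
        }
        return v;                       // total is valid in thread 0 of the block
    }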