diff --git a/source/module_esolver/esolver_ks.cpp b/source/module_esolver/esolver_ks.cpp
index da4c4c2c3c..7a850cc54d 100644
--- a/source/module_esolver/esolver_ks.cpp
+++ b/source/module_esolver/esolver_ks.cpp
@@ -117,6 +117,7 @@ void ESolver_KS<T, Device>::before_all_runners(const Input_para& inp, UnitCell&
                          PARAM.inp.mixing_gg0_min,
                          PARAM.inp.mixing_angle,
                          PARAM.inp.mixing_dmr);
+    p_chgmix->init_mixing();
 
     /// PAW Section
 #ifdef USE_PAW
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index 6341a2b6b8..b372aa63db 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -75,12 +75,14 @@ ESolver_KS_LCAO<TK, TR>::ESolver_KS_LCAO()
     //  because some members like two_level_step are used outside if(cal_exx)
     if (GlobalC::exx_info.info_ri.real_number)
     {
-        this->exx_lri_double = std::make_shared<Exx_LRI<double>>(GlobalC::exx_info.info_ri, GlobalC::exx_info.info_ewald);
+        this->exx_lri_double
+            = std::make_shared<Exx_LRI<double>>(GlobalC::exx_info.info_ri, GlobalC::exx_info.info_ewald);
         this->exd = std::make_shared<Exx_LRI_Interface<TK, double>>(exx_lri_double);
     }
     else
     {
-        this->exx_lri_complex = std::make_shared<Exx_LRI<std::complex<double>>>(GlobalC::exx_info.info_ri, GlobalC::exx_info.info_ewald);
+        this->exx_lri_complex
+            = std::make_shared<Exx_LRI<std::complex<double>>>(GlobalC::exx_info.info_ri, GlobalC::exx_info.info_ewald);
         this->exc = std::make_shared<Exx_LRI_Interface<TK, std::complex<double>>>(exx_lri_complex);
     }
 #endif
@@ -198,7 +200,10 @@ void ESolver_KS_LCAO<TK, TR>::before_all_runners(const Input_para& inp, UnitCell
     {
         if (GlobalC::exx_info.info_global.cal_exx)
         {
-            XC_Functional::set_xc_first_loop(ucell);
+            if (PARAM.inp.init_wfc != "file")
+            { // if init_wfc==file, directly enter the EXX loop
+                XC_Functional::set_xc_first_loop(ucell);
+            }
             // initialize 2-center radial tables for EXX-LRI
             if (GlobalC::exx_info.info_ri.real_number)
             {
@@ -521,7 +526,7 @@ void ESolver_KS_LCAO<TK, TR>::iter_init(const int istep, const int iter)
 
     if (iter == 1)
     {
-        this->p_chgmix->init_mixing(); // init mixing
+        this->p_chgmix->mix_reset(); // init mixing
         this->p_chgmix->mixing_restart_step = PARAM.inp.scf_nmax + 1;
         this->p_chgmix->mixing_restart_count = 0;
         // this output will be removed once the feeature is stable
@@ -575,7 +580,16 @@ void ESolver_KS_LCAO<TK, TR>::iter_init(const int istep, const int iter)
     // electrons number.
     if (istep == 0 && this->wf.init_wfc == "file")
     {
-        if (iter == 1)
+        int exx_two_level_step = 0;
+#ifdef __EXX
+        if (GlobalC::exx_info.info_global.cal_exx)
+        {
+            // the following steps are only needed in the first outer exx loop
+            exx_two_level_step
+                = GlobalC::exx_info.info_ri.real_number ? this->exd->two_level_step : this->exc->two_level_step;
+        }
+#endif
+        if (iter == 1 && exx_two_level_step == 0)
         {
             std::cout << " WAVEFUN -> CHARGE " << std::endl;
 
@@ -945,9 +959,24 @@ void ESolver_KS_LCAO<TK, TR>::iter_finish(const int istep, int& iter)
     // 3) save exx matrix
     if (GlobalC::exx_info.info_global.cal_exx)
     {
-        GlobalC::exx_info.info_ri.real_number ?
-            this->exd->exx_iter_finish(this->kv, GlobalC::ucell, *this->p_hamilt, *this->pelec, *this->p_chgmix, this->scf_ene_thr, iter, istep, this->conv_esolver) :
-            this->exc->exx_iter_finish(this->kv, GlobalC::ucell, *this->p_hamilt, *this->pelec, *this->p_chgmix, this->scf_ene_thr, iter, istep, this->conv_esolver);
+        GlobalC::exx_info.info_ri.real_number ? this->exd->exx_iter_finish(this->kv,
+                                                                           GlobalC::ucell,
+                                                                           *this->p_hamilt,
+                                                                           *this->pelec,
+                                                                           *this->p_chgmix,
+                                                                           this->scf_ene_thr,
+                                                                           iter,
+                                                                           istep,
+                                                                           this->conv_esolver)
+                                              : this->exc->exx_iter_finish(this->kv,
+                                                                           GlobalC::ucell,
+                                                                           *this->p_hamilt,
+                                                                           *this->pelec,
+                                                                           *this->p_chgmix,
+                                                                           this->scf_ene_thr,
+                                                                           iter,
+                                                                           istep,
+                                                                           this->conv_esolver);
     }
 #endif
 
@@ -967,26 +996,26 @@ void ESolver_KS_LCAO<TK, TR>::iter_finish(const int istep, int& iter)
             }
             std::string fn = PARAM.globalv.global_out_dir + "/tmp_SPIN" + std::to_string(is + 1) + "_CHG.cube";
             ModuleIO::write_vdata_palgrid(GlobalC::Pgrid,
-                data,
-                is,
-                PARAM.inp.nspin,
-                0,
-                fn,
-                this->pelec->eferm.get_efval(is),
-                &(GlobalC::ucell),
-                3,
-                1);
+                                          data,
+                                          is,
+                                          PARAM.inp.nspin,
+                                          0,
+                                          fn,
+                                          this->pelec->eferm.get_efval(is),
+                                          &(GlobalC::ucell),
+                                          3,
+                                          1);
             if (XC_Functional::get_func_type() == 3 || XC_Functional::get_func_type() == 5)
             {
                 fn = PARAM.globalv.global_out_dir + "/tmp_SPIN" + std::to_string(is + 1) + "_TAU.cube";
                 ModuleIO::write_vdata_palgrid(GlobalC::Pgrid,
-                    this->pelec->charge->kin_r_save[is],
-                    is,
-                    PARAM.inp.nspin,
-                    0,
-                    fn,
-                    this->pelec->eferm.get_efval(is),
-                    &(GlobalC::ucell));
+                                              this->pelec->charge->kin_r_save[is],
+                                              is,
+                                              PARAM.inp.nspin,
+                                              0,
+                                              fn,
+                                              this->pelec->eferm.get_efval(is),
+                                              &(GlobalC::ucell));
             }
         }
     }
@@ -1032,7 +1061,7 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(const int istep)
     {
         this->pelec->cal_tau(*(this->psi));
     }
-    
+
     // 2) call after_scf() of ESolver_KS
     ESolver_KS<TK>::after_scf(istep);
 
diff --git a/source/module_esolver/esolver_ks_lcao_tddft.cpp b/source/module_esolver/esolver_ks_lcao_tddft.cpp
index efb2c362c9..9cb6d35cd3 100644
--- a/source/module_esolver/esolver_ks_lcao_tddft.cpp
+++ b/source/module_esolver/esolver_ks_lcao_tddft.cpp
@@ -2,11 +2,11 @@
 
 #include "module_io/cal_r_overlap_R.h"
 #include "module_io/dipole_io.h"
+#include "module_io/output_log.h"
 #include "module_io/td_current_io.h"
 #include "module_io/write_HS.h"
 #include "module_io/write_HS_R.h"
 #include "module_io/write_wfc_nao.h"
-#include "module_io/output_log.h"
 
 //--------------temporary----------------------------
 #include "module_base/blas_connector.h"
@@ -15,13 +15,13 @@
 #include "module_base/scalapack_connector.h"
 #include "module_elecstate/module_charge/symmetry_rho.h"
 #include "module_elecstate/occupy.h"
+#include "module_elecstate/potentials/H_TDDFT_pw.h"
+#include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h" // need divide_HS_in_frag
+#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h"
 #include "module_hamilt_lcao/module_tddft/evolve_elec.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_io/print_info.h"
-#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h"
-#include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
-#include "module_elecstate/potentials/H_TDDFT_pw.h"
 
 //-----HSolver ElecState Hamilt--------
 #include "module_elecstate/elecstate_lcao.h"
@@ -69,7 +69,6 @@ ESolver_KS_LCAO_TDDFT::~ESolver_KS_LCAO_TDDFT()
         delete td_p;
     }
     TD_Velocity::td_vel_op = nullptr;
-
 }
 
 void ESolver_KS_LCAO_TDDFT::before_all_runners(const Input_para& inp, UnitCell& ucell)
@@ -113,19 +112,27 @@ void ESolver_KS_LCAO_TDDFT::before_all_runners(const Input_para& inp, UnitCell&
 #ifdef __EXX
     if (GlobalC::exx_info.info_global.cal_exx)
     {
-        XC_Functional::set_xc_first_loop(ucell);
+        if (PARAM.inp.init_wfc != "file")
+        { // if init_wfc==file, directly enter the EXX loop
+            XC_Functional::set_xc_first_loop(ucell);
+        }
         // initialize 2-center radial tables for EXX-LRI
         if (GlobalC::exx_info.info_ri.real_number)
         {
             this->exx_lri_double->init(MPI_COMM_WORLD, this->kv, orb_);
+            this->exd->exx_before_all_runners(this->kv, GlobalC::ucell, this->pv);
         }
         else
         {
             this->exx_lri_complex->init(MPI_COMM_WORLD, this->kv, orb_);
+            this->exc->exx_before_all_runners(this->kv, GlobalC::ucell, this->pv);
         }
     }
 #endif
 
+    if (TD_Velocity::out_current_comm)
+        this->r_calculator.init(this->pv, orb_);
+
     // 8) initialize the charge density
     this->pelec->charge->allocate(PARAM.inp.nspin);
     this->pelec->omega = GlobalC::ucell.omega; // this line is very odd.
@@ -144,7 +151,7 @@ void ESolver_KS_LCAO_TDDFT::before_all_runners(const Input_para& inp, UnitCell&
 
     this->atoms_fixed = !ucell.if_atoms_can_move();
 
-    td_p  = new TD_Velocity(&GlobalC::ucell);
+    td_p = new TD_Velocity(&GlobalC::ucell);
     TD_Velocity::td_vel_op = td_p;
 }
 
@@ -178,45 +185,47 @@ void ESolver_KS_LCAO_TDDFT::runner(const int istep, UnitCell& ucell)
     ModuleBase::timer::tick(this->classname, "before_scf");
     // things only initialize once
     this->pelec_td->first_evolve = true;
-    if(!TD_Velocity::tddft_velocity && TD_Velocity::out_current)
+    if (!TD_Velocity::tddft_velocity && TD_Velocity::out_current)
     {
         // initialize the velocity operator
-        velocity_mat = new TD_current(&GlobalC::ucell, &GlobalC::GridD, &this->pv, orb_, two_center_bundle_.overlap_orb.get());
-        //calculate velocity operator
+        velocity_mat
+            = new TD_current(&GlobalC::ucell, &GlobalC::GridD, &this->pv, orb_, two_center_bundle_.overlap_orb.get());
+        // calculate velocity operator
         velocity_mat->calculate_grad_term();
         velocity_mat->calculate_vcomm_r();
     }
-    int estep_max = (istep == 0) ? PARAM.inp.estep_per_md +1 : PARAM.inp.estep_per_md;
-    for(int estep =0; estep < estep_max; estep++)
+    int estep_max = (istep == 0) ? PARAM.inp.estep_per_md + 1 : PARAM.inp.estep_per_md;
+    for (int estep = 0; estep < estep_max; estep++)
     {
         // calculate total time step
         this->totstep++;
         this->print_step();
-        this->p_chgmix->init_mixing();
-        //update At
-        if(elecstate::H_TDDFT_pw::stype!=0)
+        // this->p_chgmix->mix_reset();
+        // update At
+        if (elecstate::H_TDDFT_pw::stype != 0)
         {
             elecstate::H_TDDFT_pw::update_At();
             td_p->cal_cart_At(elecstate::H_TDDFT_pw::At);
         }
         std::cout << "cart_At: " << td_p->cart_At[0] << " " << td_p->cart_At[1] << " " << td_p->cart_At[2] << std::endl;
-        std::cout<<"Et: "<<elecstate::H_TDDFT_pw::Et[0]<<" "<<elecstate::H_TDDFT_pw::Et[1]<<" "<<elecstate::H_TDDFT_pw::Et[2]<<std::endl;
-        if(estep!=0)
+        std::cout << "Et: " << elecstate::H_TDDFT_pw::Et[0] << " " << elecstate::H_TDDFT_pw::Et[1] << " "
+                  << elecstate::H_TDDFT_pw::Et[2] << std::endl;
+        if (estep != 0)
         {
             this->CE.update_all_dis(GlobalC::ucell);
             this->CE.extrapolate_charge(
 #ifdef __MPI
-            &(GlobalC::Pgrid),
+                &(GlobalC::Pgrid),
 #endif
-            GlobalC::ucell,
-            this->pelec->charge,
-            &(this->sf),
-            GlobalV::ofs_running,
-            GlobalV::ofs_warning);
-            //need to test if correct when estep>0
+                GlobalC::ucell,
+                this->pelec->charge,
+                &(this->sf),
+                GlobalV::ofs_running,
+                GlobalV::ofs_warning);
+            // need to test if correct when estep>0
             this->pelec_td->init_scf(totstep, this->sf.strucFac, GlobalC::ucell.symm);
             dynamic_cast<elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec)->get_DM()->cal_DMR();
-            if(totstep <= PARAM.inp.td_tend + 1)
+            if (totstep <= PARAM.inp.td_tend + 1)
             {
                 TD_Velocity::evolve_once = true;
             }
@@ -232,15 +241,15 @@ void ESolver_KS_LCAO_TDDFT::runner(const int istep, UnitCell& ucell)
         std::cout << " * * * * * *\n << Start SCF iteration." << std::endl;
         for (int iter = 1; iter <= this->maxniter; ++iter)
         {
-            //no need to change
-            // 5) write head
+            // no need to change
+            //  5) write head
             ModuleIO::write_head_td(GlobalV::ofs_running, istep, estep, iter, this->basisname);
 
-    #ifdef __MPI
+#ifdef __MPI
             auto iterstart = MPI_Wtime();
-    #else
+#else
             auto iterstart = std::chrono::system_clock::now();
-    #endif
+#endif
 
             // probably no need to change
             // 6) initialization of SCF iterations
@@ -260,14 +269,15 @@ void ESolver_KS_LCAO_TDDFT::runner(const int istep, UnitCell& ucell)
             // parallel algorithms, in which they do not occupy all processors, for
             // example wavefunctions uses 20 processors while density uses 10.
             if (GlobalV::MY_STOGROUP == 0)
-            {   // check chg between two estep 
+            { // check chg between two estep
                 // double drho = this->estate.caldr2();
                 // EState should be used after it is constructed.
                 drho = p_chgmix->get_drho(pelec->charge, PARAM.inp.nelec);
                 // no need to change
                 if (PARAM.inp.scf_os_stop) // if oscillation is detected, SCF will stop
                 {
-                    this->oscillate_esolver = this->p_chgmix->if_scf_oscillate(iter, drho, PARAM.inp.scf_os_ndim, PARAM.inp.scf_os_thr);
+                    this->oscillate_esolver
+                        = this->p_chgmix->if_scf_oscillate(iter, drho, PARAM.inp.scf_os_ndim, PARAM.inp.scf_os_thr);
                 }
                 // change done
                 this->conv_esolver = (drho < this->scf_thr);
@@ -275,23 +285,23 @@ void ESolver_KS_LCAO_TDDFT::runner(const int istep, UnitCell& ucell)
                 // If drho < hsolver_error in the first iter or drho < scf_thr, we
                 // do not change rho.
                 // no need to cange
-                if(!this->conv_esolver)
+                if (!this->conv_esolver)
                 {
                     //----------charge mixing---------------
                     p_chgmix->mix_rho(pelec->charge); // update chr->rho by mixing
                     if (PARAM.inp.scf_thr_type == 2)
                     {
                         pelec->charge->renormalize_rho(); // renormalize rho in R-space would
-                                                        // induce a error in K-space
+                                                          // induce a error in K-space
                     }
                     //----------charge mixing done-----------
                 }
             }
-    #ifdef __MPI
+#ifdef __MPI
             MPI_Bcast(&drho, 1, MPI_DOUBLE, 0, PARAPW_WORLD);
             MPI_Bcast(&this->conv_esolver, 1, MPI_DOUBLE, 0, PARAPW_WORLD);
             MPI_Bcast(pelec->charge->rho[0], this->pw_rhod->nrxx, MPI_DOUBLE, 0, PARAPW_WORLD);
-    #endif
+#endif
             // no need to change
             // 9) update potential
             // Hamilt should be used after it is constructed.
@@ -300,14 +310,14 @@ void ESolver_KS_LCAO_TDDFT::runner(const int istep, UnitCell& ucell)
             // no need to change
             // 10) finish scf iterations
             this->iter_finish(totstep, iter);
-    #ifdef __MPI
+#ifdef __MPI
             double duration = (double)(MPI_Wtime() - iterstart);
-    #else
+#else
             double duration
                 = (std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now() - iterstart))
-                    .count()
-                / static_cast<double>(1e6);
-    #endif
+                      .count()
+                  / static_cast<double>(1e6);
+#endif
             // not change for now, perhaps do no harm
             // 11) get mtaGGA related parameters
             double dkin = 0.0; // for meta-GGA
@@ -318,17 +328,17 @@ void ESolver_KS_LCAO_TDDFT::runner(const int istep, UnitCell& ucell)
             this->pelec->print_etot(this->conv_esolver, iter, drho, dkin, duration, PARAM.inp.printe, diag_ethr);
 
             // 12) Json, need to be moved to somewhere else
-    #ifdef __RAPIDJSON
+#ifdef __RAPIDJSON
             // add Json of scf mag
             Json::add_output_scf_mag(GlobalC::ucell.magnet.tot_magnetization,
-                                    GlobalC::ucell.magnet.abs_magnetization,
-                                    this->pelec->f_en.etot * ModuleBase::Ry_to_eV,
-                                    this->pelec->f_en.etot_delta * ModuleBase::Ry_to_eV,
-                                    drho,
-                                    duration);
-    #endif //__RAPIDJSON
-            // no need to change
-            // 13) check convergence
+                                     GlobalC::ucell.magnet.abs_magnetization,
+                                     this->pelec->f_en.etot * ModuleBase::Ry_to_eV,
+                                     this->pelec->f_en.etot_delta * ModuleBase::Ry_to_eV,
+                                     drho,
+                                     duration);
+#endif //__RAPIDJSON
+       // no need to change
+       // 13) check convergence
             if (this->conv_esolver || this->oscillate_esolver)
             {
                 this->niter = iter;
@@ -347,14 +357,14 @@ void ESolver_KS_LCAO_TDDFT::runner(const int istep, UnitCell& ucell)
         this->after_scf(totstep);
         ModuleBase::timer::tick(this->classname, "after_scf");
         this->pelec_td->first_evolve = false;
-        if(!restart_done)
+        if (!restart_done)
         {
-            estep += PARAM.inp.estep_shift%PARAM.inp.estep_per_md;
+            estep += PARAM.inp.estep_shift % PARAM.inp.estep_per_md;
             totstep += PARAM.inp.estep_shift;
             restart_done = true;
         }
     }
-    if(!TD_Velocity::tddft_velocity && TD_Velocity::out_current)
+    if (!TD_Velocity::tddft_velocity && TD_Velocity::out_current)
     {
         delete velocity_mat;
     }
@@ -364,9 +374,9 @@ void ESolver_KS_LCAO_TDDFT::runner(const int istep, UnitCell& ucell)
 void ESolver_KS_LCAO_TDDFT::print_step()
 {
     std::cout << " -------------------------------------------" << std::endl;
-	GlobalV::ofs_running << "\n -------------------------------------------" << std::endl;
+    GlobalV::ofs_running << "\n -------------------------------------------" << std::endl;
     std::cout << " STEP OF ELECTRON EVOLVE : " << unsigned(totstep) << std::endl;
-	GlobalV::ofs_running << " STEP OF ELECTRON EVOLVE : " << unsigned(totstep) << std::endl;
+    GlobalV::ofs_running << " STEP OF ELECTRON EVOLVE : " << unsigned(totstep) << std::endl;
     std::cout << " -------------------------------------------" << std::endl;
     GlobalV::ofs_running << " -------------------------------------------" << std::endl;
 }
@@ -374,6 +384,13 @@ void ESolver_KS_LCAO_TDDFT::print_step()
 void ESolver_KS_LCAO_TDDFT::iter_init(const int istep, const int iter)
 {
     ModuleBase::TITLE("ESolver_KS_LCAO_TDDFT", "iter_init");
+    if (iter == 1)
+    {
+        this->p_chgmix->mix_reset(); // init mixing
+        this->p_chgmix->mixing_restart_step = PARAM.inp.scf_nmax + 1;
+        this->p_chgmix->mixing_restart_count = 0;
+    }
+
     // mohan update 2012-06-05
     this->pelec->f_en.deband_harris = this->pelec->cal_delta_eband();
 
@@ -382,7 +399,16 @@ void ESolver_KS_LCAO_TDDFT::iter_init(const int istep, const int iter)
     // electrons number.
     if (istep == 0 && this->wf.init_wfc == "file")
     {
-        if (iter == 1)
+        int exx_two_level_step = 0;
+#ifdef __EXX
+        if (GlobalC::exx_info.info_global.cal_exx)
+        {
+            // the following steps are only needed in the first outer exx loop
+            exx_two_level_step
+                = GlobalC::exx_info.info_ri.real_number ? this->exd->two_level_step : this->exc->two_level_step;
+        }
+#endif
+        if (iter == 1 && exx_two_level_step == 0)
         {
             std::cout << " WAVEFUN -> CHARGE " << std::endl;
 
@@ -432,17 +458,19 @@ void ESolver_KS_LCAO_TDDFT::iter_init(const int istep, const int iter)
     // calculate exact-exchange
     if (GlobalC::exx_info.info_ri.real_number)
     {
-        this->exd->exx_eachiterinit(istep,
-                                    *dynamic_cast<const elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec)->get_DM(),
-                                    this->kv,
-                                    iter);
+        this->exd->exx_eachiterinit(
+            istep,
+            *dynamic_cast<const elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec)->get_DM(),
+            this->kv,
+            iter);
     }
     else
     {
-        this->exc->exx_eachiterinit(istep,
-                                    *dynamic_cast<const elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec)->get_DM(),
-                                    this->kv,
-                                    iter);
+        this->exc->exx_eachiterinit(
+            istep,
+            *dynamic_cast<const elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec)->get_DM(),
+            this->kv,
+            iter);
     }
 #endif
 
@@ -472,13 +500,14 @@ void ESolver_KS_LCAO_TDDFT::iter_init(const int istep, const int iter)
     // method
     if (PARAM.inp.sc_mag_switch && iter > PARAM.inp.sc_scf_nmin)
     {
-        SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>& sc = SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>::getScInstance();
+        SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>& sc
+            = SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>::getScInstance();
         sc.run_lambda_loop(iter - 1);
     }
 }
 void ESolver_KS_LCAO_TDDFT::cal_force(ModuleBase::matrix& force)
 {
-    if(atoms_fixed)
+    if (atoms_fixed)
     {
         return;
     }
@@ -688,7 +717,7 @@ void ESolver_KS_LCAO_TDDFT::update_pot(const int istep, const int iter)
     }
 
     if (elecstate::ElecStateLCAO<std::complex<double>>::out_wfc_lcao
-        && (this->conv_esolver || iter == PARAM.inp.scf_nmax) && (istep % PARAM.inp.out_interval == 0) )
+        && (this->conv_esolver || iter == PARAM.inp.scf_nmax) && (istep % PARAM.inp.out_interval == 0))
     {
         ModuleIO::write_wfc_nao(elecstate::ElecStateLCAO<std::complex<double>>::out_wfc_lcao,
                                 this->psi[0],
@@ -775,12 +804,13 @@ void ESolver_KS_LCAO_TDDFT::update_pot(const int istep, const int iter)
             }
         }
 
-            // calculate energy density matrix for tddft
-            if (istep >= (wf.init_wfc == "file" ? 0 : 1) && module_tddft::Evolve_elec::td_edm == 0 && (istep+1)%PARAM.inp.estep_per_md == 0)
-            {
-                this->cal_edm_tddft();
-            }
-        }    
+        // calculate energy density matrix for tddft
+        if (istep >= (wf.init_wfc == "file" ? 0 : 1) && module_tddft::Evolve_elec::td_edm == 0
+            && (istep + 1) % PARAM.inp.estep_per_md == 0)
+        {
+            this->cal_edm_tddft();
+        }
+    }
 
     // print "eigen value" for tddft
     /*if (this->conv_esolver)
@@ -825,17 +855,17 @@ void ESolver_KS_LCAO_TDDFT::after_scf(const int istep)
     {
         elecstate::DensityMatrix<std::complex<double>, double>* tmp_DM
             = dynamic_cast<elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec)->get_DM();
-        if(TD_Velocity::out_current_k)
+        if (TD_Velocity::out_current_k)
         {
             ModuleIO::write_current_eachk(istep,
-                                    this->psi,
-                                    pelec,
-                                    kv,
-                                    two_center_bundle_.overlap_orb.get(),
-                                    tmp_DM->get_paraV_pointer(),
-                                    orb_,
-                                    this->velocity_mat,
-                                    this->RA);
+                                          this->psi,
+                                          pelec,
+                                          kv,
+                                          two_center_bundle_.overlap_orb.get(),
+                                          tmp_DM->get_paraV_pointer(),
+                                          orb_,
+                                          this->velocity_mat,
+                                          this->RA);
         }
         else
         {
@@ -850,7 +880,44 @@ void ESolver_KS_LCAO_TDDFT::after_scf(const int istep)
                                     this->RA);
         }
     }
-    std::cout << "Potential (Ry): " << std::setprecision(15) << this->pelec->f_en.etot <<std::endl;
-}
 
+    if (TD_Velocity::out_current_comm == true)
+    {
+        if (TD_Velocity::out_current_k)
+        {
+            ModuleIO::write_current_eachk(
+                istep,
+                this->psi,
+                pelec,
+                kv,
+                &this->pv,
+                orb_,
+                this->r_calculator,
+                dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt)->getSR(),
+                dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt)->getHR(),
+#ifdef __EXX
+                &(this->exx_lri_complex->Hexxs)
+#endif
+            );
+        }
+        else
+        {
+            ModuleIO::write_current(
+                istep,
+                this->psi,
+                pelec,
+                kv,
+                &this->pv,
+                orb_,
+                this->r_calculator,
+                dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt)->getSR(),
+                dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt)->getHR(),
+#ifdef __EXX
+                &(this->exx_lri_complex->Hexxs)
+#endif
+            );
+        }
+    }
+    std::cout << "Potential (Ry): " << std::setprecision(15) << this->pelec->f_en.etot << std::endl;
+}
 } // namespace ModuleESolver
diff --git a/source/module_esolver/esolver_ks_lcao_tddft.h b/source/module_esolver/esolver_ks_lcao_tddft.h
index 46ae1a8519..fe71a8beeb 100644
--- a/source/module_esolver/esolver_ks_lcao_tddft.h
+++ b/source/module_esolver/esolver_ks_lcao_tddft.h
@@ -2,6 +2,7 @@
 #define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_ESOLVER_ESOLVER_KS_LCAO_TDDFT_H
 #include "esolver_ks.h"
 #include "esolver_ks_lcao.h"
+#include "module_io/cal_r_overlap_R.h"
 #include "module_elecstate/elecstate_lcao_tddft.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/record_adj.h"
 #include "module_psi/psi.h"
@@ -57,6 +58,8 @@ class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, doubl
     TD_current* velocity_mat = nullptr;
 
     TD_Velocity* td_p = nullptr;
+
+    cal_r_overlap_R r_calculator;
 };
 
 } // namespace ModuleESolver
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
index ca19bfb630..dbef4859d9 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
@@ -405,16 +405,33 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
         // Peize Lin add 2016-12-03
         // set xc type before the first cal of xc in pelec->init_scf
         // and calculate Cs, Vs
-        Operator<TK>* exx = new OperatorEXX<OperatorLCAO<TK, TR>>(this->hsk,
-            this->hR,
-            *this->kv,
-            Hexxd,
-            Hexxc,
-            Add_Hexx_Type::R,
-            istep,
-            exx_two_level_step,
-            !GlobalC::restart.info_load.restart_exx
-            && GlobalC::restart.info_load.load_H);
+        Operator<TK>* exx;
+        if (PARAM.inp.esolver_type == "tddft")
+        {
+            exx = new OperatorEXX<OperatorLCAO<TK, TR>>(this->hsk,
+                this->hR,
+                *this->kv,
+                Hexxd,
+                Hexxc,
+                Add_Hexx_Type::k,
+                istep,
+                exx_two_level_step,
+                !GlobalC::restart.info_load.restart_exx
+                && GlobalC::restart.info_load.load_H);
+        }
+        else
+        {
+            exx = new OperatorEXX<OperatorLCAO<TK, TR>>(this->hsk,
+                this->hR,
+                *this->kv,
+                Hexxd,
+                Hexxc,
+                Add_Hexx_Type::R,
+                istep,
+                exx_two_level_step,
+                !GlobalC::restart.info_load.restart_exx
+                && GlobalC::restart.info_load.load_H);
+        }
         this->getOperator()->add(exx);
     }
 #endif
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp
index 0288149557..174d5b8bb8 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp
@@ -8,6 +8,8 @@
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_hamilt_general/module_xc/xc_functional.h"
 #include "module_io/restart_exx_csr.h"
+#include "module_elecstate/potentials/H_TDDFT_pw.h"
+#include "module_hamilt_lcao/module_tddft/td_velocity.h"
 
 namespace hamilt
 {
@@ -203,7 +205,14 @@ void OperatorEXX<OperatorLCAO<TK, TR>>::contributeHR()
 {
     ModuleBase::TITLE("OperatorEXX", "contributeHR");
     // Peize Lin add 2016-12-03
-    if (this->istep == 0 && PARAM.inp.calculation != "nscf" && this->two_level_step != nullptr && *this->two_level_step == 0 && !this->restart) { return; }  //in the non-exx loop, do nothing 
+    if (this->istep == 0
+        && PARAM.inp.calculation != "nscf"
+        && this->two_level_step != nullptr && *this->two_level_step == 0
+        && PARAM.inp.init_wfc != "file"
+        && !this->restart)
+    {
+        return;
+    }  //in the non-exx loop, do nothing 
     if (this->add_hexx_type == Add_Hexx_Type::k) { return; }
 
     if (XC_Functional::get_func_type() == 4 || XC_Functional::get_func_type() == 5)
@@ -276,23 +285,36 @@ void OperatorEXX<OperatorLCAO<TK, TR>>::contributeHk(int ik)
                               || GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Ccp)
                                  ? 1.0
                                  : GlobalC::exx_info.info_global.hybrid_alpha;
-        if (GlobalC::exx_info.info_ri.real_number) {
-            RI_2D_Comm::add_Hexx(
-                this->kv,
-                ik,
-                coeff,
-                *this->Hexxd,
-                *this->hR->get_paraV(),
-                this->hsk->get_hk());
-        } else {
-            RI_2D_Comm::add_Hexx(
+        if (PARAM.inp.esolver_type == "tddft" && elecstate::H_TDDFT_pw::stype == 2){
+            RI_2D_Comm::add_Hexx_td(
                 this->kv,
                 ik,
                 coeff,
                 *this->Hexxc,
                 *this->hR->get_paraV(),
+                TD_Velocity::td_vel_op->cart_At,
                 this->hsk->get_hk());
-}
+        }
+        else{
+            if (GlobalC::exx_info.info_ri.real_number) {
+                RI_2D_Comm::add_Hexx(
+                    this->kv,
+                    ik,
+                    coeff,
+                    *this->Hexxd,
+                    *this->hR->get_paraV(),
+                    this->hsk->get_hk());
+            } else {
+                RI_2D_Comm::add_Hexx(
+                    this->kv,
+                    ik,
+                    coeff,
+                    *this->Hexxc,
+                    *this->hR->get_paraV(),
+                    this->hsk->get_hk());
+            }
+        }
+
     }
 }
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
index a9d3a88ae5..1157190e7b 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
@@ -177,14 +177,17 @@ void OperatorLCAO<TK, TR>::init(const int ik_in) {
         case calculation_type::lcao_exx:
         {
             //update HR first
-            if (!this->hr_done)
+            if (!this->hr_done && PARAM.inp.esolver_type != "tddft")
             {
                 this->contributeHR();
             }
 
+            if (PARAM.inp.esolver_type == "tddft")
+            {
+                this->contributeHk(ik_in);
+            }
             //update HK next
             //in cal_type=lcao_exx, HK only need to update from one node
-            // this->contributeHk(ik_in);
 
         break;
     }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
index 7470c2e549..f892a956cf 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
@@ -348,6 +348,10 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHR()
         {
             TD_Velocity::td_vel_op->initialize_current_term(this->hR_tmp, paraV);
         }
+        if (TD_Velocity::out_current_comm)
+        {
+            TD_Velocity::td_vel_op->set_velocity_HR(this->hR_tmp);
+        }
         // calculate the values in hR_tmp
         this->update_td();
         this->hR_tmp->set_zero();
diff --git a/source/module_hamilt_lcao/module_hcontainer/func_folding.cpp b/source/module_hamilt_lcao/module_hcontainer/func_folding.cpp
index af8f7c76d5..2917aee21e 100644
--- a/source/module_hamilt_lcao/module_hcontainer/func_folding.cpp
+++ b/source/module_hamilt_lcao/module_hcontainer/func_folding.cpp
@@ -67,6 +67,38 @@ void folding_HR(const hamilt::HContainer<TR>& hR,
     }*/
 }
 
+template<typename TR>
+void folding_partial_HR(const hamilt::HContainer<TR>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                UnitCell& ucell,
+                const int ix,
+                const int ncol,
+                const int hk_type)
+{
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int i = 0; i < hR.size_atom_pairs(); ++i)
+    {
+        hamilt::AtomPair<TR>& tmp = hR.get_atom_pair(i);
+        for(int ir = 0;ir < tmp.get_R_size(); ++ir )
+        {
+            const ModuleBase::Vector3<int> r_index = tmp.get_R_index(ir);
+            const ModuleBase::Vector3<double> dR(r_index.x, r_index.y, r_index.z);
+            const double arg = (kvec_d_in * dR) * ModuleBase::TWO_PI;
+            double sinp, cosp;
+            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+            std::complex<double> kphase = std::complex<double>(cosp, sinp);
+            const ModuleBase::Vector3<double> dR_car = dR * ucell.latvec * ucell.lat0;
+
+            tmp.find_R(r_index);
+            tmp.add_to_matrix(hk, ncol, kphase * ModuleBase::IMAG_UNIT * std::complex<double>(dR_car[ix]), hk_type);
+        }
+    }
+}
+
+
 // template instantiation
 template void folding_HR<std::complex<double>>(const hamilt::HContainer<std::complex<double>>& hR,
                                             std::complex<double>* hk,
@@ -78,6 +110,20 @@ template void folding_HR<double>(const hamilt::HContainer<double>& hR,
                                 const ModuleBase::Vector3<double>& kvec_d_in,
                                 const int ncol,
                                 const int hk_type);
+template void folding_partial_HR<std::complex<double>>(const hamilt::HContainer<std::complex<double>>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                UnitCell& ucell,
+                const int ix,
+                const int ncol,
+                const int hk_type);
+template void folding_partial_HR<double>(const hamilt::HContainer<double>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                UnitCell& ucell,
+                const int ix,
+                const int ncol,
+                const int hk_type);
 // special case for double
 void folding_HR(const hamilt::HContainer<double>& hR,
                 double* hk,
diff --git a/source/module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h b/source/module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h
index 31bf65e3da..60b36ed628 100644
--- a/source/module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h
+++ b/source/module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h
@@ -24,6 +24,14 @@ void folding_HR(const hamilt::HContainer<double>& hR,
                 const ModuleBase::Vector3<double>& kvec_d_in,
                 const int ncol,
                 const int hk_type);
+template<typename TR>
+void folding_partial_HR(const hamilt::HContainer<TR>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                UnitCell& ucell,
+                const int ix,
+                const int ncol,
+                const int hk_type);
 
 #ifdef __MPI
 /**
diff --git a/source/module_hamilt_lcao/module_tddft/CMakeLists.txt b/source/module_hamilt_lcao/module_tddft/CMakeLists.txt
index 8b3f81af7d..d4020ee71e 100644
--- a/source/module_hamilt_lcao/module_tddft/CMakeLists.txt
+++ b/source/module_hamilt_lcao/module_tddft/CMakeLists.txt
@@ -11,7 +11,6 @@ if(ENABLE_LCAO)
         td_velocity.cpp
         td_current.cpp
         snap_psibeta_half_tddft.cpp
-        td_folding_mixing.cpp
     )
 
     add_library(
diff --git a/source/module_hamilt_lcao/module_tddft/td_folding_mixing.cpp b/source/module_hamilt_lcao/module_tddft/td_folding_mixing.cpp
deleted file mode 100644
index 4b058896a2..0000000000
--- a/source/module_hamilt_lcao/module_tddft/td_folding_mixing.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "td_velocity.h"
-#include "module_base/libm/libm.h"
-#include "module_hamilt_lcao/module_tddft/td_velocity.h"
-
-void TD_Velocity::folding_HR_td(const hamilt::HContainer<double>& hR,
-                std::complex<double>* hk,
-                const ModuleBase::Vector3<double>& kvec_d_in,
-                const int ncol,
-                const int hk_type)
-{
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (int i = 0; i < hR.size_atom_pairs(); ++i)
-    {
-        hamilt::AtomPair<double>& tmp = hR.get_atom_pair(i);
-        for(int ir = 0;ir < tmp.get_R_size(); ++ir )
-        {
-            const ModuleBase::Vector3<int> r_index = tmp.get_R_index(ir);
-
-            //new
-            //cal tddft phase for mixing gague
-            const int iat1 = tmp.get_atom_i();
-            const int iat2 = tmp.get_atom_j();
-            ModuleBase::Vector3<double> dtau = ucell->cal_dtau(iat1, iat2, r_index);
-            const double arg_td = cart_At * dtau * ucell->lat0;
-            /*std::cout << "arg_td" << arg_td << std::endl;
-            std::cout << "cart_At" << cart_At[0] << " "<< cart_At[1] << " " << cart_At[2] << std::endl;
-            std::cout << "dtau" << dtau[0] << " "<< dtau[1] << " " << dtau[2] << std::endl;
-            std::cout << "ucell->lat0" << ucell->lat0 << std::endl;
-            std::cout << "iat1" << iat1 << " " << "iat2" << iat2 << std::endl;*/
-
-            //new
-            // cal k_phase
-            // if TK==std::complex<double>, kphase is e^{ikR}
-            const ModuleBase::Vector3<double> dR(r_index.x, r_index.y, r_index.z);
-            const double arg = (kvec_d_in * dR) * ModuleBase::TWO_PI + arg_td;
-            double sinp, cosp;
-            ModuleBase::libm::sincos(arg, &sinp, &cosp);
-            std::complex<double> kphase = std::complex<double>(cosp, sinp);
-
-            tmp.find_R(r_index);
-            tmp.add_to_matrix(hk, ncol, kphase, hk_type);
-        }
-    }
-}
diff --git a/source/module_hamilt_lcao/module_tddft/td_velocity.cpp b/source/module_hamilt_lcao/module_tddft/td_velocity.cpp
index bb5213b560..15002d86a9 100644
--- a/source/module_hamilt_lcao/module_tddft/td_velocity.cpp
+++ b/source/module_hamilt_lcao/module_tddft/td_velocity.cpp
@@ -8,6 +8,7 @@ bool TD_Velocity::out_mat_R = false;
 bool TD_Velocity::out_vecpot = false;
 bool TD_Velocity::out_current = false;
 bool TD_Velocity::out_current_k = false;
+bool TD_Velocity::out_current_comm = false;
 bool TD_Velocity::init_vecpot_file = false;
 bool TD_Velocity::evolve_once = false;
 
diff --git a/source/module_hamilt_lcao/module_tddft/td_velocity.h b/source/module_hamilt_lcao/module_tddft/td_velocity.h
index 88696d0739..61dc3b9a2a 100644
--- a/source/module_hamilt_lcao/module_tddft/td_velocity.h
+++ b/source/module_hamilt_lcao/module_tddft/td_velocity.h
@@ -30,6 +30,9 @@ class TD_Velocity
     /// @brief switch to control the output of current
     static bool out_current;
 
+    /// @brief switch to control the output of current based on i[r, H] directly
+    static bool out_current_comm;
+
     /// @brief switch to control the format of the output current, in total or in each k-point
     static bool out_current_k;
 
@@ -66,16 +69,31 @@ class TD_Velocity
     {
         return this->current_term[i];
     }
+
+    // set velocity HR.
+    void set_velocity_HR(hamilt::HContainer<std::complex<double>>* HR)
+    {
+        this->velocity_HR = HR;
+    }
+
+    hamilt::HContainer<std::complex<double>>* get_velocity_HR_pointer() const
+    {
+        return this->velocity_HR;
+    }
+
         // folding HR to hk, for mixing gague
-    void folding_HR_td(const hamilt::HContainer<double>& hR,
+    template<typename TR>
+    void folding_HR_td(const hamilt::HContainer<TR>& hR,
                 std::complex<double>* hk,
                 const ModuleBase::Vector3<double>& kvec_d_in,
                 const int ncol,
                 const int hk_type);
 
-    void folding_HR_td_c(const hamilt::HContainer<std::complex<double>>& hR,
+    template<typename TR>
+    void folding_partial_HR_td(const hamilt::HContainer<TR>& hR,
                 std::complex<double>* hk,
                 const ModuleBase::Vector3<double>& kvec_d_in,
+                const int ix,
                 const int ncol,
                 const int hk_type);
 
@@ -115,6 +133,10 @@ class TD_Velocity
 
     /// @brief part of Momentum operator, -i∇ - i[r,Vnl]. Used to calculate current.
     std::vector<hamilt::HContainer<std::complex<double>>*> current_term = {nullptr, nullptr, nullptr};
+
+    /// @brief store kinetic hamilton
+    hamilt::HContainer<std::complex<double>>* velocity_HR = nullptr;
 };
+#include "td_velocity.hpp"
 
 #endif
diff --git a/source/module_hamilt_lcao/module_tddft/td_velocity.hpp b/source/module_hamilt_lcao/module_tddft/td_velocity.hpp
new file mode 100644
index 0000000000..19d31ea64a
--- /dev/null
+++ b/source/module_hamilt_lcao/module_tddft/td_velocity.hpp
@@ -0,0 +1,93 @@
+#ifndef TD_VELOCITY_HPP
+#define TD_VELOCITY_HPP
+
+#include "td_velocity.h"
+#include "module_base/libm/libm.h"
+#include "module_hamilt_lcao/module_tddft/td_velocity.h"
+
+template<typename TR>
+void TD_Velocity::folding_HR_td(const hamilt::HContainer<TR>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                const int ncol,
+                const int hk_type)
+{
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int i = 0; i < hR.size_atom_pairs(); ++i)
+    {
+        hamilt::AtomPair<TR>& tmp = hR.get_atom_pair(i);
+        for(int ir = 0;ir < tmp.get_R_size(); ++ir )
+        {
+            const ModuleBase::Vector3<int> r_index = tmp.get_R_index(ir);
+
+            //new
+            //cal tddft phase for mixing gague
+            const int iat1 = tmp.get_atom_i();
+            const int iat2 = tmp.get_atom_j();
+            ModuleBase::Vector3<double> dtau = ucell->cal_dtau(iat1, iat2, r_index);
+            const double arg_td = cart_At * dtau * ucell->lat0;
+            /*std::cout << "arg_td" << arg_td << std::endl;
+            std::cout << "cart_At" << cart_At[0] << " "<< cart_At[1] << " " << cart_At[2] << std::endl;
+            std::cout << "dtau" << dtau[0] << " "<< dtau[1] << " " << dtau[2] << std::endl;
+            std::cout << "ucell->lat0" << ucell->lat0 << std::endl;
+            std::cout << "iat1" << iat1 << " " << "iat2" << iat2 << std::endl;*/
+
+            //new
+            // cal k_phase
+            // if TK==std::complex<double>, kphase is e^{ikR}
+            const ModuleBase::Vector3<double> dR(r_index.x, r_index.y, r_index.z);
+            const double arg = (kvec_d_in * dR) * ModuleBase::TWO_PI + arg_td;
+            double sinp, cosp;
+            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+            std::complex<double> kphase = std::complex<double>(cosp, sinp);
+
+            tmp.find_R(r_index);
+            tmp.add_to_matrix(hk, ncol, kphase, hk_type);
+        }
+    }
+}
+
+template<typename TR>
+void TD_Velocity::folding_partial_HR_td(const hamilt::HContainer<TR>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                const int ix,
+                const int ncol,
+                const int hk_type)
+{
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int i = 0; i < hR.size_atom_pairs(); ++i)
+    {
+        hamilt::AtomPair<TR>& tmp = hR.get_atom_pair(i);
+        for(int ir = 0;ir < tmp.get_R_size(); ++ir )
+        {
+            const ModuleBase::Vector3<int> r_index = tmp.get_R_index(ir);
+
+            //new
+            //cal tddft phase for mixing gague
+            const int iat1 = tmp.get_atom_i();
+            const int iat2 = tmp.get_atom_j();
+            ModuleBase::Vector3<double> dtau = ucell->cal_dtau(iat1, iat2, r_index);
+            const double arg_td = cart_At * dtau * ucell->lat0;
+
+            //new
+            // cal k_phase
+            // if TK==std::complex<double>, kphase is e^{ikR}
+            const ModuleBase::Vector3<double> dR(r_index.x, r_index.y, r_index.z);
+            const double arg = (kvec_d_in * dR) * ModuleBase::TWO_PI + arg_td;
+            double sinp, cosp;
+            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+            std::complex<double> kphase = std::complex<double>(cosp, sinp);
+            const ModuleBase::Vector3<double> dR_car = dR * ucell->latvec * ucell->lat0;
+
+            tmp.find_R(r_index);
+            tmp.add_to_matrix(hk, ncol, kphase * ModuleBase::IMAG_UNIT * std::complex<double>(dR_car[ix]), hk_type);
+        }
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/source/module_io/cal_r_overlap_R.h b/source/module_io/cal_r_overlap_R.h
index e9e02a14f3..e1142364ff 100644
--- a/source/module_io/cal_r_overlap_R.h
+++ b/source/module_io/cal_r_overlap_R.h
@@ -69,6 +69,10 @@ class cal_r_overlap_R
     std::vector<int> iw2iN;
     std::vector<int> iw2it;
 
+  private:
+    void initialize_orb_table(const LCAO_Orbitals& orb);
+    void construct_orbs_and_orb_r(const LCAO_Orbitals& orb);
+
     ModuleBase::Sph_Bessel_Recursive::D2* psb_ = nullptr;
     ORB_gaunt_table MGT;
 
diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp
index 7c21161e05..2a8ea79a1a 100644
--- a/source/module_io/input_conv.cpp
+++ b/source/module_io/input_conv.cpp
@@ -303,6 +303,7 @@ void Input_Conv::Convert()
     module_tddft::Evolve_elec::td_edm = PARAM.inp.td_edm;
     TD_Velocity::out_current = PARAM.inp.out_current;
     TD_Velocity::out_current_k = PARAM.inp.out_current_k;
+    TD_Velocity::out_current_comm = PARAM.inp.out_current_comm;
     TD_Velocity::out_vecpot = PARAM.inp.out_vecpot;
     TD_Velocity::init_vecpot_file = PARAM.inp.init_vecpot_file;
     read_td_efield();
diff --git a/source/module_io/input_conv.h b/source/module_io/input_conv.h
index 5ce3ac4190..d7bf46f55f 100644
--- a/source/module_io/input_conv.h
+++ b/source/module_io/input_conv.h
@@ -47,10 +47,15 @@ void parse_expression(const std::string& fn, std::vector<T>& vec)
 {
     ModuleBase::TITLE("Input_Conv", "parse_expression");
     int count = 0;
-    std::string pattern("([-+]?[0-9]+\\*[-+]?[0-9.]+|[-+]?[0-9,.]+)");
+
+    // Update the regex pattern to handle scientific notation
+    std::string pattern("([-+]?[0-9]+\\*[-+]?[0-9.eE+-]+|[-+]?[0-9,.eE+-]+)");
+
     std::vector<std::string> str;
     std::stringstream ss(fn);
     std::string section;
+
+    // Split the input string into substrings by spaces
     while (ss >> section)
     {
         int index = 0;
@@ -64,24 +69,14 @@ void parse_expression(const std::string& fn, std::vector<T>& vec)
         section.erase(0, index);
         str.push_back(section);
     }
-    // std::string::size_type pos1, pos2;
-    // std::string c = " ";
-    // pos2 = fn.find(c);
-    // pos1 = 0;
-    // while (std::string::npos != pos2)
-    // {
-    //     str.push_back(fn.substr(pos1, pos2 - pos1));
-    //     pos1 = pos2 + c.size();
-    //     pos2 = fn.find(c, pos1);
-    // }
-    // if (pos1 != fn.length())
-    // {
-    //     str.push_back(fn.substr(pos1));
-    // }
+
+    // Compile the regular expression
     regex_t reg;
     regcomp(&reg, pattern.c_str(), REG_EXTENDED);
     regmatch_t pmatch[1];
     const size_t nmatch = 1;
+
+    // Loop over each section and apply regex to extract numbers
     for (size_t i = 0; i < str.size(); ++i)
     {
         if (str[i] == "")
@@ -90,25 +85,28 @@ void parse_expression(const std::string& fn, std::vector<T>& vec)
         }
         int status = regexec(&reg, str[i].c_str(), nmatch, pmatch, 0);
         std::string sub_str = "";
+
+        // Extract the matched substring
         for (size_t j = pmatch[0].rm_so; j != pmatch[0].rm_eo; ++j)
         {
             sub_str += str[i][j];
         }
+
+        // Check if the substring contains multiplication (e.g., "2*3.14")
         std::string sub_pattern("\\*");
         regex_t sub_reg;
         regcomp(&sub_reg, sub_pattern.c_str(), REG_EXTENDED);
         regmatch_t sub_pmatch[1];
         const size_t sub_nmatch = 1;
+
         if (regexec(&sub_reg, sub_str.c_str(), sub_nmatch, sub_pmatch, 0) == 0)
         {
             int pos = sub_str.find("*");
             int num = stoi(sub_str.substr(0, pos));
-            assert(num>=0);
+            assert(num >= 0);
             T occ = stof(sub_str.substr(pos + 1, sub_str.size()));
-            // std::vector<double> ocp_temp(num, occ);
-            // const std::vector<double>::iterator dest = vec.begin() + count;
-            // copy(ocp_temp.begin(), ocp_temp.end(), dest);
-            // count += num;
+            
+            // Add the value to the vector `num` times
             for (size_t k = 0; k != num; k++)
             {
                 vec.emplace_back(occ);
@@ -116,16 +114,17 @@ void parse_expression(const std::string& fn, std::vector<T>& vec)
         }
         else
         {
-            // vec[count] = stof(sub_str);
-            // count += 1;
+            // Handle scientific notation and convert to T
             std::stringstream convert;
             convert << sub_str;
             T occ;
             convert >> occ;
             vec.emplace_back(occ);
         }
+
         regfree(&sub_reg);
     }
+
     regfree(&reg);
 }
 
diff --git a/source/module_io/read_input_item_exx_dftu.cpp b/source/module_io/read_input_item_exx_dftu.cpp
index d2e57fda53..c29a175da3 100644
--- a/source/module_io/read_input_item_exx_dftu.cpp
+++ b/source/module_io/read_input_item_exx_dftu.cpp
@@ -89,7 +89,7 @@ void ReadInput::item_exx()
                 {
                     para.input.exx_hybrid_beta = "0.8";
                 }
-                else if (dft_functional_lower == "hse")
+                else if (para.input.exx_use_ewald && dft_functional_lower == "hse")
                 {
                     para.input.exx_hybrid_beta = "0.25";
                 }
@@ -113,7 +113,7 @@ void ReadInput::item_exx()
                 std::transform(dft_functional.begin(), dft_functional.end(), dft_functional_lower.begin(), tolower);
                 if (dft_functional_lower == "lc_pbe")
                 {
-                    para.input.exx_hse_omega = "0.11";
+                    para.input.exx_hse_omega = "0.33";
                 }
                 else if (dft_functional_lower == "lc_wpbe")
                 {
@@ -199,6 +199,10 @@ void ReadInput::item_exx()
                     para.input.exx_real_number = "0";
                 }
             }
+            if (para.input.esolver_type == "tddft" && (para.input.exx_hybrid_alpha != "0" || para.input.exx_use_ewald != false) && para.input.exx_real_number != "0")
+            {
+                ModuleBase::WARNING_QUIT("ReadInput", "For RT-TDDFT with hybrid functionals, only exx_real_number=0 is supported");
+            }
         };
         this->add_item(item);
     }
diff --git a/source/module_io/read_input_item_output.cpp b/source/module_io/read_input_item_output.cpp
index 7709f60213..5465d02768 100644
--- a/source/module_io/read_input_item_output.cpp
+++ b/source/module_io/read_input_item_output.cpp
@@ -449,6 +449,12 @@ void ReadInput::item_output()
         read_sync_bool(input.out_current_k);
         this->add_item(item);
     }
+    {
+        Input_Item item("out_current_comm");
+        item.annotation = "output current using complete basis or not";
+        read_sync_bool(input.out_current_comm);
+        this->add_item(item);
+    }
     {
         Input_Item item("out_vecpot");
         item.annotation = "output TDDFT vector potential or not";
diff --git a/source/module_io/td_current_io.cpp b/source/module_io/td_current_io.cpp
index 0abfd73308..1903b66e0c 100644
--- a/source/module_io/td_current_io.cpp
+++ b/source/module_io/td_current_io.cpp
@@ -4,20 +4,948 @@
 #include "module_base/global_variable.h"
 #include "module_base/libm/libm.h"
 #include "module_base/parallel_reduce.h"
+#include "module_base/scalapack_connector.h"
 #include "module_base/timer.h"
 #include "module_base/tool_threading.h"
 #include "module_base/vector3.h"
 #include "module_elecstate/module_dm/cal_dm_psi.h"
 #include "module_elecstate/potentials/H_TDDFT_pw.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
+#include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
 #include "module_hamilt_lcao/module_tddft/td_velocity.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_parameter/parameter.h"
+#ifdef __EXX
+#include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.h"
+#include "module_ri/Exx_LRI.h"
+#endif
 
 #ifdef __LCAO
+// void ModuleIO::folding_rR(const hamilt::HContainer<double>* rR,
+//                           const std::complex<double>* partial_sk,
+//                           std::complex<double>* rk,
+//                           const Parallel_Orbitals* pv,
+//                           const ModuleBase::Vector3<double>& kvec_d_in,
+//                           const int ncol,
+//                           const int hk_type)
+// {
+// std::complex<double>* rk_up = new std::complex<double>[pv->nloc];
+// ModuleBase::GlobalFunc::ZEROS(rk_up, pv->nloc);
+// std::complex<double>* partial_sk_up = new std::complex<double>[pv->nloc];
+// ModuleBase::GlobalFunc::ZEROS(partial_sk_up, pv->nloc);
+
+// hamilt::folding_HR(*rR, rk, kvec_d_in, ncol, 1); // set r(k) tmp
+// for (int ir = 0; ir < pv->nrow; ir++)
+// {
+//     // const int iwt1 = pv->local2global_row(ir);
+//     // const int iat1 = GlobalC::ucell.iwt2iat[iwt1];
+//     for (int ic = ir; ic < pv->ncol; ic++)  // store r(k) upper triangle
+//     {
+//         // const int iwt2 = pv->local2global_col(ic);
+//         // const int iat2 = GlobalC::ucell.iwt2iat[iwt2];
+//         const int irc = ic * pv->nrow + ir;
+//         rk_up(ir, ic) = rk[irc];
+//         partial_sk_up(ir, ic) = partial_sk[irc];
+//     }
+// }
+// rk_up += ModuleBase::IMAG_UNIT * partial_sk_up;
+// ModuleBase::ComplexMatrix rk_down = ModuleBase::transpose(rk_up, true);
+// for (int ir = 0; ir < pv->nrow; ir++)
+// {
+//     for (int ic = 0; ic <= ir; ic++)  // // set r(k) lower triangle
+//     {
+//         const int irc = ic * pv->nrow + ir;
+//         rk[irc] = rk_down(ir, ic);
+//     }
+// }
+
+//     hamilt::folding_HR(*rR, rk, kvec_d_in, ncol, 1); // set r(k) tmp
+
+//     for (int ir = 0; ir < pv->nrow; ir++)
+//     {
+//        for (int ic = 0; ic < ir; ic++)  // set r(k) lower triangle
+//        {
+//            const int irc = ic * pv->nrow + ir;
+//            const int icr = ir * pv->ncol + ic;
+//            rk[irc] = std::conj(rk[icr] + ModuleBase::IMAG_UNIT * partial_sk[icr]);
+//            std::cout<<"ir: "<<ir<<" ic: "<<ic<<" rk_up: "<<rk[icr]<<" partial_sk_up: "<<partial_sk[icr]<<" rk:
+//            "<<rk[irc]<<std::endl;
+//        }
+//     }
+// }
+
+template <typename TR, typename TA>
+void ModuleIO::init_from_hR(const hamilt::HContainer<TR>* hR, hamilt::HContainer<TA>* aimR)
+{
+    ModuleBase::TITLE("ModuleIO", "init_from_hR");
+    ModuleBase::timer::tick("ModuleIO", "init_from_hR");
+    for (int i = 0; i < hR->size_atom_pairs(); i++)
+    {
+        hamilt::AtomPair<TR> atom_ij = hR->get_atom_pair(i);
+        const int iat1 = atom_ij.get_atom_i();
+        const int iat2 = atom_ij.get_atom_j();
+        for (int iR = 0; iR < atom_ij.get_R_size(); iR++)
+        {
+            const ModuleBase::Vector3<int> r_index = atom_ij.get_R_index(iR);
+            hamilt::AtomPair<TA> atom_ij_ta(iat1, iat2, r_index, hR->get_paraV());
+            aimR->insert_pair(atom_ij_ta);
+        }
+    }
+    aimR->allocate(nullptr, true);
+
+    ModuleBase::timer::tick("ModuleIO", "init_from_hR");
+}
+void ModuleIO::init_from_adj(const LCAO_Orbitals& orb,
+                             const Parallel_Orbitals* pv,
+                             std::vector<AdjacentAtomInfo>& adjs_all,
+                             ModuleBase::Vector3<hamilt::HContainer<double>*>& rR)
+{
+    ModuleBase::TITLE("ModuleIOTD_mixing_pot", "init_from_adj");
+    ModuleBase::timer::tick("ModuleIO", "init_from_adj");
+
+    auto orb_cutoff_ = orb.cutoffs();
+    adjs_all.clear();
+    adjs_all.reserve(GlobalC::ucell.nat);
+
+    for (int iat1 = 0; iat1 < GlobalC::ucell.nat; iat1++)
+    {
+        auto tau1 = GlobalC::ucell.get_tau(iat1);
+        int T1, I1;
+        GlobalC::ucell.iat2iait(iat1, &I1, &T1);
+        AdjacentAtomInfo adjs;
+        GlobalC::GridD.Find_atom(GlobalC::ucell, tau1, T1, I1, &adjs);
+        std::vector<bool> is_adj(adjs.adj_num + 1, false);
+        for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
+        {
+            const int T2 = adjs.ntype[ad1];
+            const int I2 = adjs.natom[ad1];
+            const int iat2 = GlobalC::ucell.itia2iat(T2, I2);
+            if (pv->get_row_size(iat1) <= 0 || pv->get_col_size(iat2) <= 0)
+            {
+                continue;
+            }
+            const ModuleBase::Vector3<int>& R_index2 = adjs.box[ad1];
+            // choose the real adjacent atoms
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
+            // but the calculated value is not zero due to the numerical error, which would lead to result changes.
+            if (GlobalC::ucell.cal_dtau(iat1, iat2, R_index2).norm() * GlobalC::ucell.lat0
+                < orb_cutoff_[T1] + orb_cutoff_[T2])
+            {
+                is_adj[ad1] = true;
+            }
+        }
+        filter_adjs(is_adj, adjs);
+        adjs_all.push_back(adjs);
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            const int T2 = adjs.ntype[ad];
+            const int I2 = adjs.natom[ad];
+            int iat2 = GlobalC::ucell.itia2iat(T2, I2);
+            ModuleBase::Vector3<int>& R_index = adjs.box[ad];
+            hamilt::AtomPair<double> tmp(iat1, iat2, R_index, pv);
+            for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+            {
+                rR[i_alpha]->insert_pair(tmp);
+            }
+        }
+    }
+    // allocate the memory of BaseMatrix in HR, and set the new values to zero
+    for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+    {
+        rR[i_alpha]->allocate(nullptr, true);
+    }
+    ModuleBase::timer::tick("ModuleIO", "init_from_adj");
+}
+
+void ModuleIO::set_rR_from_hR(const LCAO_Orbitals& orb,
+                              const Parallel_Orbitals* pv,
+                              cal_r_overlap_R& r_calculator,
+                              const hamilt::HContainer<std::complex<double>>* hR,
+                              ModuleBase::Vector3<hamilt::HContainer<double>*>& rR)
+{
+    ModuleBase::TITLE("ModuleIO", "set_rR_from_hR");
+    ModuleBase::timer::tick("ModuleIO", "set_rR_from_hR");
+
+    // init
+    std::vector<AdjacentAtomInfo> adjs_all;
+    init_from_adj(orb, pv, adjs_all, rR);
+
+    for (int iat1 = 0; iat1 < GlobalC::ucell.nat; iat1++)
+    {
+        auto tau1 = GlobalC::ucell.get_tau(iat1);
+        int T1, I1;
+        GlobalC::ucell.iat2iait(iat1, &I1, &T1);
+        AdjacentAtomInfo& adjs = adjs_all[iat1];
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            const int T2 = adjs.ntype[ad];
+            const int I2 = adjs.natom[ad];
+            const int iat2 = GlobalC::ucell.itia2iat(T2, I2);
+            const ModuleBase::Vector3<int>& r_index = adjs.box[ad];
+            ModuleBase::Vector3<double> dtau = GlobalC::ucell.cal_dtau(iat1, iat2, r_index);
+
+            Atom& atom1 = GlobalC::ucell.atoms[T1];
+            Atom& atom2 = GlobalC::ucell.atoms[T2];
+            const int npol = GlobalC::ucell.get_npol();
+
+            const int* iw2l1 = atom1.iw2l;
+            const int* iw2n1 = atom1.iw2n;
+            const int* iw2m1 = atom1.iw2m;
+            const int* iw2l2 = atom2.iw2l;
+            const int* iw2n2 = atom2.iw2n;
+            const int* iw2m2 = atom2.iw2m;
+
+            auto row_indexes = pv->get_indexes_row(iat1);
+            auto col_indexes = pv->get_indexes_col(iat2);
+
+            const ModuleBase::Vector3<double>& tau1 = GlobalC::ucell.get_tau(iat1);
+            // std::cout << "tau1: " << tau1 << " tau2: " << GlobalC::ucell.get_tau(iat2) << " r_index: " << r_index
+            //           << std::endl;
+            const ModuleBase::Vector3<double> tau2 = tau1 + dtau;
+            for (int iw1l = 0; iw1l < row_indexes.size(); iw1l += npol)
+            {
+                const int iw1 = row_indexes[iw1l] / npol;
+                const int L1 = iw2l1[iw1];
+                const int N1 = iw2n1[iw1];
+                const int m1 = iw2m1[iw1];
+
+                for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
+                {
+                    const int iw2 = col_indexes[iw2l] / npol;
+                    const int L2 = iw2l2[iw2];
+                    const int N2 = iw2n2[iw2];
+                    const int m2 = iw2m2[iw2];
+
+                    // std::cout<<"L1: "<<L1<<" L2: "<<L2<<" N1: "<<N1<<" N2: "<<N2<<" m1: "<<m1<<" m2:
+                    // "<<m2<<std::endl;
+                    ModuleBase::Vector3<double> tmp_r = r_calculator.get_psi_r_psi(tau1 * GlobalC::ucell.lat0,
+                                                                                   T1,
+                                                                                   L1,
+                                                                                   m1,
+                                                                                   N1,
+                                                                                   tau2 * GlobalC::ucell.lat0,
+                                                                                   T2,
+                                                                                   L2,
+                                                                                   m2,
+                                                                                   N2);
+                    for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+                    {
+                        hamilt::BaseMatrix<double>* HlocR
+                            = rR[i_alpha]->find_matrix(iat1, iat2, r_index.x, r_index.y, r_index.z);
+                        HlocR->add_element(iw1, iw2, tmp_r[i_alpha]);
+                        // if (i_alpha == 2)
+                        // {
+                        //     std::cout << "iw1: " << iw1 << " iw2: " << iw2 << " i_alpha: " << i_alpha
+                        //               << " tmp_r: " << tmp_r[i_alpha] << std::endl;
+                        // }
+                    }
+                }
+            }
+        }
+    }
+
+    // for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+    //     init_from_hR(hR, rR[i_alpha]);
+
+    // //std::cout<<"hR->size_atom_pairs()"<<hR->size_atom_pairs()<<std::endl;
+    // for (int i = 0; i < hR->size_atom_pairs(); i++)
+    // {
+    //     hamilt::AtomPair<std::complex<double>> atom_ij = hR->get_atom_pair(i);
+    //     // loop R-index
+    //     //std::cout<<"HR size: "<<atom_ij.get_R_size()<<std::endl;
+    //     for (int iR = 0; iR < atom_ij.get_R_size(); iR++)
+    //     {
+    //         // get reference of target atom-pair
+    //         const int iat1 = atom_ij.get_atom_i();
+    //         const int iat2 = atom_ij.get_atom_j();
+    //         const ModuleBase::Vector3<int> r_index = atom_ij.get_R_index(iR);
+    //         // ---------------------------------------------
+    //         // get info of orbitals of atom1 and atom2 from ucell
+    //         // ---------------------------------------------
+    //         int T1, I1;
+    //         GlobalC::ucell.iat2iait(iat1, &I1, &T1);
+    //         int T2, I2;
+    //         GlobalC::ucell.iat2iait(iat2, &I2, &T2);
+    //         Atom& atom1 = GlobalC::ucell.atoms[T1];
+    //         Atom& atom2 = GlobalC::ucell.atoms[T2];
+    //         std::cout<<"iat1: "<<iat1<<" iat2: "<<iat2<<" r_index: "<<r_index<<std::endl;
+    //         // npol is the number of polarizations,
+    //         // 1 for non-magnetic (one Hamiltonian matrix only has spin-up or spin-down),
+    //         // 2 for magnetic (one Hamiltonian matrix has both spin-up and spin-down)
+    //         const int npol = GlobalC::ucell.get_npol();
+    //         //std::cout<<"npol: "<<npol<<std::endl;
+
+    //         const int* iw2l1 = atom1.iw2l;
+    //         const int* iw2n1 = atom1.iw2n;
+    //         const int* iw2m1 = atom1.iw2m;
+    //         const int* iw2l2 = atom2.iw2l;
+    //         const int* iw2n2 = atom2.iw2n;
+    //         const int* iw2m2 = atom2.iw2m;
+
+    //         auto row_indexes = pv->get_indexes_row(iat1);
+    //         auto col_indexes = pv->get_indexes_col(iat2);
+    //         const int step_trace = col_indexes.size() + 1;
+
+    //         const ModuleBase::Vector3<double>& tau1 = GlobalC::ucell.get_tau(iat1);
+    //         const ModuleBase::Vector3<double> tau2 = tau1 + GlobalC::ucell.cal_dtau(iat1, iat2, r_index);
+    //         std::cout<<"tau1: "<<tau1<<" tau2: "<<tau2<<std::endl;
+    //         for (int iw1l = 0; iw1l < row_indexes.size(); iw1l += npol)
+    //         {
+    //             const int iw1 = row_indexes[iw1l] / npol;
+    //             const int L1 = iw2l1[iw1];
+    //             const int N1 = iw2n1[iw1];
+    //             const int m1 = iw2m1[iw1];
+
+    //             for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
+    //             {
+    //                 const int iw2 = col_indexes[iw2l] / npol;
+    //                 const int L2 = iw2l2[iw2];
+    //                 const int N2 = iw2n2[iw2];
+    //                 const int m2 = iw2m2[iw2];
+
+    //                 ModuleBase::Vector3<double> tmp_r = r_calculator.get_psi_r_psi(tau1 * GlobalC::ucell.lat0,
+    //                                                                                T1,
+    //                                                                                L1,
+    //                                                                                m1,
+    //                                                                                N1,
+    //                                                                                tau2 * GlobalC::ucell.lat0,
+    //                                                                                T2,
+    //                                                                                L2,
+    //                                                                                m2,
+    //                                                                                N2);
+    //                 for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+    //                 {
+    //                     hamilt::BaseMatrix<double>* HlocR
+    //                         = rR[i_alpha]->find_matrix(iat1, iat2, r_index.x, r_index.y, r_index.z);
+    //                     HlocR->add_element(iw1, iw2, tmp_r[i_alpha]);
+    //                 }
+    //             }
+    //         }
+    //     }
+    // }
+    ModuleBase::TITLE("ModuleIO", "set_rR_from_sR");
+}
+
+void ModuleIO::sum_HR(
+    const Parallel_Orbitals& pv,
+    const K_Vectors& kv,
+    const hamilt::HContainer<double>* hR,
+    hamilt::HContainer<std::complex<double>>* full_hR,
+#ifdef __EXX
+    const std::vector<std::map<int, std::map<std::pair<int, std::array<int, 3>>, RI::Tensor<std::complex<double>>>>>*
+        Hexxs
+#endif
+)
+{
+    ModuleBase::TITLE("ModuleIO", "sum_HR");
+    ModuleBase::timer::tick("ModuleIO", "sum_HR");
+
+    // init complex full_hR
+    init_from_hR(hR, full_hR);
+#ifdef __EXX
+    const bool use_cell_nearest = (ModuleBase::Vector3<double>(std::fmod(kv.get_koffset(0), 1.0),
+                                                               std::fmod(kv.get_koffset(1), 1.0),
+                                                               std::fmod(kv.get_koffset(2), 1.0))
+                                       .norm()
+                                   < 1e-10);
+    RI::Cell_Nearest<int, int, 3, double, 3> cell_nearest;
+    // reallocate full_hR for BvK used in EXX
+    if (GlobalC::exx_info.info_global.cal_exx)
+    {
+        const std::array<int, 3> Rs_period = {kv.nmp[0], kv.nmp[1], kv.nmp[2]};
+        if (use_cell_nearest)
+        {
+            // set cell_nearest
+            std::map<int, std::array<double, 3>> atoms_pos;
+            for (int iat = 0; iat < GlobalC::ucell.nat; ++iat)
+            {
+                atoms_pos[iat] = RI_Util::Vector3_to_array3(
+                    GlobalC::ucell.atoms[GlobalC::ucell.iat2it[iat]].tau[GlobalC::ucell.iat2ia[iat]]);
+            }
+            const std::array<std::array<double, 3>, 3> latvec = {RI_Util::Vector3_to_array3(GlobalC::ucell.a1),
+                                                                 RI_Util::Vector3_to_array3(GlobalC::ucell.a2),
+                                                                 RI_Util::Vector3_to_array3(GlobalC::ucell.a3)};
+            cell_nearest.init(atoms_pos, latvec, Rs_period);
+            hamilt::reallocate_hcontainer(GlobalC::ucell.nat, full_hR, Rs_period, &cell_nearest);
+        }
+        else
+            hamilt::reallocate_hcontainer(GlobalC::ucell.nat, full_hR, Rs_period);
+    }
+#endif
+    // add other hR
+    add_HR(hR, full_hR);
+    // add velocity complex hR
+    if (TD_Velocity::tddft_velocity)
+    {
+        if (TD_Velocity::td_vel_op == nullptr)
+        {
+            ModuleBase::WARNING_QUIT("ModuleIO::write_current", "velocity gauge infos is null!");
+        }
+        const hamilt::HContainer<std::complex<double>>* velocity_hR = TD_Velocity::td_vel_op->get_velocity_HR_pointer();
+        add_HR(velocity_hR, full_hR);
+    }
+#ifdef __EXX
+    // add HexxR to complex full_hR
+    if (GlobalC::exx_info.info_global.cal_exx)
+    {
+        const double coeff = (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Cam
+                              || GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Ccp)
+                                 ? 1.0
+                                 : GlobalC::exx_info.info_global.hybrid_alpha;
+        for (size_t is = 0; is != PARAM.inp.nspin; ++is)
+        {
+            if (use_cell_nearest)
+                RI_2D_Comm::add_HexxR(is, coeff, *Hexxs, pv, PARAM.globalv.npol, *full_hR, &cell_nearest);
+            else
+                RI_2D_Comm::add_HexxR(is, coeff, *Hexxs, pv, PARAM.globalv.npol, *full_hR, nullptr);
+        }
+    }
+#endif
+
+    ModuleBase::timer::tick("ModuleIO", "sum_HR");
+}
+
+template <typename Tadd, typename Tfull>
+void ModuleIO::add_HR(const hamilt::HContainer<Tadd>* hR, hamilt::HContainer<Tfull>* full_hR)
+{
+    ModuleBase::TITLE("ModuleIO", "add_HR");
+    ModuleBase::timer::tick("ModuleIO", "add_HR");
+
+    for (int ipair = 0; ipair < hR->size_atom_pairs(); ++ipair)
+    {
+        hamilt::AtomPair<Tadd> atom_ij = hR->get_atom_pair(ipair);
+        const int iat1 = atom_ij.get_atom_i();
+        const int iat2 = atom_ij.get_atom_j();
+        // loop R-index
+        for (int iR = 0; iR < atom_ij.get_R_size(); iR++)
+        {
+            const ModuleBase::Vector3<int> r_index = atom_ij.get_R_index(iR);
+            hamilt::BaseMatrix<Tfull>* full_HlocR = full_hR->find_matrix(iat1, iat2, r_index.x, r_index.y, r_index.z);
+            const hamilt::BaseMatrix<Tadd>* HlocR = hR->find_matrix(iat1, iat2, r_index.x, r_index.y, r_index.z);
+
+            if (full_HlocR == nullptr || HlocR == nullptr)
+                ModuleBase::WARNING_QUIT("ModuleIO::add_HR", "HR cannot be nullptr!");
+
+            for (int i = 0; i < atom_ij.get_row_size(); ++i)
+            {
+                for (int j = 0; j < atom_ij.get_col_size(); ++j)
+                {
+                    Tadd v = HlocR->get_value(i, j);
+                    full_HlocR->add_element(i, j, Tfull(v));
+                }
+            }
+        }
+    }
+
+    ModuleBase::timer::tick("ModuleIO", "add_HR");
+}
+
+// for molecule, if vacuum size is small, the number of R of Hs is smaller than SR
+// which may lead to some errors
+void ModuleIO::cal_velocity_basis_k(const LCAO_Orbitals& orb,
+                                    const Parallel_Orbitals* pv,
+                                    const K_Vectors& kv,
+                                    const ModuleBase::Vector3<hamilt::HContainer<double>*>& rR,
+                                    const hamilt::HContainer<double>& sR,
+                                    const hamilt::HContainer<std::complex<double>>& hR,
+                                    std::vector<ModuleBase::Vector3<std::complex<double>*>>& velocity_basis_k)
+{
+    ModuleBase::TITLE("ModuleIO", "cal_velocity_basis_k");
+    ModuleBase::timer::tick("ModuleIO", "cal_velocity_basis_k");
+
+    const double coeff = (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Cam
+                          || GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Ccp)
+                             ? 1.0
+                             : GlobalC::exx_info.info_global.hybrid_alpha;
+    const int nlocal = PARAM.globalv.nlocal;
+    const char N_char = 'N';
+    const std::complex<double> one_imag = ModuleBase::IMAG_UNIT;
+    const std::complex<double> neg_one_imag = ModuleBase::NEG_IMAG_UNIT;
+    const std::complex<double> one_real = ModuleBase::ONE;
+    const std::complex<double> neg_one_real = ModuleBase::NEG_ONE;
+    const std::complex<double> zero_complex = ModuleBase::ZERO;
+
+    std::complex<double>* hk = new std::complex<double>[pv->nloc];
+    std::complex<double>* sk = new std::complex<double>[pv->nloc];
+    std::complex<double>* partial_hk = new std::complex<double>[pv->nloc];
+    std::complex<double>* partial_sk = new std::complex<double>[pv->nloc];
+    std::complex<double>* rk = new std::complex<double>[pv->nloc];
+    std::complex<double>* h_is = new std::complex<double>[pv->nloc];
+    std::complex<double>* h_is_r = new std::complex<double>[pv->nloc];
+    std::complex<double>* r_is = new std::complex<double>[pv->nloc];
+    std::complex<double>* r_is_h = new std::complex<double>[pv->nloc];
+    std::complex<double>* h_is_ps = new std::complex<double>[pv->nloc];
+
+    // for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+    // {
+    //     for (int i = 0; i < hR.size_atom_pairs(); ++i)
+    //     {
+    //         hamilt::AtomPair<double>& tmp = rR[i_alpha]->get_atom_pair(i);
+    //         std::cout<<"cal_velocity_basis_k: "<<rR[i_alpha]->size_atom_pairs()<<" R_size:
+    //         "<<tmp.get_R_size()<<std::endl; for(int ir = 0;ir < tmp.get_R_size(); ++ir )
+    //         {
+    //             const ModuleBase::Vector3<int> r_index = tmp.get_R_index(ir);
+    //             std::cout<<"r_index: "<<r_index<<std::endl;
+    //         }
+    //     }
+    // }
+
+    for (size_t ik = 0; ik != kv.get_nks(); ++ik)
+    {
+        // set H(k), S(k)
+        // 1.1 set H(k)
+        ModuleBase::GlobalFunc::ZEROS(hk, pv->nloc);
+        const int nrow = pv->get_row_size();
+        if (elecstate::H_TDDFT_pw::stype == 2)
+            TD_Velocity::td_vel_op->folding_HR_td(hR, hk, kv.kvec_d[ik], nrow, 1);
+        else
+            hamilt::folding_HR(hR, hk, kv.kvec_d[ik], nrow, 1);
+        // 1.2 set S(k)
+        ModuleBase::GlobalFunc::ZEROS(sk, pv->nloc);
+        if (elecstate::H_TDDFT_pw::stype == 2)
+            TD_Velocity::td_vel_op->folding_HR_td(sR, sk, kv.kvec_d[ik], nrow, 1);
+        else
+            hamilt::folding_HR(sR, sk, kv.kvec_d[ik], nrow, 1);
+
+        // for (int ir = 0; ir < pv->nrow; ir++)
+        // {
+        //     const int iwt1 = pv->local2global_row(ir);
+        //     const int iat1 = GlobalC::ucell.iwt2iat[iwt1];
+        //     for (int ic = 0; ic < pv->ncol; ic++)
+        //     {
+        //         const int iwt2 = pv->local2global_col(ic);
+        //         const int iat2 = GlobalC::ucell.iwt2iat[iwt2];
+        //         const int irc = ic * pv->nrow + ir;
+        //         std::cout << "ik: " << ik << " iat1:" << iat1 << " iat2:" << iat2 << " iwt1: " << iwt1
+        //                   << " iwt2: " << iwt2 << " hk: " << hk[irc] << std::endl;
+        //     }
+        // }
+        // 2. set inverse S(k) -> sk will be changed to sk_inv
+        int* ipiv = new int[pv->nloc];
+        int info = 0;
+        // 2.1 compute ipiv
+        ScalapackConnector::getrf(nlocal, nlocal, sk, 1, 1, pv->desc, ipiv, &info);
+        int lwork = -1;
+        int liwotk = -1;
+        std::vector<std::complex<double>> work(1, 0);
+        std::vector<int> iwork(1, 0);
+        // 2.2 compute work
+        ScalapackConnector::getri(nlocal, sk, 1, 1, pv->desc, ipiv, work.data(), &lwork, iwork.data(), &liwotk, &info);
+        lwork = work[0].real();
+        work.resize(lwork, 0);
+        liwotk = iwork[0];
+        iwork.resize(liwotk, 0);
+        // 2.3 compute inverse matrix of Sk
+        ScalapackConnector::getri(nlocal,
+                                  sk, // return sk^-1
+                                  1,
+                                  1,
+                                  pv->desc,
+                                  ipiv,
+                                  work.data(),
+                                  &lwork,
+                                  iwork.data(),
+                                  &liwotk,
+                                  &info);
+        delete[] ipiv;
+        assert(0 == info);
+        for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+        {
+            // 3. set partial_H(k), partial_S(k) and r(k)
+            // 3.1 set partial_H(k)
+            ModuleBase::GlobalFunc::ZEROS(partial_hk, pv->nloc);
+            if (elecstate::H_TDDFT_pw::stype == 2)
+                TD_Velocity::td_vel_op->folding_partial_HR_td(hR, partial_hk, kv.kvec_d[ik], i_alpha, nrow, 1);
+            else
+                hamilt::folding_partial_HR(hR, partial_hk, kv.kvec_d[ik], GlobalC::ucell, i_alpha, nrow, 1);
+            // 3.2 set partial S(k)
+            ModuleBase::GlobalFunc::ZEROS(partial_sk, pv->nloc);
+            if (elecstate::H_TDDFT_pw::stype == 2)
+                TD_Velocity::td_vel_op->folding_partial_HR_td(sR, partial_sk, kv.kvec_d[ik], i_alpha, nrow, 1);
+            else
+                hamilt::folding_partial_HR(sR, partial_sk, kv.kvec_d[ik], GlobalC::ucell, i_alpha, nrow, 1);
+            //  if(i_alpha == 2)
+            // {
+            //     for(int ir=0;ir< pv->nrow; ir++)
+            //     {
+            //         const int iwt1 = pv->local2global_row(ir);
+            //         const int iat1 = GlobalC::ucell.iwt2iat[iwt1];
+            //         for(int ic=0;ic< pv->ncol; ic++)
+            //         {
+            //             const int iwt2 = pv->local2global_col(ic);
+            //             const int iat2 = GlobalC::ucell.iwt2iat[iwt2];
+            //             const int irc=ic*pv->nrow + ir;
+            //             std::cout<<"ik: "<<ik<<" i_alpha: "<<i_alpha<<" iat1:"<<iat1<<" iat2:"<<iat2<<" iwt1:
+            //             "<<iwt1<<" iwt2: "<<iwt2<<" partial_sk: "<<partial_sk[irc]<<std::endl;
+            //         }
+            //     }
+            // }
+            // 3.3 set r(k)
+            // std::cout << "set r(k): " << "i_alpha: " << i_alpha << std::endl;
+            ModuleBase::GlobalFunc::ZEROS(rk, pv->nloc);
+            // folding_rR(rR[i_alpha], partial_sk, rk, pv, kv.kvec_d[ik], nrow, 1);
+            if (elecstate::H_TDDFT_pw::stype == 2)
+                TD_Velocity::td_vel_op->folding_HR_td(*rR[i_alpha], rk, kv.kvec_d[ik], nrow, 1);
+            else
+                hamilt::folding_HR(*rR[i_alpha], rk, kv.kvec_d[ik], nrow, 1); // set r(k)
+            // if (i_alpha == 2)
+            // {
+            //     std::cout << "ik: " << ik << " i_alpha: " << i_alpha << std::endl;
+            //     for (int ir = 0; ir < pv->nrow; ir++)
+            //     {
+            //         const int iwt1 = pv->local2global_row(ir);
+            //         const int iat1 = GlobalC::ucell.iwt2iat[iwt1];
+            //         for (int ic = 0; ic < pv->ncol; ic++)
+            //         {
+            //             const int iwt2 = pv->local2global_col(ic);
+            //             const int iat2 = GlobalC::ucell.iwt2iat[iwt2];
+            //             const int irc = ic * pv->nrow + ir;
+            //             std::cout << " iat1: " << iat1 << " iat2: " << iat2 << " iw1: " <<
+            //             GlobalC::ucell.iwt2iw[iwt1]
+            //                       << " iw2: " << GlobalC::ucell.iwt2iw[iwt2] << " rk: " << rk[irc] << std::endl;
+            //         }
+            //     }
+            // }
+            // 4. calculate <\vu,k|v_a|\mu,k> = partial_Hk + IMAG_UNIT * (Hk * Sk_inv * rk) - IMAG_UNIT * (rk * Sk_inv *
+            // Hk) - Hk * Sk_inv * partial_Sk
+            // 4.1.1 Hk * Sk_inv (note 2.)
+            ModuleBase::GlobalFunc::ZEROS(h_is, pv->nloc);
+            ScalapackConnector::gemm(N_char,
+                                     N_char,
+                                     nlocal,
+                                     nlocal,
+                                     nlocal,
+                                     one_real,
+                                     hk,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     sk,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     zero_complex,
+                                     h_is,
+                                     1,
+                                     1,
+                                     pv->desc);
+            // 4.1.2 (Hk * Sk_inv) * rk
+            ModuleBase::GlobalFunc::ZEROS(h_is_r, pv->nloc);
+            ScalapackConnector::gemm(N_char,
+                                     N_char,
+                                     nlocal,
+                                     nlocal,
+                                     nlocal,
+                                     one_real,
+                                     h_is,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     rk,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     zero_complex,
+                                     h_is_r,
+                                     1,
+                                     1,
+                                     pv->desc);
+            // 4.2.1 rk * Sk_inv (note 2.)
+            ModuleBase::GlobalFunc::ZEROS(r_is, pv->nloc);
+            ScalapackConnector::gemm(N_char,
+                                     N_char,
+                                     nlocal,
+                                     nlocal,
+                                     nlocal,
+                                     one_real,
+                                     rk,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     sk,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     zero_complex,
+                                     r_is,
+                                     1,
+                                     1,
+                                     pv->desc);
+            // 4.2.2 (rk * Sk_inv) * Hk
+            ModuleBase::GlobalFunc::ZEROS(r_is_h, pv->nloc);
+            ScalapackConnector::gemm(N_char,
+                                     N_char,
+                                     nlocal,
+                                     nlocal,
+                                     nlocal,
+                                     one_real,
+                                     r_is,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     hk,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     zero_complex,
+                                     r_is_h,
+                                     1,
+                                     1,
+                                     pv->desc);
+            // 4.3.1 (Hk * Sk_inv) * partial_Sk
+            ModuleBase::GlobalFunc::ZEROS(h_is_ps, pv->nloc);
+            ScalapackConnector::gemm(N_char,
+                                     N_char,
+                                     nlocal,
+                                     nlocal,
+                                     nlocal,
+                                     one_real,
+                                     h_is,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     partial_sk,
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     zero_complex,
+                                     h_is_ps,
+                                     1,
+                                     1,
+                                     pv->desc);
+            // 4.4 h_is_r will be changed to partial_Hk + IMAG_UNIT * (Hk * Sk_inv * rk)
+            ScalapackConnector::geadd('N',
+                                      nlocal,
+                                      nlocal,
+                                      one_real,
+                                      partial_hk,
+                                      1,
+                                      1,
+                                      pv->desc,
+                                      one_imag,
+                                      h_is_r,
+                                      1,
+                                      1,
+                                      pv->desc);
+            // 4.5 r_is_h will be changed to h_is_r - IMAG_UNIT * (rk * Sk_inv * Hk)
+            ScalapackConnector::geadd('N',
+                                      nlocal,
+                                      nlocal,
+                                      one_real,
+                                      h_is_r,
+                                      1,
+                                      1,
+                                      pv->desc,
+                                      neg_one_imag,
+                                      r_is_h,
+                                      1,
+                                      1,
+                                      pv->desc);
+            // 4.6 h_is_ps will be changed to r_is_h - Hk * Sk_inv * partial_Sk
+            ScalapackConnector::geadd('N',
+                                      nlocal,
+                                      nlocal,
+                                      one_real,
+                                      r_is_h,
+                                      1,
+                                      1,
+                                      pv->desc,
+                                      neg_one_real,
+                                      h_is_ps,
+                                      1,
+                                      1,
+                                      pv->desc);
+            // 5. copy h_is_ps to velocity_basis_k[ik][i_alpha]
+            BlasConnector::copy(pv->nloc, h_is_ps, 1, velocity_basis_k[ik][i_alpha], 1);
+            // if(i_alpha == 2)
+            // {
+            //     for(int ir=0;ir< pv->nrow; ir++)
+            //     {
+            //         const int iwt1 = pv->local2global_row(ir);
+            //         const int iat1 = GlobalC::ucell.iwt2iat[iwt1];
+            //         for(int ic=0;ic< pv->ncol; ic++)
+            //         {
+            //             const int iwt2 = pv->local2global_col(ic);
+            //             const int iat2 = GlobalC::ucell.iwt2iat[iwt2];
+            //             const int irc=ic*pv->nrow + ir;
+            //             std::cout<<"ik: "<<ik<<" i_alpha: "<<i_alpha<<" iat1:"<<iat1<<" iat2:"<<iat2<<" v_basis_k:
+            //             "<<velocity_basis_k[ik][i_alpha][irc]<<std::endl;
+            //         }
+            //     }
+            // }
+        }
+    }
+
+    delete[] hk;
+    delete[] sk;
+    delete[] partial_hk;
+    delete[] partial_sk;
+    delete[] rk;
+    delete[] h_is;
+    delete[] h_is_r;
+    delete[] r_is;
+    delete[] r_is_h;
+    delete[] h_is_ps;
+    ModuleBase::timer::tick("ModuleIO", "cal_velocity_basis_k");
+}
+
+void ModuleIO::cal_velocity_matrix(const psi::Psi<std::complex<double>>* psi,
+                                   const Parallel_Orbitals* pv,
+                                   const K_Vectors& kv,
+                                   const std::vector<ModuleBase::Vector3<std::complex<double>*>>& velocity_basis_k,
+                                   std::vector<std::array<ModuleBase::ComplexMatrix, 3>>& velocity_k)
+{
+    ModuleBase::TITLE("ModuleIO", "cal_velocity_matrix");
+    ModuleBase::timer::tick("ModuleIO", "cal_velocity_matrix");
+
+    const char N_char = 'N';
+    const char C_char = 'C';
+    const std::complex<double> one_real = ModuleBase::ONE;
+    const std::complex<double> zero_complex = ModuleBase::ZERO;
+    const double zero_double = 0.0;
+    const int nlocal = PARAM.globalv.nlocal;
+    const int nbands = PARAM.inp.nbands;
+    std::complex<double>* vk_c = new std::complex<double>[pv->ncol_bands * pv->nrow_bands]; // local one
+    std::complex<double>* v_c = new std::complex<double>[pv->nloc_wfc];
+
+    for (int ik = 0; ik < kv.get_nks(); ik++)
+    {
+        // 1. set C
+        psi->fix_k(ik);
+        // 2. set <\Psi_{n,\mu}|v_{\mu,\nu}|\Psi_{m,\nu}> = C^\dagger_{n,\mu} * v_{\mu,\nu} * C_{\nu,m}
+        for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+        {
+            ModuleBase::GlobalFunc::ZEROS(vk_c, pv->ncol_bands * pv->nrow_bands);
+            ModuleBase::GlobalFunc::ZEROS(v_c, pv->nloc_wfc);
+            // v_c_{\mu,m} = v_{\mu,\nu} * C_{\nu,m}
+            ScalapackConnector::gemm(N_char,
+                                     N_char,
+                                     nlocal,
+                                     nbands,
+                                     nlocal,
+                                     one_real,
+                                     velocity_basis_k[ik][i_alpha],
+                                     1,
+                                     1,
+                                     pv->desc,
+                                     psi[0].get_pointer(),
+                                     1,
+                                     1,
+                                     pv->desc_wfc,
+                                     zero_complex,
+                                     v_c,
+                                     1,
+                                     1,
+                                     pv->desc_wfc);
+            // velocity_k_{n,m} = C^\dagger_{n,\mu} * v_c_{\mu,m}
+            ScalapackConnector::gemm(C_char,
+                                     N_char,
+                                     nbands,
+                                     nbands,
+                                     nlocal,
+                                     one_real,
+                                     psi[0].get_pointer(),
+                                     1,
+                                     1,
+                                     pv->desc_wfc,
+                                     v_c,
+                                     1,
+                                     1,
+                                     pv->desc_wfc,
+                                     zero_complex,
+                                     vk_c,
+                                     1,
+                                     1,
+                                     pv->desc_Eij);
+
+            for (int ir = 0; ir < PARAM.inp.nbands; ++ir)
+            {
+                // const int iwt1 = pv->local2global_row(ir);
+                // const int iat1 = GlobalC::ucell.iwt2iat[iwt1];
+                for (int ic = 0; ic < PARAM.inp.nbands; ++ic)
+                {
+                    const int irc = ic * pv->nrow + ir;
+                    if (pv->in_this_processor(ir, ic))
+                    {
+                        // const int iwt2 = pv->local2global_col(ic);
+                        // const int iat2 = GlobalC::ucell.iwt2iat[iwt2];
+                        velocity_k[ik][i_alpha](ir, ic) = vk_c[irc];
+                        // if (i_alpha == 0)
+                        // {
+                        //     std::cout<<"ik: "<<ik<<" i_alpha: "<<i_alpha<<" iat1:"<<iat1<<" iat2:"<<iat2<<" vk:
+                        //     "<<velocity_basis_k[ik][i_alpha][irc]<<std::endl;
+                        // }
+                    }
+                }
+            }
+        }
+    }
+
+    delete[] vk_c;
+    delete[] v_c;
+
+    ModuleBase::timer::tick("ModuleIO", "cal_velocity_matrix");
+}
+
+void ModuleIO::cal_current_comm_k(const LCAO_Orbitals& orb,
+                                  const Parallel_Orbitals* pv,
+                                  const K_Vectors& kv,
+                                  cal_r_overlap_R& r_calculator,
+                                  const hamilt::HContainer<double>& sR,
+                                  const hamilt::HContainer<std::complex<double>>& hR,
+                                  const psi::Psi<std::complex<double>>* psi,
+                                  const elecstate::ElecState* pelec,
+                                  std::vector<ModuleBase::Vector3<double>>& current_k)
+{
+    ModuleBase::TITLE("ModuleIO", "cal_current_exx");
+    ModuleBase::timer::tick("ModuleIO", "cal_current_exx");
+
+    const int nlocal = PARAM.globalv.nlocal;
+    const int nbands = PARAM.inp.nbands;
+    // init
+    ModuleBase::Vector3<hamilt::HContainer<double>*> rR;
+    std::vector<ModuleBase::Vector3<std::complex<double>*>> velocity_basis_k;
+    std::vector<std::array<ModuleBase::ComplexMatrix, 3>> velocity_k;
+    velocity_basis_k.resize(kv.get_nks());
+    velocity_k.resize(kv.get_nks());
+    for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+    {
+        rR[i_alpha] = new hamilt::HContainer<double>(pv);
+        for (int ik = 0; ik < kv.get_nks(); ik++)
+        {
+            velocity_basis_k[ik][i_alpha] = new std::complex<double>[pv->nloc];
+            ModuleBase::GlobalFunc::ZEROS(velocity_basis_k[ik][i_alpha], pv->nloc);
+            velocity_k[ik][i_alpha].create(nbands, nbands);
+        }
+    }
+    // set rR
+    set_rR_from_hR(orb, pv, r_calculator, &hR, rR);
+    // set velocity_basis_k
+    cal_velocity_basis_k(orb, pv, kv, rR, sR, hR, velocity_basis_k);
+    // set velocity_k
+    cal_velocity_matrix(psi, pv, kv, velocity_basis_k, velocity_k);
+
+    // sum n and m for current_k
+    for (size_t ik = 0; ik != kv.get_nks(); ++ik)
+        for (size_t i_alpha = 0; i_alpha != 3; ++i_alpha)
+        {
+            for (size_t ib = 0; ib != PARAM.inp.nbands; ++ib)
+                current_k[ik][i_alpha] -= pelec->wg(ik, ib) * velocity_k[ik][i_alpha](ib, ib).real() / 2.0; // for unit
+        }
+
+    for (size_t i_alpha = 0; i_alpha < 3; ++i_alpha)
+    {
+        delete rR[i_alpha];
+        for (int ik = 0; ik < kv.get_nks(); ik++)
+            delete[] velocity_basis_k[ik][i_alpha];
+    }
+
+    ModuleBase::TITLE("ModuleIO", "cal_current_exx");
+}
 void ModuleIO::cal_tmp_DM(elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
-                        elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
-                        int nspin)
+                          elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
+                          int nspin)
 {
     ModuleBase::TITLE("ModuleIO", "cal_tmp_DM");
     ModuleBase::timer::tick("ModuleIO", "cal_tmp_DM");
@@ -32,6 +960,136 @@ void ModuleIO::cal_tmp_DM(elecstate::DensityMatrix<std::complex<double>, double>
     }
     ModuleBase::timer::tick("ModuleIO", "cal_tmp_DM");
 }
+
+void ModuleIO::write_current(
+    const int istep,
+    const psi::Psi<std::complex<double>>* psi,
+    const elecstate::ElecState* pelec,
+    const K_Vectors& kv,
+    const Parallel_Orbitals* pv,
+    const LCAO_Orbitals& orb,
+    cal_r_overlap_R& r_calculator,
+    const hamilt::HContainer<double>* sR,
+    const hamilt::HContainer<double>* hR,
+#ifdef __EXX
+    const std::vector<std::map<int, std::map<std::pair<int, std::array<int, 3>>, RI::Tensor<std::complex<double>>>>>*
+        Hexxs
+#endif
+)
+{
+    ModuleBase::TITLE("ModuleIO", "write_current");
+    ModuleBase::timer::tick("ModuleIO", "write_current");
+    double omega = GlobalC::ucell.omega;
+
+    std::vector<ModuleBase::Vector3<double>> current_k;
+    hamilt::HContainer<std::complex<double>>* full_hR;
+    full_hR = new hamilt::HContainer<std::complex<double>>(pv);
+    current_k.resize(kv.get_nks());
+    ModuleBase::Vector3<double> current_total;
+
+    if (GlobalC::exx_info.info_global.cal_exx)
+        sum_HR(*pv, kv, hR, full_hR, Hexxs);
+    else
+        sum_HR(*pv, kv, hR, full_hR);
+    cal_current_comm_k(orb, pv, kv, r_calculator, *sR, *full_hR, psi, pelec, current_k);
+    for (int dir = 0; dir < 3; dir++)
+        for (int ik = 0; ik < kv.get_nks(); ik++)
+            current_total[dir] += current_k[ik][dir];
+    delete full_hR;
+
+    if (GlobalV::MY_RANK == 0)
+    {
+        std::string filename = PARAM.globalv.global_out_dir + "current_total_comm.dat";
+        std::ofstream fout;
+        fout.open(filename, std::ios::app);
+        fout << std::setprecision(16);
+        fout << std::scientific;
+        fout << istep << " " << current_total[0] / omega << " " << current_total[1] / omega << " "
+             << current_total[2] / omega << std::endl;
+        fout.close();
+    }
+
+    ModuleBase::timer::tick("ModuleIO", "write_current");
+}
+
+void ModuleIO::write_current_eachk(
+    const int istep,
+    const psi::Psi<std::complex<double>>* psi,
+    const elecstate::ElecState* pelec,
+    const K_Vectors& kv,
+    const Parallel_Orbitals* pv,
+    const LCAO_Orbitals& orb,
+    cal_r_overlap_R& r_calculator,
+    const hamilt::HContainer<double>* sR,
+    const hamilt::HContainer<double>* hR,
+#ifdef __EXX
+    const std::vector<std::map<int, std::map<std::pair<int, std::array<int, 3>>, RI::Tensor<std::complex<double>>>>>*
+        Hexxs
+#endif
+)
+{
+    ModuleBase::TITLE("ModuleIO", "write_current");
+    ModuleBase::timer::tick("ModuleIO", "write_current");
+    double omega = GlobalC::ucell.omega;
+
+    std::vector<ModuleBase::Vector3<double>> current_k;
+    hamilt::HContainer<std::complex<double>>* full_hR;
+    full_hR = new hamilt::HContainer<std::complex<double>>(pv);
+    current_k.resize(kv.get_nks());
+    if (GlobalC::exx_info.info_global.cal_exx)
+        sum_HR(*pv, kv, hR, full_hR, Hexxs);
+    else
+        sum_HR(*pv, kv, hR, full_hR);
+    sum_HR(*pv, kv, hR, full_hR);
+    cal_current_comm_k(orb, pv, kv, r_calculator, *sR, *full_hR, psi, pelec, current_k);
+    delete full_hR;
+
+    int nspin0 = 1;
+    if (PARAM.inp.nspin == 2)
+    {
+        nspin0 = 2;
+    }
+    for (int is = 0; is < nspin0; ++is)
+    {
+        for (int ik = 0; ik < kv.get_nks(); ik++)
+        {
+            if (is == kv.isk[ik])
+            {
+                if (GlobalV::MY_RANK == 0 && TD_Velocity::out_current_k)
+                {
+                    std::string filename = PARAM.globalv.global_out_dir + "current_spin" + std::to_string(is)
+                                           + "_ik_comm" + std::to_string(ik) + ".dat";
+                    std::ofstream fout;
+                    fout.open(filename, std::ios::app);
+                    fout << std::setprecision(16);
+                    fout << std::scientific;
+                    fout << istep << " " << current_k[ik][0] / omega << " " << current_k[ik][1] / omega << " "
+                         << current_k[ik][2] / omega << std::endl;
+                    fout.close();
+                }
+            }
+        }
+    }
+
+    ModuleBase::Vector3<double> current_total;
+    for (int dir = 0; dir < 3; dir++)
+        for (int ik = 0; ik < kv.get_nks(); ik++)
+            current_total[dir] += current_k[ik][dir];
+    if (GlobalV::MY_RANK == 0)
+    {
+        std::string filename = PARAM.globalv.global_out_dir + "current_total_comm.dat";
+        std::ofstream fout;
+        fout.open(filename, std::ios::app);
+        fout << std::setprecision(16);
+        fout << std::scientific;
+        fout << istep << " " << current_total[0] / omega << " " << current_total[1] / omega << " "
+             << current_total[2] / omega << std::endl;
+        fout.close();
+    }
+
+    ModuleBase::timer::tick("ModuleIO", "write_current");
+}
+
 void ModuleIO::write_current(const int istep,
                              const psi::Psi<std::complex<double>>* psi,
                              const elecstate::ElecState* pelec,
@@ -42,7 +1100,6 @@ void ModuleIO::write_current(const int istep,
                              const TD_current* cal_current,
                              Record_adj& ra)
 {
-
     ModuleBase::TITLE("ModuleIO", "write_current");
     ModuleBase::timer::tick("ModuleIO", "write_current");
     std::vector<hamilt::HContainer<std::complex<double>>*> current_term = {nullptr, nullptr, nullptr};
@@ -57,18 +1114,18 @@ void ModuleIO::write_current(const int istep,
     {
         if (TD_Velocity::td_vel_op == nullptr)
         {
-            ModuleBase::WARNING_QUIT("ModuleIO::write_current", "velocity gague infos is null!");
+            ModuleBase::WARNING_QUIT("ModuleIO::write_current", "velocity gauge infos is null!");
         }
         for (int dir = 0; dir < 3; dir++)
         {
             current_term[dir] = TD_Velocity::td_vel_op->get_current_term_pointer(dir);
         }
     }
-    double omega=GlobalC::ucell.omega;
+    double omega = GlobalC::ucell.omega;
     // construct a DensityMatrix object
     // Since the function cal_dm_psi do not suport DMR in complex type, I replace it with two DMR in double type. Should
     // be refactored in the future.
-    const int nspin_dm = std::map<int, int>({ {1,1},{2,2},{4,1} })[PARAM.inp.nspin];
+    const int nspin_dm = std::map<int, int>({{1, 1}, {2, 2}, {4, 1}})[PARAM.inp.nspin];
     elecstate::DensityMatrix<std::complex<double>, double> DM_real(pv, nspin_dm, kv.kvec_d, kv.get_nks() / nspin_dm);
     elecstate::DensityMatrix<std::complex<double>, double> DM_imag(pv, nspin_dm, kv.kvec_d, kv.get_nks() / nspin_dm);
     // calculate DMK
@@ -87,8 +1144,8 @@ void ModuleIO::write_current(const int istep,
     {
         double local_current[3] = {0.0, 0.0, 0.0};
 #else
-        // ModuleBase::matrix& local_soverlap = soverlap;
-        double* local_current = current_total;
+    // ModuleBase::matrix& local_soverlap = soverlap;
+    double* local_current = current_total;
 #endif
         ModuleBase::Vector3<double> tau1, dtau, tau2;
 
@@ -117,8 +1174,9 @@ void ModuleIO::write_current(const int istep,
                 double Rx = ra.info[iat][cb][0];
                 double Ry = ra.info[iat][cb][1];
                 double Rz = ra.info[iat][cb][2];
-                //std::cout<< "iat1: " << iat1 << " iat2: " << iat2 << " Rx: " << Rx << " Ry: " << Ry << " Rz:" << Rz << std::endl;
-                //  get BaseMatrix
+                // std::cout<< "iat1: " << iat1 << " iat2: " << iat2 << " Rx: " << Rx << " Ry: " << Ry << " Rz:" << Rz
+                // << std::endl;
+                //   get BaseMatrix
                 hamilt::BaseMatrix<double>* tmp_matrix_real
                     = DM_real.get_DMR_pointer(1)->find_matrix(iat1, iat2, Rx, Ry, Rz);
                 hamilt::BaseMatrix<double>* tmp_matrix_imag
@@ -157,7 +1215,7 @@ void ModuleIO::write_current(const int istep,
                         // std::cout<<"mu: "<< mu <<" nu: "<< nu << std::endl;
                         // std::cout<<"dm2d1_real: "<< dm2d1_real << " dm2d1_imag: "<< dm2d1_imag << std::endl;
                         // std::cout<<"rvz: "<< rvz.real() << " " << rvz.imag() << std::endl;
-                        local_current[0] -= dm2d1_real * rvx.real() - dm2d1_imag * rvx.imag();    
+                        local_current[0] -= dm2d1_real * rvx.real() - dm2d1_imag * rvx.imag();
                         local_current[1] -= dm2d1_real * rvy.real() - dm2d1_imag * rvy.imag();
                         local_current[2] -= dm2d1_real * rvz.real() - dm2d1_imag * rvz.imag();
                     } // end kk
@@ -183,7 +1241,8 @@ void ModuleIO::write_current(const int istep,
         fout.open(filename, std::ios::app);
         fout << std::setprecision(16);
         fout << std::scientific;
-        fout << istep << " " << current_total[0]/omega << " " << current_total[1]/omega << " " << current_total[2]/omega << std::endl;
+        fout << istep << " " << current_total[0] / omega << " " << current_total[1] / omega << " "
+             << current_total[2] / omega << std::endl;
         fout.close();
     }
 
@@ -191,11 +1250,11 @@ void ModuleIO::write_current(const int istep,
     return;
 }
 void ModuleIO::cal_tmp_DM_k(elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
-                          elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
-                          const int ik,
-                          const int nspin,
-                          const int is,
-                          const bool reset)
+                            elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
+                            const int ik,
+                            const int nspin,
+                            const int is,
+                            const bool reset)
 {
     ModuleBase::TITLE("ModuleIO", "cal_tmp_DM_k");
     ModuleBase::timer::tick("ModuleIO", "cal_tmp_DM_k");
@@ -206,7 +1265,7 @@ void ModuleIO::cal_tmp_DM_k(elecstate::DensityMatrix<std::complex<double>, doubl
 
     hamilt::HContainer<double>* tmp_DMR_real = DM_real.get_DMR_vector()[is - 1];
     hamilt::HContainer<double>* tmp_DMR_imag = DM_imag.get_DMR_vector()[is - 1];
-    if(reset)
+    if (reset)
     {
         tmp_DMR_real->set_zero();
         tmp_DMR_imag->set_zero();
@@ -239,22 +1298,24 @@ void ModuleIO::cal_tmp_DM_k(elecstate::DensityMatrix<std::complex<double>, doubl
             if (PARAM.inp.nspin != 4)
             {
                 double arg_td = 0.0;
-                if(elecstate::H_TDDFT_pw::stype == 2)
+                if (elecstate::H_TDDFT_pw::stype == 2)
                 {
-                    //new
-                    //cal tddft phase for mixing gague
+                    // new
+                    // cal tddft phase for mixing gague
                     const int iat1 = tmp_ap_real.get_atom_i();
                     const int iat2 = tmp_ap_real.get_atom_j();
-                    ModuleBase::Vector3<double> dtau = TD_Velocity::td_vel_op->get_ucell()->cal_dtau(iat1, iat2, r_index);
+                    ModuleBase::Vector3<double> dtau
+                        = TD_Velocity::td_vel_op->get_ucell()->cal_dtau(iat1, iat2, r_index);
                     double& tmp_lat0 = TD_Velocity::td_vel_op->get_ucell()->lat0;
                     arg_td = TD_Velocity::td_vel_op->cart_At * dtau * tmp_lat0;
 
                     /*std::cout << "arg_td " << arg_td << std::endl;
-                    std::cout << "cart_At " << TD_Velocity::td_vel_op->cart_At[0] << " "<< TD_Velocity::td_vel_op->cart_At[1] << " " << TD_Velocity::td_vel_op->cart_At[2] << std::endl;
+                    std::cout << "cart_At " << TD_Velocity::td_vel_op->cart_At[0] << " "<<
+                    TD_Velocity::td_vel_op->cart_At[1] << " " << TD_Velocity::td_vel_op->cart_At[2] << std::endl;
                     std::cout << "dtau " << dtau[0] << " "<< dtau[1] << " " << dtau[2] << std::endl;
                     std::cout << "ucell->lat0 " << tmp_lat0 << std::endl;
                     std::cout << "iat1 " << iat1 << " " << "iat2 " << iat2 << std::endl;*/
-                    //new
+                    // new
                 }
                 // cal k_phase
                 // if TK==std::complex<double>, kphase is e^{ikR}
@@ -313,16 +1374,15 @@ void ModuleIO::cal_tmp_DM_k(elecstate::DensityMatrix<std::complex<double>, doubl
 }
 
 void ModuleIO::write_current_eachk(const int istep,
-                             const psi::Psi<std::complex<double>>* psi,
-                             const elecstate::ElecState* pelec,
-                             const K_Vectors& kv,
-                             const TwoCenterIntegrator* intor,
-                             const Parallel_Orbitals* pv,
-                             const LCAO_Orbitals& orb,
-                             const TD_current* cal_current,
-                             Record_adj& ra)
+                                   const psi::Psi<std::complex<double>>* psi,
+                                   const elecstate::ElecState* pelec,
+                                   const K_Vectors& kv,
+                                   const TwoCenterIntegrator* intor,
+                                   const Parallel_Orbitals* pv,
+                                   const LCAO_Orbitals& orb,
+                                   const TD_current* cal_current,
+                                   Record_adj& ra)
 {
-
     ModuleBase::TITLE("ModuleIO", "write_current");
     ModuleBase::timer::tick("ModuleIO", "write_current");
     std::vector<hamilt::HContainer<std::complex<double>>*> current_term = {nullptr, nullptr, nullptr};
@@ -344,11 +1404,11 @@ void ModuleIO::write_current_eachk(const int istep,
             current_term[dir] = TD_Velocity::td_vel_op->get_current_term_pointer(dir);
         }
     }
-    double omega=GlobalC::ucell.omega;
+    double omega = GlobalC::ucell.omega;
     // construct a DensityMatrix object
     // Since the function cal_dm_psi do not suport DMR in complex type, I replace it with two DMR in double type. Should
     // be refactored in the future.
-    const int nspin_dm = std::map<int, int>({ {1,1},{2,2},{4,1} })[PARAM.inp.nspin];
+    const int nspin_dm = std::map<int, int>({{1, 1}, {2, 2}, {4, 1}})[PARAM.inp.nspin];
     elecstate::DensityMatrix<std::complex<double>, double> DM_real(pv, nspin_dm, kv.kvec_d, kv.get_nks() / nspin_dm);
     elecstate::DensityMatrix<std::complex<double>, double> DM_imag(pv, nspin_dm, kv.kvec_d, kv.get_nks() / nspin_dm);
     // calculate DMK
@@ -408,8 +1468,9 @@ void ModuleIO::write_current_eachk(const int istep,
                         double Rx = ra.info[iat][cb][0];
                         double Ry = ra.info[iat][cb][1];
                         double Rz = ra.info[iat][cb][2];
-                        //std::cout<< "iat1: " << iat1 << " iat2: " << iat2 << " Rx: " << Rx << " Ry: " << Ry << " Rz:" << Rz << std::endl;
-                        //  get BaseMatrix
+                        // std::cout<< "iat1: " << iat1 << " iat2: " << iat2 << " Rx: " << Rx << " Ry: " << Ry << " Rz:"
+                        // << Rz << std::endl;
+                        //   get BaseMatrix
                         hamilt::BaseMatrix<double>* tmp_matrix_real
                             = DM_real.get_DMR_pointer(is)->find_matrix(iat1, iat2, Rx, Ry, Rz);
                         hamilt::BaseMatrix<double>* tmp_matrix_imag
@@ -448,7 +1509,7 @@ void ModuleIO::write_current_eachk(const int istep,
                                 // std::cout<<"mu: "<< mu <<" nu: "<< nu << std::endl;
                                 // std::cout<<"dm2d1_real: "<< dm2d1_real << " dm2d1_imag: "<< dm2d1_imag << std::endl;
                                 // std::cout<<"rvz: "<< rvz.real() << " " << rvz.imag() << std::endl;
-                                local_current_ik[0] -= dm2d1_real * rvx.real() - dm2d1_imag * rvx.imag();    
+                                local_current_ik[0] -= dm2d1_real * rvx.real() - dm2d1_imag * rvx.imag();
                                 local_current_ik[1] -= dm2d1_real * rvy.real() - dm2d1_imag * rvy.imag();
                                 local_current_ik[2] -= dm2d1_real * rvz.real() - dm2d1_imag * rvz.imag();
                             } // end kk
@@ -479,7 +1540,8 @@ void ModuleIO::write_current_eachk(const int istep,
                 fout.open(filename, std::ios::app);
                 fout << std::setprecision(16);
                 fout << std::scientific;
-                fout << istep << " " << current_ik[0]/omega << " " << current_ik[1]/omega << " " << current_ik[2]/omega << std::endl;
+                fout << istep << " " << current_ik[0] / omega << " " << current_ik[1] / omega << " "
+                     << current_ik[2] / omega << std::endl;
                 fout.close();
             }
             // write end
@@ -492,7 +1554,8 @@ void ModuleIO::write_current_eachk(const int istep,
         fout.open(filename, std::ios::app);
         fout << std::setprecision(16);
         fout << std::scientific;
-        fout << istep << " " << current_total[0]/omega << " " << current_total[1]/omega << " " << current_total[2]/omega << std::endl;
+        fout << istep << " " << current_total[0] / omega << " " << current_total[1] / omega << " "
+             << current_total[2] / omega << std::endl;
         fout.close();
     }
 
diff --git a/source/module_io/td_current_io.h b/source/module_io/td_current_io.h
index 0c0cba3d8d..f65b2896c3 100644
--- a/source/module_io/td_current_io.h
+++ b/source/module_io/td_current_io.h
@@ -1,48 +1,143 @@
 #ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_IO_TD_CURRENT_IO_H
 #define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_IO_TD_CURRENT_IO_H
 
+#include "cal_r_overlap_R.h"
 #include "module_basis/module_nao/two_center_bundle.h"
 #include "module_elecstate/elecstate_lcao.h"
 #include "module_elecstate/module_dm/density_matrix.h"
-#include "module_psi/psi.h"
 #include "module_hamilt_lcao/module_tddft/td_current.h"
+#include "module_psi/psi.h"
 
 namespace ModuleIO
 {
 #ifdef __LCAO
 /// @brief func to output current, only used in tddft
 void write_current_eachk(const int istep,
-                        const psi::Psi<std::complex<double>>* psi,
-                        const elecstate::ElecState* pelec,
-                        const K_Vectors& kv,
-                        const TwoCenterIntegrator* intor,
-                        const Parallel_Orbitals* pv,
-                        const LCAO_Orbitals& orb,
-                        const TD_current* cal_current,
-                        Record_adj& ra);
+                         const psi::Psi<std::complex<double>>* psi,
+                         const elecstate::ElecState* pelec,
+                         const K_Vectors& kv,
+                         const TwoCenterIntegrator* intor,
+                         const Parallel_Orbitals* pv,
+                         const LCAO_Orbitals& orb,
+                         const TD_current* cal_current,
+                         Record_adj& ra);
 
 void write_current(const int istep,
-                const psi::Psi<std::complex<double>>* psi,
-                const elecstate::ElecState* pelec,
-                const K_Vectors& kv,
-                const TwoCenterIntegrator* intor,
-                const Parallel_Orbitals* pv,
-                const LCAO_Orbitals& orb,
-                const TD_current* cal_current,
-                Record_adj& ra);
+                   const psi::Psi<std::complex<double>>* psi,
+                   const elecstate::ElecState* pelec,
+                   const K_Vectors& kv,
+                   const TwoCenterIntegrator* intor,
+                   const Parallel_Orbitals* pv,
+                   const LCAO_Orbitals& orb,
+                   const TD_current* cal_current,
+                   Record_adj& ra);
+
+/// @brief func to output current calculated using i[r,H] directly
+void write_current_eachk(
+    const int istep,
+    const psi::Psi<std::complex<double>>* psi,
+    const elecstate::ElecState* pelec,
+    const K_Vectors& kv,
+    const Parallel_Orbitals* pv,
+    const LCAO_Orbitals& orb,
+    cal_r_overlap_R& r_calculator,
+    const hamilt::HContainer<double>* sR,
+    const hamilt::HContainer<double>* hR,
+#ifdef __EXX
+    const std::vector<std::map<int, std::map<std::pair<int, std::array<int, 3>>, RI::Tensor<std::complex<double>>>>>*
+        Hexxs
+    = nullptr
+#endif
+);
+
+void write_current(
+    const int istep,
+    const psi::Psi<std::complex<double>>* psi,
+    const elecstate::ElecState* pelec,
+    const K_Vectors& kv,
+    const Parallel_Orbitals* pv,
+    const LCAO_Orbitals& orb,
+    cal_r_overlap_R& r_calculator,
+    const hamilt::HContainer<double>* sR,
+    const hamilt::HContainer<double>* hR,
+#ifdef __EXX
+    const std::vector<std::map<int, std::map<std::pair<int, std::array<int, 3>>, RI::Tensor<std::complex<double>>>>>*
+        Hexxs
+    = nullptr
+#endif
+);
 
 /// @brief calculate sum_n[𝜌_(𝑛𝑘,𝜇𝜈)] for current calculation
 void cal_tmp_DM_k(elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
-                elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
-                const int ik,
-                const int nspin,
-                const int is,
-                const bool reset = true);
+                  elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
+                  const int ik,
+                  const int nspin,
+                  const int is,
+                  const bool reset = true);
 
 void cal_tmp_DM(elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
                 elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
                 const int nspin);
 
+void set_rR_from_hR(const LCAO_Orbitals& orb,
+                    const Parallel_Orbitals* pv,
+                    cal_r_overlap_R& r_calculator,
+                    const hamilt::HContainer<std::complex<double>>* hR,
+                    ModuleBase::Vector3<hamilt::HContainer<double>*>& rR);
+
+void sum_HR(
+    const Parallel_Orbitals& pv,
+    const K_Vectors& kv,
+    const hamilt::HContainer<double>* hR,
+    hamilt::HContainer<std::complex<double>>* full_hR,
+#ifdef __EXX
+    const std::vector<std::map<int, std::map<std::pair<int, std::array<int, 3>>, RI::Tensor<std::complex<double>>>>>*
+        Hexxs
+    = nullptr
+#endif
+);
+
+// void folding_rR(const hamilt::HContainer<double>* rR,
+//                 const std::complex<double>* partial_sk,
+//                 std::complex<double>* rk,
+//                 const Parallel_Orbitals* pv,
+//                 const ModuleBase::Vector3<double>& kvec_d_in,
+//                 const int ncol,
+//                 const int hk_type);
+template <typename Tadd, typename Tfull>
+void add_HR(const hamilt::HContainer<Tadd>* hR, hamilt::HContainer<Tfull>* full_hR);
+
+void init_from_adj(const LCAO_Orbitals& orb,
+                   const Parallel_Orbitals* pv,
+                   std::vector<AdjacentAtomInfo>& adjs_all,
+                   ModuleBase::Vector3<hamilt::HContainer<double>*>& rR);
+template <typename TR, typename TA>
+void init_from_hR(const hamilt::HContainer<TR>* hR, hamilt::HContainer<TA>* aimR);
+
+void cal_velocity_basis_k(const LCAO_Orbitals& orb,
+                          const Parallel_Orbitals* pv,
+                          const K_Vectors& kv,
+                          const ModuleBase::Vector3<hamilt::HContainer<double>*>& rR,
+                          const hamilt::HContainer<double>& sR,
+                          const hamilt::HContainer<std::complex<double>>& hR,
+                          std::vector<ModuleBase::Vector3<std::complex<double>*>>& velocity_basis_k);
+
+void cal_velocity_matrix(const psi::Psi<std::complex<double>>* psi,
+                         const Parallel_Orbitals* pv,
+                         const K_Vectors& kv,
+                         const std::vector<ModuleBase::Vector3<std::complex<double>*>>& velocity_basis_k,
+                         std::vector<std::array<ModuleBase::ComplexMatrix, 3>>& velocity_k);
+
+void cal_current_comm_k(const LCAO_Orbitals& orb,
+                        const Parallel_Orbitals* pv,
+                        const K_Vectors& kv,
+                        cal_r_overlap_R& r_calculator,
+                        const hamilt::HContainer<double>& sR,
+                        const hamilt::HContainer<std::complex<double>>& hR,
+                        const psi::Psi<std::complex<double>>* psi,
+                        const elecstate::ElecState* pelec,
+                        std::vector<ModuleBase::Vector3<double>>& current_k);
+
 #endif // __LCAO
 } // namespace ModuleIO
 #endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_IO_TD_CURRENT_IO_H
diff --git a/source/module_io/test/for_testing_input_conv.h b/source/module_io/test/for_testing_input_conv.h
index 86d6e4c55b..e85f9f5a5d 100644
--- a/source/module_io/test/for_testing_input_conv.h
+++ b/source/module_io/test/for_testing_input_conv.h
@@ -36,6 +36,7 @@ bool module_tddft::Evolve_elec::out_dipole;
 bool module_tddft::Evolve_elec::out_efield;
 bool TD_Velocity::out_current;
 bool TD_Velocity::out_current_k;
+bool TD_Velocity::out_current_comm;
 bool TD_Velocity::out_vecpot;
 bool TD_Velocity::init_vecpot_file;
 double module_tddft::Evolve_elec::td_print_eij;
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index 007851e62e..dabf5a444c 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -359,6 +359,7 @@ struct Input_para
     bool out_efield = false;              ///< output the efield or not
     bool out_current = false;             ///< output the current or not
     bool out_current_k = false;           ///< output tddft current for all k points
+    bool out_current_comm = false;           ///< output the current using complete basis or not
     bool out_vecpot = false;              ///< output the vector potential or not
     bool restart_save = false;            ///< restart //Peize Lin add 2020-04-04
     bool rpa = false;                     ///< rpa calculation
diff --git a/source/module_ri/Exx_LRI_interface.hpp b/source/module_ri/Exx_LRI_interface.hpp
index 57b85c206f..af5bc56eb3 100644
--- a/source/module_ri/Exx_LRI_interface.hpp
+++ b/source/module_ri/Exx_LRI_interface.hpp
@@ -58,10 +58,12 @@ void Exx_LRI_Interface<T, Tdata>::exx_beforescf(const int istep, const K_Vectors
 #ifdef __MPI
     if (GlobalC::exx_info.info_global.cal_exx)
     {
-        if (GlobalC::restart.info_load.load_H_finish && !GlobalC::restart.info_load.restart_exx) { XC_Functional::set_xc_type(GlobalC::ucell.atoms[0].ncpp.xc_func);
-        } 
-        else if (istep > 0) { XC_Functional::set_xc_type(GlobalC::ucell.atoms[0].ncpp.xc_func);
-        } 
+        if ((GlobalC::restart.info_load.load_H_finish && !GlobalC::restart.info_load.restart_exx)
+            || (istep > 0)
+            || (PARAM.inp.init_wfc == "file"))
+        {
+            XC_Functional::set_xc_type(ucell.atoms[0].ncpp.xc_func);
+        }
         else
         {
             if (ucell.atoms[0].ncpp.xc_func == "HF" || ucell.atoms[0].ncpp.xc_func == "PBE0" || ucell.atoms[0].ncpp.xc_func == "HSE" ||
@@ -111,7 +113,8 @@ void Exx_LRI_Interface<T, Tdata>::exx_eachiterinit(const int istep, const elecst
 {
     if (GlobalC::exx_info.info_global.cal_exx)
     {
-        if (!GlobalC::exx_info.info_global.separate_loop && (this->two_level_step || istep > 0))
+        if (!GlobalC::exx_info.info_global.separate_loop && (this->two_level_step || istep > 0 || PARAM.inp.init_wfc == "file") // non separate loop case
+             || (GlobalC::exx_info.info_global.separate_loop && PARAM.inp.init_wfc == "file" && this->two_level_step == 0 && iter == 1))  // the first iter in separate loop case
         {
             const bool flag_restart = (iter == 1) ? true : false;
             auto cal = [this, &kv, &flag_restart](const elecstate::DensityMatrix<T, double>& dm_in)
@@ -285,7 +288,8 @@ bool Exx_LRI_Interface<T, Tdata>::exx_after_converge(
                 std::cout << " Updating EXX " << std::flush;
                 timeval t_start;       gettimeofday(&t_start, nullptr);
 
-                const bool flag_restart = (this->two_level_step == 0) ? true : false;
+                // if init_wfc == "file", DM is calculated in the 1st iter of the 1st two-level step, so we mix it here
+                 const bool flag_restart = (this->two_level_step == 0 && PARAM.inp.init_wfc != "file") ? true : false;
 
                 if (this->exx_spacegroup_symmetry)
                     {this->mix_DMk_2D.mix(symrot_.restore_dm(kv, dm.get_DMK_vector(), *dm.get_paraV_pointer()), flag_restart);}
diff --git a/source/module_ri/LRI_CV.h b/source/module_ri/LRI_CV.h
index d8c7de95b1..e3f4cd1827 100644
--- a/source/module_ri/LRI_CV.h
+++ b/source/module_ri/LRI_CV.h
@@ -63,12 +63,12 @@ class LRI_CV
     }
 
   private:
-    std::vector<double> orb_cutoff_;
     std::vector<std::vector<std::vector<Numerical_Orbital_Lm>>> lcaos;
     std::vector<std::vector<std::vector<Numerical_Orbital_Lm>>> abfs;
     std::vector<std::vector<std::vector<Numerical_Orbital_Lm>>> abfs_ccp;
     ModuleBase::Element_Basis_Index::IndexLNM index_lcaos;
     ModuleBase::Element_Basis_Index::IndexLNM index_abfs;
+    std::vector<double> lcaos_rcut;
     std::vector<double> abfs_ccp_rcut;
 
   public:
diff --git a/source/module_ri/LRI_CV.hpp b/source/module_ri/LRI_CV.hpp
index 12307f80e9..021a19b090 100644
--- a/source/module_ri/LRI_CV.hpp
+++ b/source/module_ri/LRI_CV.hpp
@@ -48,11 +48,11 @@ void LRI_CV<Tdata>::set_orbitals(
     ModuleBase::TITLE("LRI_CV", "set_orbitals");
     ModuleBase::timer::tick("LRI_CV", "set_orbitals");
 
-    this->orb_cutoff_ = orb.cutoffs();
     this->lcaos = lcaos_in;
     this->abfs = abfs_in;
     this->abfs_ccp = abfs_ccp_in;
 
+    this->lcaos_rcut = Exx_Abfs::Construct_Orbs::get_Rcut(this->lcaos);
     this->abfs_ccp_rcut = Exx_Abfs::Construct_Orbs::get_Rcut(this->abfs_ccp);
     const double lcaos_rmax = Exx_Abfs::Construct_Orbs::get_Rmax(this->lcaos);
     const double abfs_ccp_rmax
@@ -97,13 +97,13 @@ void LRI_CV<Tdata>::set_orbitals(
 
 template <typename Tdata>
 double LRI_CV<Tdata>::cal_V_Rcut(const int it0, const int it1) {
-    return this->abfs_ccp_rcut[it0] + this->orb_cutoff_[it1];
+    return this->abfs_ccp_rcut[it0] + this->lcaos_rcut[it1];
 }
 
 template <typename Tdata>
 double LRI_CV<Tdata>::cal_C_Rcut(const int it0, const int it1) {
-    return std::min(this->abfs_ccp_rcut[it0], this->orb_cutoff_[it0])
-           + this->orb_cutoff_[it1];
+    return std::min(this->abfs_ccp_rcut[it0], this->lcaos_rcut[it0])
+           + this->lcaos_rcut[it1];
 }
 
 template <typename Tdata>
diff --git a/source/module_ri/RI_2D_Comm.h b/source/module_ri/RI_2D_Comm.h
index 1b41cb8c78..9bc7e15a6f 100644
--- a/source/module_ri/RI_2D_Comm.h
+++ b/source/module_ri/RI_2D_Comm.h
@@ -48,6 +48,15 @@ extern void add_Hexx(const K_Vectors& kv,
                      const Parallel_Orbitals& pv,
                      TK* hk);
 
+template <typename Tdata, typename TK>
+extern void  add_Hexx_td(const K_Vectors& kv,
+                          const int ik,
+                          const double alpha,
+                          const std::vector<std::map<TA, std::map<TAC, RI::Tensor<Tdata>>>>& Hs,
+                          const Parallel_Orbitals& pv,
+                          const ModuleBase::Vector3<double>& At,
+                          TK* hk);
+
 template <typename Tdata, typename TR>
 extern void add_HexxR(const int current_spin,
                       const double alpha,
@@ -78,9 +87,9 @@ extern inline int get_iwt(const int iat, const int iw_b, const int is_b);
 
 template <typename TA, typename TAC, typename T>
 extern std::map<TA, std::map<TAC, T>> comm_map2_first(const MPI_Comm& mpi_comm,
-                                                             const std::map<TA, std::map<TAC, T>>& Ds_in,
-                                                             const std::set<TA>& s0,
-                                                             const std::set<TA>& s1);
+                                                      const std::map<TA, std::map<TAC, T>>& Ds_in,
+                                                      const std::set<TA>& s0,
+                                                      const std::set<TA>& s1);
 template <typename TA, typename TAC, typename T, typename Tjudge>
 extern std::map<TA, std::map<TAC, T>> comm_map2(const MPI_Comm& mpi_comm,
                                                 const std::map<TA, std::map<TAC, T>>& Ds_in,
diff --git a/source/module_ri/RI_2D_Comm.hpp b/source/module_ri/RI_2D_Comm.hpp
index f9eef46505..e0287eaa33 100644
--- a/source/module_ri/RI_2D_Comm.hpp
+++ b/source/module_ri/RI_2D_Comm.hpp
@@ -6,25 +6,24 @@
 #ifndef RI_2D_COMM_HPP
 #define RI_2D_COMM_HPP
 
+#include "RI_2D_Comm.h"
+#include "RI_Util.h"
+#include "module_base/timer.h"
+#include "module_base/tool_title.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "module_parameter/parameter.h"
+
 #include <Comm/Comm_Assemble/Comm_Assemble.h>
 #include <Comm/example/Communicate_Map-1.h>
 #include <Comm/example/Communicate_Map-2.h>
 #include <RI/comm/example/Communicate_Map_Period.h>
 #include <RI/comm/mix/Communicate_Tensors_Map.h>
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
-#include "module_parameter/parameter.h"
 #include <RI/global/Global_Func-2.h>
-
 #include <cmath>
 #include <stdexcept>
 #include <string>
 
-#include "RI_2D_Comm.h"
-#include "RI_Util.h"
-#include "module_base/timer.h"
-#include "module_base/tool_title.h"
-#include "module_hamilt_pw/hamilt_pwdft/global.h"
-
 // inline RI::Tensor<double> tensor_conj(const RI::Tensor<double>& t) { return t; }
 // inline RI::Tensor<std::complex<double>> tensor_conj(const RI::Tensor<std::complex<double>>& t)
 // {
@@ -44,227 +43,316 @@ template <typename Tdata, typename Tmatrix>
 auto RI_2D_Comm::split_m2D_ktoR(const K_Vectors& kv,
                                 const std::vector<const Tmatrix*>& mks_2D,
                                 const Parallel_2D& pv,
-                                const int nspin, 
-                                const bool spgsym)
-    -> std::vector<std::map<TA, std::map<TAC, RI::Tensor<Tdata>>>>
+                                const int nspin,
+                                const bool spgsym) -> std::vector<std::map<TA, std::map<TAC, RI::Tensor<Tdata>>>>
 {
     ModuleBase::TITLE("RI_2D_Comm", "split_m2D_ktoR");
     ModuleBase::timer::tick("RI_2D_Comm", "split_m2D_ktoR");
 
-	const TC period = RI_Util::get_Born_vonKarmen_period(kv);
-	const std::map<int,int> nspin_k = {{1,1}, {2,2}, {4,1}};
-    const double SPIN_multiple = std::map<int, double>{ {1,0.5}, {2,1}, {4,1} }.at(nspin);							// why?
+    const TC period = RI_Util::get_Born_vonKarmen_period(kv);
+    const std::map<int, int> nspin_k = {{1, 1}, {2, 2}, {4, 1}};
+    const double SPIN_multiple = std::map<int, double>{{1, 0.5}, {2, 1}, {4, 1}}.at(nspin); // why?
 
     std::vector<std::map<TA, std::map<TAC, RI::Tensor<Tdata>>>> mRs_a2D(nspin);
     for (int is_k = 0; is_k < nspin_k.at(nspin); ++is_k)
-	{
-		const std::vector<int> ik_list = RI_2D_Comm::get_ik_list(kv, is_k);
-		for(const TC &cell : RI_Util::get_Born_von_Karmen_cells(period))
-		{
+    {
+        const std::vector<int> ik_list = RI_2D_Comm::get_ik_list(kv, is_k);
+        for (const TC& cell: RI_Util::get_Born_von_Karmen_cells(period))
+        {
             RI::Tensor<Tdata> mR_2D;
             int ik_full = 0;
-            for (const int ik : ik_list)
+            for (const int ik: ik_list)
             {
                 auto set_mR_2D = [&mR_2D](auto&& mk_frac) {
-                    if (mR_2D.empty()) {
+                    if (mR_2D.empty())
+                    {
                         mR_2D = RI::Global_Func::convert<Tdata>(mk_frac);
-                    } else {
-                        mR_2D
-                            = mR_2D + RI::Global_Func::convert<Tdata>(mk_frac);
+                    }
+                    else
+                    {
+                        mR_2D = mR_2D + RI::Global_Func::convert<Tdata>(mk_frac);
                     }
                 };
                 using Tdata_m = typename Tmatrix::value_type;
                 if (!spgsym)
                 {
-                    RI::Tensor<Tdata_m> mk_2D = RI_Util::Vector_to_Tensor<Tdata_m>(*mks_2D[ik], pv.get_col_size(), pv.get_row_size());
-                    const Tdata_m frac = SPIN_multiple
-                        * RI::Global_Func::convert<Tdata_m>(std::exp(
-                            -ModuleBase::TWO_PI * ModuleBase::IMAG_UNIT * (kv.kvec_c[ik] * (RI_Util::array3_to_Vector3(cell) * GlobalC::ucell.latvec))));
+                    RI::Tensor<Tdata_m> mk_2D
+                        = RI_Util::Vector_to_Tensor<Tdata_m>(*mks_2D[ik], pv.get_col_size(), pv.get_row_size());
+                    const Tdata_m frac
+                        = SPIN_multiple
+                          * RI::Global_Func::convert<Tdata_m>(
+                              std::exp(-ModuleBase::TWO_PI * ModuleBase::IMAG_UNIT
+                                       * (kv.kvec_c[ik] * (RI_Util::array3_to_Vector3(cell) * GlobalC::ucell.latvec))));
                     if (static_cast<int>(std::round(SPIN_multiple * kv.wk[ik] * kv.get_nkstot_full())) == 2)
-                        { set_mR_2D(tensor_real(mk_2D * frac)); }
-                    else { set_mR_2D(mk_2D * frac); }
+                    {
+                        set_mR_2D(tensor_real(mk_2D * frac));
+                    }
+                    else
+                    {
+                        set_mR_2D(mk_2D * frac);
+                    }
                 }
                 else
                 { // traverse kstar, ik means ik_ibz
-                    for (auto& isym_kvd : kv.kstars[ik % ik_list.size()])
+                    for (auto& isym_kvd: kv.kstars[ik % ik_list.size()])
                     {
-                        RI::Tensor<Tdata_m> mk_2D = RI_Util::Vector_to_Tensor<Tdata_m>(*mks_2D[ik_full + is_k * kv.get_nkstot_full()], pv.get_col_size(), pv.get_row_size());
+                        RI::Tensor<Tdata_m> mk_2D
+                            = RI_Util::Vector_to_Tensor<Tdata_m>(*mks_2D[ik_full + is_k * kv.get_nkstot_full()],
+                                                                 pv.get_col_size(),
+                                                                 pv.get_row_size());
                         const Tdata_m frac = SPIN_multiple
-                            * RI::Global_Func::convert<Tdata_m>(std::exp(
-                                -ModuleBase::TWO_PI * ModuleBase::IMAG_UNIT * ((isym_kvd.second * GlobalC::ucell.G) * (RI_Util::array3_to_Vector3(cell) * GlobalC::ucell.latvec))));
+                                             * RI::Global_Func::convert<Tdata_m>(std::exp(
+                                                 -ModuleBase::TWO_PI * ModuleBase::IMAG_UNIT
+                                                 * ((isym_kvd.second * GlobalC::ucell.G)
+                                                    * (RI_Util::array3_to_Vector3(cell) * GlobalC::ucell.latvec))));
                         set_mR_2D(mk_2D * frac);
                         ++ik_full;
                     }
                 }
             }
-			for(int iwt0_2D=0; iwt0_2D!=mR_2D.shape[0]; ++iwt0_2D)
-			{
-				const int iwt0 =ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)
-                    ? pv.local2global_col(iwt0_2D)
-                    : pv.local2global_row(iwt0_2D);
-				int iat0, iw0_b, is0_b;
-				std::tie(iat0,iw0_b,is0_b) = RI_2D_Comm::get_iat_iw_is_block(iwt0);
-				const int it0 = GlobalC::ucell.iat2it[iat0];
-				for(int iwt1_2D=0; iwt1_2D!=mR_2D.shape[1]; ++iwt1_2D)
-				{
-					const int iwt1 =ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)
-                        ? pv.local2global_row(iwt1_2D)
-                        : pv.local2global_col(iwt1_2D);
-					int iat1, iw1_b, is1_b;
-					std::tie(iat1,iw1_b,is1_b) = RI_2D_Comm::get_iat_iw_is_block(iwt1);
-					const int it1 = GlobalC::ucell.iat2it[iat1];
+            for (int iwt0_2D = 0; iwt0_2D != mR_2D.shape[0]; ++iwt0_2D)
+            {
+                const int iwt0 = ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)
+                                     ? pv.local2global_col(iwt0_2D)
+                                     : pv.local2global_row(iwt0_2D);
+                int iat0, iw0_b, is0_b;
+                std::tie(iat0, iw0_b, is0_b) = RI_2D_Comm::get_iat_iw_is_block(iwt0);
+                const int it0 = GlobalC::ucell.iat2it[iat0];
+                for (int iwt1_2D = 0; iwt1_2D != mR_2D.shape[1]; ++iwt1_2D)
+                {
+                    const int iwt1 = ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)
+                                         ? pv.local2global_row(iwt1_2D)
+                                         : pv.local2global_col(iwt1_2D);
+                    int iat1, iw1_b, is1_b;
+                    std::tie(iat1, iw1_b, is1_b) = RI_2D_Comm::get_iat_iw_is_block(iwt1);
+                    const int it1 = GlobalC::ucell.iat2it[iat1];
 
-					const int is_b = RI_2D_Comm::get_is_block(is_k, is0_b, is1_b);
-					RI::Tensor<Tdata> &mR_a2D = mRs_a2D[is_b][iat0][{iat1,cell}];
-                    if (mR_a2D.empty()) {
-                        mR_a2D = RI::Tensor<Tdata>(
-                            {static_cast<size_t>(GlobalC::ucell.atoms[it0].nw),
-                             static_cast<size_t>(
-                                 GlobalC::ucell.atoms[it1].nw)});
+                    const int is_b = RI_2D_Comm::get_is_block(is_k, is0_b, is1_b);
+                    RI::Tensor<Tdata>& mR_a2D = mRs_a2D[is_b][iat0][{iat1, cell}];
+                    if (mR_a2D.empty())
+                    {
+                        mR_a2D = RI::Tensor<Tdata>({static_cast<size_t>(GlobalC::ucell.atoms[it0].nw),
+                                                    static_cast<size_t>(GlobalC::ucell.atoms[it1].nw)});
                     }
-                    mR_a2D(iw0_b,iw1_b) = mR_2D(iwt0_2D, iwt1_2D);
-				}
-			}
+                    mR_a2D(iw0_b, iw1_b) = mR_2D(iwt0_2D, iwt1_2D);
+                }
+            }
         }
     }
-	ModuleBase::timer::tick("RI_2D_Comm", "split_m2D_ktoR");
-	return mRs_a2D;
+    ModuleBase::timer::tick("RI_2D_Comm", "split_m2D_ktoR");
+    return mRs_a2D;
 }
 
+template <typename Tdata, typename TK>
+void RI_2D_Comm::add_Hexx(const K_Vectors& kv,
+                          const int ik,
+                          const double alpha,
+                          const std::vector<std::map<TA, std::map<TAC, RI::Tensor<Tdata>>>>& Hs,
+                          const Parallel_Orbitals& pv,
+                          TK* hk)
+{
+    ModuleBase::TITLE("RI_2D_Comm", "add_Hexx");
+    ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx");
+
+    const std::map<int, std::vector<int>> is_list = {{1, {0}}, {2, {kv.isk[ik]}}, {4, {0, 1, 2, 3}}};
+    for (const int is_b: is_list.at(PARAM.inp.nspin))
+    {
+        int is0_b, is1_b;
+        std::tie(is0_b, is1_b) = RI_2D_Comm::split_is_block(is_b);
+        for (const auto& Hs_tmpA: Hs[is_b])
+        {
+            const TA& iat0 = Hs_tmpA.first;
+            for (const auto& Hs_tmpB: Hs_tmpA.second)
+            {
+                const TA& iat1 = Hs_tmpB.first.first;
+                const TC& cell1 = Hs_tmpB.first.second;
+                const std::complex<double> frac
+                    = alpha
+                      * std::exp(ModuleBase::TWO_PI * ModuleBase::IMAG_UNIT
+                                 * (kv.kvec_c[ik] * (RI_Util::array3_to_Vector3(cell1) * GlobalC::ucell.latvec)));
+                const RI::Tensor<Tdata>& H = Hs_tmpB.second;
+                for (size_t iw0_b = 0; iw0_b < H.shape[0]; ++iw0_b)
+                {
+                    const int iwt0 = RI_2D_Comm::get_iwt(iat0, iw0_b, is0_b);
+                    if (pv.global2local_row(iwt0) < 0)
+                    {
+                        continue;
+                    }
+                    for (size_t iw1_b = 0; iw1_b < H.shape[1]; ++iw1_b)
+                    {
+                        const int iwt1 = RI_2D_Comm::get_iwt(iat1, iw1_b, is1_b);
+                        if (pv.global2local_col(iwt1) < 0)
+                        {
+                            continue;
+                        }
+                        LCAO_domain::set_mat2d(iwt0,
+                                               iwt1,
+                                               RI::Global_Func::convert<TK>(H(iw0_b, iw1_b))
+                                                   * RI::Global_Func::convert<TK>(frac),
+                                               pv,
+                                               hk);
+                    }
+                }
+            }
+        }
+    }
+    ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx");
+}
 
-template<typename Tdata, typename TK>
-void RI_2D_Comm::add_Hexx(
-	const K_Vectors &kv,
-	const int ik,
-	const double alpha,
-	const std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>> &Hs,
-    const Parallel_Orbitals& pv,
-    TK* hk)
+template <typename Tdata, typename TK>
+void RI_2D_Comm::add_Hexx_td(const K_Vectors& kv,
+                          const int ik,
+                          const double alpha,
+                          const std::vector<std::map<TA, std::map<TAC, RI::Tensor<Tdata>>>>& Hs,
+                          const Parallel_Orbitals& pv,
+                          const ModuleBase::Vector3<double>& At,
+                          TK* hk)
 {
-	ModuleBase::TITLE("RI_2D_Comm","add_Hexx");
-	ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx");
+    ModuleBase::TITLE("RI_2D_Comm", "add_Hexx_td");
+    ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx_td");
+
+    const std::map<int, std::vector<int>> is_list = {{1, {0}}, {2, {kv.isk[ik]}}, {4, {0, 1, 2, 3}}};
+    for (const int is_b: is_list.at(PARAM.inp.nspin))
+    {
+        int is0_b, is1_b;
+        std::tie(is0_b, is1_b) = RI_2D_Comm::split_is_block(is_b);
+        for (const auto& Hs_tmpA: Hs[is_b])
+        {
+            const TA& iat0 = Hs_tmpA.first;
+            for (const auto& Hs_tmpB: Hs_tmpA.second)
+            {
+                const TA& iat1 = Hs_tmpB.first.first;
+                const TC& cell1 = Hs_tmpB.first.second;
+                const ModuleBase::Vector3<int> r_index = RI_Util::array3_to_Vector3(cell1);
+                // cal tddft phase for mixing gauge
+                ModuleBase::Vector3<double> dtau = GlobalC::ucell.cal_dtau(iat0, iat1, r_index);
+                const double arg_td = At * dtau * GlobalC::ucell.lat0;
+
+                const std::complex<double> frac
+                    = alpha
+                      * std::exp(ModuleBase::IMAG_UNIT
+                                 * ((ModuleBase::TWO_PI * kv.kvec_c[ik] * (r_index * GlobalC::ucell.latvec)) + arg_td));
 
-	const std::map<int, std::vector<int>> is_list = {{1,{0}}, {2,{kv.isk[ik]}}, {4,{0,1,2,3}}};
-	for(const int is_b : is_list.at(PARAM.inp.nspin))
-	{
-		int is0_b, is1_b;
-		std::tie(is0_b,is1_b) = RI_2D_Comm::split_is_block(is_b);
-		for(const auto &Hs_tmpA : Hs[is_b])
-		{
-			const TA &iat0 = Hs_tmpA.first;
-			for(const auto &Hs_tmpB : Hs_tmpA.second)
-			{
-				const TA &iat1 = Hs_tmpB.first.first;
-				const TC &cell1 = Hs_tmpB.first.second;
-				const std::complex<double> frac = alpha
-					* std::exp( ModuleBase::TWO_PI*ModuleBase::IMAG_UNIT * (kv.kvec_c[ik] * (RI_Util::array3_to_Vector3(cell1)*GlobalC::ucell.latvec)) );
-				const RI::Tensor<Tdata> &H = Hs_tmpB.second;
-				for(size_t iw0_b=0; iw0_b<H.shape[0]; ++iw0_b)
-				{
-					const int iwt0 = RI_2D_Comm::get_iwt(iat0, iw0_b, is0_b);
-                    if (pv.global2local_row(iwt0) < 0) {
+                const RI::Tensor<Tdata>& H = Hs_tmpB.second;
+                for (size_t iw0_b = 0; iw0_b < H.shape[0]; ++iw0_b)
+                {
+                    const int iwt0 = RI_2D_Comm::get_iwt(iat0, iw0_b, is0_b);
+                    if (pv.global2local_row(iwt0) < 0)
+                    {
                         continue;
                     }
-                    for(size_t iw1_b=0; iw1_b<H.shape[1]; ++iw1_b)
-					{
-						const int iwt1 = RI_2D_Comm::get_iwt(iat1, iw1_b, is1_b);
-                        if (pv.global2local_col(iwt1) < 0) {
+                    for (size_t iw1_b = 0; iw1_b < H.shape[1]; ++iw1_b)
+                    {
+                        const int iwt1 = RI_2D_Comm::get_iwt(iat1, iw1_b, is1_b);
+                        if (pv.global2local_col(iwt1) < 0)
+                        {
                             continue;
                         }
-                        LCAO_domain::set_mat2d(iwt0, iwt1, RI::Global_Func::convert<TK>(H(iw0_b, iw1_b)) * RI::Global_Func::convert<TK>(frac), pv, hk);
-					}
-				}
-			}
-		}
-	}
-	ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx");
+                        LCAO_domain::set_mat2d(iwt0,
+                                               iwt1,
+                                               RI::Global_Func::convert<TK>(H(iw0_b, iw1_b))
+                                                   * RI::Global_Func::convert<TK>(frac),
+                                               pv,
+                                               hk);
+                    }
+                }
+            }
+        }
+    }
+    ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx_td");
 }
 
 std::tuple<int, int, int> RI_2D_Comm::get_iat_iw_is_block(const int iwt)
 {
-	const int iat = GlobalC::ucell.iwt2iat[iwt];
-	const int iw = GlobalC::ucell.iwt2iw[iwt];
-	switch(PARAM.inp.nspin)
-	{
-		case 1: case 2:
-			return std::make_tuple(iat, iw, 0);
-		case 4:
-			return std::make_tuple(iat, iw/2, iw%2);
-		default:
-			throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
-	}
+    const int iat = GlobalC::ucell.iwt2iat[iwt];
+    const int iw = GlobalC::ucell.iwt2iw[iwt];
+    switch (PARAM.inp.nspin)
+    {
+    case 1:
+    case 2:
+        return std::make_tuple(iat, iw, 0);
+    case 4:
+        return std::make_tuple(iat, iw / 2, iw % 2);
+    default:
+        throw std::invalid_argument(std::string(__FILE__) + " line " + std::to_string(__LINE__));
+    }
 }
 
 int RI_2D_Comm::get_is_block(const int is_k, const int is_row_b, const int is_col_b)
 {
-	switch(PARAM.inp.nspin)
-	{
-		case 1:		return 0;
-		case 2:		return is_k;
-		case 4:		return is_row_b*2+is_col_b;
-		default:	throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
-	}
+    switch (PARAM.inp.nspin)
+    {
+    case 1:
+        return 0;
+    case 2:
+        return is_k;
+    case 4:
+        return is_row_b * 2 + is_col_b;
+    default:
+        throw std::invalid_argument(std::string(__FILE__) + " line " + std::to_string(__LINE__));
+    }
 }
 
 std::tuple<int, int> RI_2D_Comm::split_is_block(const int is_b)
 {
-	switch(PARAM.inp.nspin)
-	{
-		case 1:	case 2:
-			return std::make_tuple(0, 0);
-		case 4:
-			return std::make_tuple(is_b/2, is_b%2);
-		default:
-			throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
-	}
+    switch (PARAM.inp.nspin)
+    {
+    case 1:
+    case 2:
+        return std::make_tuple(0, 0);
+    case 4:
+        return std::make_tuple(is_b / 2, is_b % 2);
+    default:
+        throw std::invalid_argument(std::string(__FILE__) + " line " + std::to_string(__LINE__));
+    }
 }
 
 int RI_2D_Comm::get_iwt(const int iat, const int iw_b, const int is_b)
 {
-	const int it = GlobalC::ucell.iat2it[iat];
-	const int ia = GlobalC::ucell.iat2ia[iat];
-	int iw=-1;
-	switch(PARAM.inp.nspin)
-	{
-		case 1: case 2:
-			iw = iw_b;			break;
-		case 4:
-			iw = iw_b*2+is_b;	break;
-		default:
-			throw std::invalid_argument(std::string(__FILE__)+" line "+std::to_string(__LINE__));
-	}
-	const int iwt = GlobalC::ucell.itiaiw2iwt(it,ia,iw);
-	return iwt;
+    const int it = GlobalC::ucell.iat2it[iat];
+    const int ia = GlobalC::ucell.iat2ia[iat];
+    int iw = -1;
+    switch (PARAM.inp.nspin)
+    {
+    case 1:
+    case 2:
+        iw = iw_b;
+        break;
+    case 4:
+        iw = iw_b * 2 + is_b;
+        break;
+    default:
+        throw std::invalid_argument(std::string(__FILE__) + " line " + std::to_string(__LINE__));
+    }
+    const int iwt = GlobalC::ucell.itiaiw2iwt(it, ia, iw);
+    return iwt;
 }
 
-template<typename Tdata, typename TR>
-void RI_2D_Comm::add_HexxR(
-    const int current_spin,
-    const double alpha,
-    const std::vector<std::map<TA, std::map<TAC, RI::Tensor<Tdata>>>>& Hs,
-    const Parallel_Orbitals& pv,
-    const int npol,
-    hamilt::HContainer<TR>& hR,
-    const RI::Cell_Nearest<int, int, 3, double, 3>* const cell_nearest)
+template <typename Tdata, typename TR>
+void RI_2D_Comm::add_HexxR(const int current_spin,
+                           const double alpha,
+                           const std::vector<std::map<TA, std::map<TAC, RI::Tensor<Tdata>>>>& Hs,
+                           const Parallel_Orbitals& pv,
+                           const int npol,
+                           hamilt::HContainer<TR>& hR,
+                           const RI::Cell_Nearest<int, int, 3, double, 3>* const cell_nearest)
 {
     ModuleBase::TITLE("RI_2D_Comm", "add_HexxR");
     ModuleBase::timer::tick("RI_2D_Comm", "add_HexxR");
-    const std::map<int, std::vector<int>> is_list = { {1,{0}}, {2,{current_spin}}, {4,{0,1,2,3}} };
-    for (const int is_hs : is_list.at(PARAM.inp.nspin))
+    const std::map<int, std::vector<int>> is_list = {{1, {0}}, {2, {current_spin}}, {4, {0, 1, 2, 3}}};
+    for (const int is_hs: is_list.at(PARAM.inp.nspin))
     {
         int is0_b = 0, is1_b = 0;
         std::tie(is0_b, is1_b) = RI_2D_Comm::split_is_block(is_hs);
-        for (const auto& Hs_tmpA : Hs[is_hs])
+        for (const auto& Hs_tmpA: Hs[is_hs])
         {
             const TA& iat0 = Hs_tmpA.first;
-            for (const auto& Hs_tmpB : Hs_tmpA.second)
+            for (const auto& Hs_tmpB: Hs_tmpA.second)
             {
                 const TA& iat1 = Hs_tmpB.first.first;
                 const TC& cell = Hs_tmpB.first.second;
                 const Abfs::Vector3_Order<int> R = RI_Util::array3_to_Vector3(
-                    (cell_nearest ?
-                        cell_nearest->get_cell_nearest_discrete(iat0, iat1, cell)
-                        : cell));
+                    (cell_nearest ? cell_nearest->get_cell_nearest_discrete(iat0, iat1, cell) : cell));
                 hamilt::BaseMatrix<TR>* HlocR = hR.find_matrix(iat0, iat1, R.x, R.y, R.z);
                 if (HlocR == nullptr)
                 { // add R to HContainer
@@ -275,11 +363,11 @@ void RI_2D_Comm::add_HexxR(
                 auto row_indexes = pv.get_indexes_row(iat0);
                 auto col_indexes = pv.get_indexes_col(iat1);
                 const RI::Tensor<Tdata>& HexxR = (Tdata)alpha * Hs_tmpB.second;
-                for (int lw0_b = 0;lw0_b < row_indexes.size();lw0_b += npol)    // block
+                for (int lw0_b = 0; lw0_b < row_indexes.size(); lw0_b += npol) // block
                 {
                     const int& gw0 = row_indexes[lw0_b] / npol;
                     const int& lw0 = (npol == 2) ? (lw0_b + is0_b) : lw0_b;
-                    for (int lw1_b = 0;lw1_b < col_indexes.size();lw1_b += npol)
+                    for (int lw1_b = 0; lw1_b < col_indexes.size(); lw1_b += npol)
                     {
                         const int& gw1 = col_indexes[lw1_b] / npol;
                         const int& lw1 = (npol == 2) ? (lw1_b + is1_b) : lw1_b;
diff --git a/source/module_ri/ewald_Vq.h b/source/module_ri/ewald_Vq.h
index 3fcd1f49f4..b84c98bde1 100644
--- a/source/module_ri/ewald_Vq.h
+++ b/source/module_ri/ewald_Vq.h
@@ -85,7 +85,6 @@ class Ewald_Vq
     const int nspin0 = std::map<int, int>{{1, 1}, {2, 2}, {4, 1}}.at(PARAM.inp.nspin);
     int nks0;
     std::vector<TA> atoms_vec;
-    std::set<TA> atoms;
     std::map<std::string, double> parameter;
 
     std::vector<std::vector<std::vector<Numerical_Orbital_Lm>>> g_lcaos;
diff --git a/source/module_ri/ewald_Vq.hpp b/source/module_ri/ewald_Vq.hpp
index f4cd9a1ffe..57b1d09548 100644
--- a/source/module_ri/ewald_Vq.hpp
+++ b/source/module_ri/ewald_Vq.hpp
@@ -65,7 +65,6 @@ void Ewald_Vq<Tdata>::init(const LCAO_Orbitals& orb,
 
     this->atoms_vec.resize(GlobalC::ucell.nat);
     std::iota(this->atoms_vec.begin(), this->atoms_vec.end(), 0);
-    this->atoms.insert(this->atoms_vec.begin(), this->atoms_vec.end());
     this->nmp = {this->p_kv->nmp[0], this->p_kv->nmp[1], this->p_kv->nmp[2]};
 
     ModuleBase::timer::tick("Ewald_Vq", "init");
@@ -216,18 +215,14 @@ auto Ewald_Vq<Tdata>::cal_dVs_minus_gauss(const std::vector<TA>& list_A0,
 template <typename Tdata>
 double Ewald_Vq<Tdata>::cal_V_Rcut(const int it0, const int it1)
 {
-    // return this->g_abfs_ccp_rcut[it0] + this->g_lcaos_rcut[it1];
-    return this->g_lcaos_rcut[it0] + this->g_lcaos_rcut[it1];
+    return this->g_abfs_ccp_rcut[it0] + this->g_lcaos_rcut[it1];
 }
 
 template <typename Tdata>
 double Ewald_Vq<Tdata>::get_Rcut_max(const int it0, const int it1)
 {
-    double lcaos_rmax = this->lcaos_rcut[it0] //* this->info.ccp_rmesh_times
-                        + this->lcaos_rcut[it1];
-    double g_lcaos_rmax = this->g_lcaos_rcut[it0] //* this->info.ccp_rmesh_times
-                          + this->g_lcaos_rcut[it1];
-
+    double lcaos_rmax = this->lcaos_rcut[it0] * this->info.ccp_rmesh_times + this->lcaos_rcut[it1];
+    double g_lcaos_rmax = this->g_lcaos_rcut[it0] * this->info.ccp_rmesh_times + this->g_lcaos_rcut[it1];
     return std::min(lcaos_rmax, g_lcaos_rmax);
 }
 
@@ -552,7 +547,6 @@ auto Ewald_Vq<Tdata>::set_Vq_dVq_minus_gauss(const std::vector<TA>& list_A0,
     //           std::chrono::microseconds::period::num
     //                  / std::chrono::microseconds::period::den
     //           << " s" << std::endl;
-
     ModuleBase::timer::tick("Ewald_Vq", "set_Vq_dVq_minus_gauss");
     return datas;
 }
@@ -641,16 +635,32 @@ auto Ewald_Vq<Tdata>::set_Vq_dVq(const std::vector<TA>& list_A0_pair_k,
     const int shift_for_mpi = std::floor(this->nks0 / 2.0);
 
     // MPI: {ia0, {ia1, R}} to {ia0, ia1}
+    std::set<TA> atoms00;
+    std::set<TA> atoms01;
+    for (const auto& outerPair: Vs_dVs_minus_gauss_in)
+    {
+        atoms00.insert(outerPair.first);
+
+        for (const auto& innerPair: outerPair.second)
+            atoms01.insert(innerPair.first.first);
+    }
     std::map<TA, std::map<TAC, Tin>> Vs_dVs_minus_gauss
-        = RI_2D_Comm::comm_map2_first(this->mpi_comm, Vs_dVs_minus_gauss_in, this->atoms, this->atoms);
+        = RI_2D_Comm::comm_map2_first(this->mpi_comm, Vs_dVs_minus_gauss_in, atoms00, atoms01);
     std::map<TA, std::map<TAK, Tout>> Vq_dVq_minus_gauss = func_cal_Vq_dVq_minus_gauss(Vs_dVs_minus_gauss); //{ia0, ia1}
 
     // MPI: {ia0, {ia1, k}} to {ia0, ia1}
     std::map<TA, std::map<TAK, Tout>> Vq_dVq_gauss_out = func_cal_Vq_dVq_gauss(shift_for_mpi); //{ia0, {ia1, k}}
-    std::map<TA, std::map<TAK, Tout>> Vq_dVq_gauss = RI_2D_Comm::comm_map2_first(this->mpi_comm,
-                                                                                 Vq_dVq_gauss_out,
-                                                                                 this->atoms,
-                                                                                 this->atoms); //{ia0, ia1}
+    std::set<TA> atoms10;
+    std::set<TA> atoms11;
+    for (const auto& outerPair: Vq_dVq_gauss_out)
+    {
+        atoms10.insert(outerPair.first);
+
+        for (const auto& innerPair: outerPair.second)
+            atoms11.insert(innerPair.first.first);
+    }
+    std::map<TA, std::map<TAK, Tout>> Vq_dVq_gauss
+        = RI_2D_Comm::comm_map2_first(this->mpi_comm, Vq_dVq_gauss_out, atoms10, atoms11); //{ia0, ia1}
 
 #pragma omp parallel
     for (size_t i0 = 0; i0 < list_A0_pair_k.size(); ++i0)
@@ -704,7 +714,8 @@ auto Ewald_Vq<Tdata>::set_Vq_dVq(const std::vector<TA>& list_A0_pair_k,
             }
 
 #pragma omp critical(Ewald_Vq_set_Vq_dVq)
-            Vq_dVq[list_A0_pair_k[i0]][re_index] = Vq_dVq_minus_gauss[list_A0_pair_k[i0]][re_index] + data;
+            if (LRI_CV_Tools::exist(Vq_dVq_minus_gauss[list_A0_pair_k[i0]][re_index]))
+                Vq_dVq[list_A0_pair_k[i0]][re_index] = Vq_dVq_minus_gauss[list_A0_pair_k[i0]][re_index] + data;
         }
     }
 
@@ -776,19 +787,21 @@ auto Ewald_Vq<Tdata>::set_Vs_dVs(const std::vector<TA>& list_A0_pair_R,
                     const TA iat0 = list_A0_pair_R[i0];
                     const TA iat1 = list_A1_pair_R[i1].first;
                     const TC& cell1 = list_A1_pair_R[i1].second;
-
                     const std::complex<double> frac
                         = std::exp(-ModuleBase::TWO_PI * ModuleBase::IMAG_UNIT
                                    * (this->kvec_c[ik] * (RI_Util::array3_to_Vector3(cell1) * GlobalC::ucell.latvec)))
                           * cfrac;
 
                     const TAK index = std::make_pair(iat1, std::array<int, 1>{static_cast<int>(ik)});
-                    Tout Vq_tmp = LRI_CV_Tools::convert<Tin_convert>(LRI_CV_Tools::mul2(frac, Vq[iat0][index]));
+                    if (LRI_CV_Tools::exist(Vq[iat0][index]))
+                    {
+                        Tout Vq_tmp = LRI_CV_Tools::convert<Tin_convert>(LRI_CV_Tools::mul2(frac, Vq[iat0][index]));
 
-                    if (!LRI_CV_Tools::exist(local_datas[iat0][list_A1_pair_R[i1]]))
-                        local_datas[iat0][list_A1_pair_R[i1]] = Vq_tmp;
-                    else
-                        local_datas[iat0][list_A1_pair_R[i1]] = local_datas[iat0][list_A1_pair_R[i1]] + Vq_tmp;
+                        if (!LRI_CV_Tools::exist(local_datas[iat0][list_A1_pair_R[i1]]))
+                            local_datas[iat0][list_A1_pair_R[i1]] = Vq_tmp;
+                        else
+                            local_datas[iat0][list_A1_pair_R[i1]] = local_datas[iat0][list_A1_pair_R[i1]] + Vq_tmp;
+                    }
                 }
             }
         }