diff --git a/CMakeLists.txt b/CMakeLists.txt
index 37302f998f..6f8fc6fe0e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -331,9 +331,13 @@ endif()
 
 if(USE_CUDA)
   cmake_minimum_required(VERSION 3.18) # required by `CUDA_ARCHITECTURES` below
+
+  # Always find CUDAToolkit to get CUDAToolkit_VERSION
+  find_package(CUDAToolkit REQUIRED)
+
   set_if_higher(CMAKE_CXX_STANDARD 14)
-  if(CUDA_VERSION VERSION_GREATER_EQUAL "13.0")
-    message(STATUS "CUDA ${CUDA_VERSION} detected. Setting CMAKE_CUDA_STANDARD to 17.")
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0")
+    message(STATUS "CUDA ${CUDAToolkit_VERSION} detected. Setting CMAKE_CUDA_STANDARD to 17.")
     set_if_higher(CMAKE_CXX_STANDARD 17)
   endif()
   set(CMAKE_CXX_EXTENSIONS ON)
@@ -341,16 +345,24 @@ if(USE_CUDA)
   set(CMAKE_CUDA_STANDARD_REQUIRED ON)
   set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    find_package(CUDAToolkit REQUIRED)
     # check
     # https://gitlab.kitware.com/cmake/cmake/-/blob/master/Modules/Internal/CMakeCUDAArchitecturesAll.cmake
-    # for available architechures in different CUDA versions
-    set(CMAKE_CUDA_ARCHITECTURES
-        60 # P100
-        70 # V100
-        # Add your CUDA arch here Check the Compute Capability version of your
-        # GPU at: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
-    )
+    # for available architectures in different CUDA versions
+
+    # CUDA 13.0+ dropped support for architectures below 75
+    if(CUDAToolkit_VERSION VERSION_LESS "13.0")
+      set(CMAKE_CUDA_ARCHITECTURES
+          60 # P100
+          70 # V100
+      )
+    else()
+      # Start with empty list; architectures 75+ will be added below
+      set(CMAKE_CUDA_ARCHITECTURES)
+    endif()
+
+    # Add your CUDA arch here Check the Compute Capability version of your
+    # GPU at: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
+
     if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.0)
       list(APPEND CMAKE_CUDA_ARCHITECTURES 75) # T4
     endif()
diff --git a/docs/advanced/acceleration/cuda.md b/docs/advanced/acceleration/cuda.md
index 5a468e7fb4..70e448f4e0 100644
--- a/docs/advanced/acceleration/cuda.md
+++ b/docs/advanced/acceleration/cuda.md
@@ -45,7 +45,7 @@ We provides [examples](https://github.com/deepmodeling/abacus-develop/tree/devel
 ## Known limitations
 PW basis:
 - Only k point parallelization is supported, so the input keyword `kpar` will be set to match the number of MPI tasks automatically.
-- By default, CUDA architectures 60, 70, 75, 80, 86, and 89 are compiled (if supported). It can be overriden using the CMake variable [`CMAKE_CUDA_ARCHITECTURES`](https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html) or the environmental variable [`CUDAARCHS`](https://cmake.org/cmake/help/latest/envvar/CUDAARCHS.html).
+- By default, CUDA architectures are automatically selected based on the CUDA Toolkit version. For CUDA versions before 13.0, architectures 60, 70, 75, 80, 86, 89, and 90 are compiled (if supported by the CUDA version). For CUDA 13.0 and later, only architectures 75 and above are compiled, as CUDA 13 dropped support for older architectures. This can be overridden using the CMake variable [`CMAKE_CUDA_ARCHITECTURES`](https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html) or the environmental variable [`CUDAARCHS`](https://cmake.org/cmake/help/latest/envvar/CUDAARCHS.html).
 
 LCAO basis:
 - Unless there is a specific reason, avoid using multiple GPUs, as it can be slower than using a single GPU. This is because the generalized eigenvalue solution of the LCAO basis set will incur additional communication overhead when calculated on multiple cards. When the memory limit of a GPU card makes it insufficient to complete the task, it is recommended to use multiple cards for calculation.