diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp index e4fc7cf159f..4eff5f947fb 100644 --- a/Common/include/linear_algebra/CPreconditioner.hpp +++ b/Common/include/linear_algebra/CPreconditioner.hpp @@ -77,6 +77,18 @@ class CPreconditioner { template CPreconditioner::~CPreconditioner() {} +/*! + * \class CIdentityPreconditioner + * \brief No-op preconditioner used when Krylov solvers run without preconditioning. + */ +template +class CIdentityPreconditioner final : public CPreconditioner { + public: + inline void operator()(const CSysVector& u, CSysVector& v) const override { v = u; } + + inline bool IsIdentity() const override { return true; } +}; + /*! * \class CJacobiPreconditioner * \brief Specialization of preconditioner that uses CSysMatrix class. @@ -332,6 +344,9 @@ CPreconditioner* CPreconditioner::Create(ENUM_LINEAR_SOL CPreconditioner* prec = nullptr; switch (kind) { + case NO_PRECONDITIONER: + prec = new CIdentityPreconditioner(); + break; case JACOBI: prec = new CJacobiPreconditioner(jacobian, geometry, config); break; diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index ecb26959a06..6368670587e 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -108,6 +108,16 @@ struct CSysMatrixComms { MPI_QUANTITIES commType = MPI_QUANTITIES::SOLUTION_MATRIX); }; +/*! + * \brief Opaque storage for CUDA/cuSPARSE resources used by the GPU SpMV path. + */ +struct CudaSpMVResources; + +/*! + * \brief Release cached CUDA/cuSPARSE resources used by the GPU SpMV path. + */ +void ReleaseCudaSpMVResources(CudaSpMVResources*& resources); + /*! * \class CSysMatrix * \ingroup SpLinSys @@ -148,8 +158,10 @@ class CSysMatrix { ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */ const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */ const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */ + ScalarType* d_invM = nullptr; /*!< \brief Device inverse diagonal blocks for the Jacobi preconditioner. */ bool useCuda = false; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. Mainly used to conditionally free GPU memory in the class destructor. */ + mutable CudaSpMVResources* spmv_resources = nullptr; /*!< \brief Cached cuSPARSE resources for GPU SpMV. */ ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */ unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */ @@ -924,6 +936,13 @@ class CSysMatrix { */ void BuildJacobiPreconditioner(); + /*! + * \brief Build the Jacobi preconditioner on the GPU/device side. + * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers. + * The actual implementation belongs in CSysMatrixGPU.cu. + */ + void BuildJacobiPreconditionerGPU(); + /*! * \brief Multiply CSysVector by the preconditioner * \param[in] vec - CSysVector to be multiplied by the preconditioner. @@ -934,6 +953,14 @@ class CSysMatrix { void ComputeJacobiPreconditioner(const CSysVector& vec, CSysVector& prod, CGeometry* geometry, const CConfig* config) const; + /*! + * \brief Apply the Jacobi preconditioner on the GPU/device side. + * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers. + * The actual implementation belongs in CSysMatrixGPU.cu. + */ + void ComputeJacobiPreconditionerGPU(const CSysVector& vec, CSysVector& prod, + CGeometry* geometry, const CConfig* config) const; + /*! * \brief Build the ILU preconditioner. */ diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp index 2ea3cbf7df3..67033992372 100644 --- a/Common/include/linear_algebra/CSysSolve.hpp +++ b/Common/include/linear_algebra/CSysSolve.hpp @@ -304,6 +304,16 @@ class CSysSolve { ScalarType& residual, bool monitoring, const CConfig* config, FgcrodrMode mode, unsigned long custom_m) const; + /*! + * \brief CUDA/GPU implementation of the Flexible GMRES solver. + * \note This helper exists so the public solver interface can remain unchanged while the + * implementation is dispatched internally based on the runtime CUDA setting. + * The actual implementation lives in CSysSolveGPU.cu. + */ + unsigned long FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x, const ProductType& mat_vec, + const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual, + bool monitoring, const CConfig* config) const; + /*! * \brief Creates the inner solver for nested preconditioning if the settings allow it. * \returns True if the inner solver can be used. diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp index 1498b549bbb..decffe26d29 100644 --- a/Common/include/linear_algebra/CSysVector.hpp +++ b/Common/include/linear_algebra/CSysVector.hpp @@ -240,6 +240,45 @@ class CSysVector : public VecExpr::CVecExpr, ScalarType> */ void GPUSetVal(ScalarType val, bool trigger = true) const; + /*! + * \brief Copy another vector into this vector on the device. + * \note This is an explicit GPU helper for Krylov solver implementations that keep the + * iteration state resident on the device. The implementation belongs in + * CSysVectorGPU.cu. + * \param[in] src - Source vector. + */ + void GPUCopy(const CSysVector& src) const; + + /*! + * \brief Scale this vector on the device. + * \note Explicit GPU helper for solver-side vector operations. + * \param[in] alpha - Scalar multiplier. + */ + void GPUScale(ScalarType alpha) const; + + /*! + * \brief Perform the AXPY operation on the device: this := this + alpha * x. + * \note Explicit GPU helper for solver-side vector operations. + * \param[in] alpha - Scalar multiplier. + * \param[in] x - Input vector. + */ + void GPUAxpy(ScalarType alpha, const CSysVector& x) const; + + /*! + * \brief Dot product between this vector and another vector on the device. + * \note Explicit GPU helper for solver-side reductions. + * \param[in] other - Input vector. + * \return Dot product result. + */ + ScalarType GPUDot(const CSysVector& other) const; + + /*! + * \brief L2 norm of this vector on the device. + * \note Explicit GPU helper for solver-side reductions. + * \return L2 norm result. + */ + ScalarType GPUNorm() const; + /*! * \brief return device pointer that points to the CSysVector values in GPU memory */ diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index 13854381877..eb727477f21 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -25,6 +25,11 @@ * License along with SU2. If not, see . */ +#pragma once + +#ifndef SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH +#define SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH + #include #include @@ -51,3 +56,5 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t } #define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); } + +#endif // SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp index 4c521c29239..65ba6dfa1ab 100644 --- a/Common/include/option_structure.hpp +++ b/Common/include/option_structure.hpp @@ -2517,6 +2517,7 @@ enum ENUM_LINEAR_SOLVER_PREC { PASTIX_ILU=10, /*!< \brief PaStiX ILU(k) preconditioner. */ PASTIX_LU_P, /*!< \brief PaStiX LU as preconditioner. */ PASTIX_LDLT_P, /*!< \brief PaStiX LDLT as preconditioner. */ + NO_PRECONDITIONER=20, /*!< \brief No preconditioner. */ }; static const MapType Linear_Solver_Prec_Map = { MakePair("JACOBI", JACOBI) @@ -2526,6 +2527,7 @@ static const MapType Linear_Solver_Prec_Ma MakePair("PASTIX_ILU", PASTIX_ILU) MakePair("PASTIX_LU", PASTIX_LU_P) MakePair("PASTIX_LDLT", PASTIX_LDLT_P) + MakePair("NONE", NO_PRECONDITIONER) }; /*! diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 00ef51c2023..fafa093734e 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -54,6 +54,7 @@ CSysMatrix::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G col_ind_ilu = nullptr; invM = nullptr; + d_invM = nullptr; #ifdef USE_MKL MatrixMatrixProductJitter = nullptr; @@ -67,6 +68,8 @@ template CSysMatrix::~CSysMatrix() { SU2_ZONE_SCOPED + ReleaseCudaSpMVResources(spmv_resources); + delete[] omp_partitions; MemoryAllocation::aligned_free(ILU_matrix); MemoryAllocation::aligned_free(matrix); @@ -76,6 +79,7 @@ CSysMatrix::~CSysMatrix() { GPUMemoryAllocation::gpu_free(d_matrix); GPUMemoryAllocation::gpu_free(d_row_ptr); GPUMemoryAllocation::gpu_free(d_col_ind); + GPUMemoryAllocation::gpu_free(d_invM); } #ifdef USE_MKL @@ -86,6 +90,10 @@ CSysMatrix::~CSysMatrix() { #endif } +#ifndef HAVE_CUDA +void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) { resources = nullptr; } +#endif + template void CSysMatrix::Initialize(unsigned long npoint, unsigned long npointdomain, unsigned short nvar, unsigned short neqn, bool EdgeConnect, CGeometry* geometry, @@ -196,6 +204,10 @@ void CSysMatrix::Initialize(unsigned long npoint, unsigned long npoi if (diag_needed) allocAndInit(invM, nPointDomain * nVar * nEqn); + if (useCuda && diag_needed) { + d_invM = GPUMemoryAllocation::gpu_alloc(nPointDomain * nVar * nEqn * sizeof(ScalarType)); + } + /*--- Thread parallel initialization. ---*/ int num_threads = omp_get_max_threads(); @@ -672,6 +684,19 @@ void CSysMatrix::MatrixVectorProduct(const CSysVector& v template void CSysMatrix::BuildJacobiPreconditioner() { SU2_ZONE_SCOPED + + if (useCuda) { +#ifdef HAVE_CUDA + BuildJacobiPreconditionerGPU(); + return; +#else + SU2_MPI::Error( + "\nError in building Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options " + "enabled in Meson to access GPU Functions", + CURRENT_FUNCTION); +#endif + } + /*--- Build Jacobi preconditioner (M = D), compute and store the inverses of the diagonal blocks. ---*/ SU2_OMP_FOR_DYN(omp_heavy_size) for (unsigned long iPoint = 0; iPoint < nPointDomain; iPoint++) @@ -684,6 +709,19 @@ void CSysMatrix::ComputeJacobiPreconditioner(const CSysVector& prod, CGeometry* geometry, const CConfig* config) const { SU2_ZONE_SCOPED + + if (config->GetCUDA()) { +#ifdef HAVE_CUDA + ComputeJacobiPreconditionerGPU(vec, prod, geometry, config); + return; +#else + SU2_MPI::Error( + "\nError in applying Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options " + "enabled in Meson to access GPU Functions", + CURRENT_FUNCTION); +#endif + } + /*--- Apply Jacobi preconditioner, y = D^{-1} * x, the inverse of the diagonal is already known. ---*/ SU2_OMP_BARRIER SU2_OMP_FOR_DYN(omp_heavy_size) diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index a3f1a77ca40..b8bb16ce5a9 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -55,6 +55,38 @@ inline cusparseIndexType_t GetCusparseIndexType() { } } +struct CudaSpMVResources { + cusparseHandle_t handle = nullptr; + cusparseConstSpMatDescr_t mat = nullptr; + size_t buffer_size = 0; + void* buffer = nullptr; +}; + +void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) { + if (resources == nullptr) { + return; + } + + if (resources->buffer != nullptr) { + gpuErrChk(cudaFree(resources->buffer)); + resources->buffer = nullptr; + resources->buffer_size = 0; + } + + if (resources->mat != nullptr) { + cusparseErrChk(cusparseDestroySpMat(resources->mat)); + resources->mat = nullptr; + } + + if (resources->handle != nullptr) { + cusparseErrChk(cusparseDestroy(resources->handle)); + resources->handle = nullptr; + } + + delete resources; + resources = nullptr; +} + template constexpr cudaDataType GetCudaDataType() { if constexpr (std::is_same::value) { @@ -83,8 +115,6 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector ScalarType* d_vec = vec.GetDevicePointer(); ScalarType* d_prod = prod.GetDevicePointer(); - vec.HtDTransfer(); - const auto indexType = GetCusparseIndexType(); const auto valueType = GetCudaDataType(); @@ -100,42 +130,56 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector const ScalarType alpha = 1.0; const ScalarType beta = 0.0; - cusparseHandle_t handle = nullptr; - cusparseConstSpMatDescr_t matA = nullptr; - cusparseDnVecDescr_t vecX = nullptr; - cusparseDnVecDescr_t vecY = nullptr; + if (spmv_resources == nullptr) { + spmv_resources = new CudaSpMVResources; - cusparseErrChk(cusparseCreate(&handle)); + cusparseErrChk(cusparseCreate(&spmv_resources->handle)); - cusparseErrChk(cusparseCreateConstBsr(&matA, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, d_col_ind, d_matrix, - indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, valueType, CUSPARSE_ORDER_ROW)); + cusparseErrChk(cusparseCreateConstBsr(&spmv_resources->mat, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, + d_col_ind, d_matrix, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, + valueType, CUSPARSE_ORDER_ROW)); + } + + cusparseDnVecDescr_t vecX = nullptr; + cusparseDnVecDescr_t vecY = nullptr; cusparseErrChk(cusparseCreateDnVec(&vecX, xSize, d_vec, valueType)); cusparseErrChk(cusparseCreateDnVec(&vecY, ySize, d_prod, valueType)); - size_t bufferSize = 0; - void* dBuffer = nullptr; + size_t required_buffer_size = 0; - cusparseErrChk(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, - valueType, CUSPARSE_SPMV_BSR_ALG1, &bufferSize)); + cusparseErrChk(cusparseSpMV_bufferSize(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + spmv_resources->mat, vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1, + &required_buffer_size)); - if (bufferSize > 0) { - gpuErrChk(cudaMalloc(&dBuffer, bufferSize)); - } + if (required_buffer_size > spmv_resources->buffer_size) { + if (spmv_resources->buffer != nullptr) { + gpuErrChk(cudaFree(spmv_resources->buffer)); + } - cusparseErrChk(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, valueType, - CUSPARSE_SPMV_BSR_ALG1, dBuffer)); + if (required_buffer_size > 0) { + gpuErrChk(cudaMalloc(&spmv_resources->buffer, required_buffer_size)); + } else { + spmv_resources->buffer = nullptr; + } - if (dBuffer != nullptr) { - gpuErrChk(cudaFree(dBuffer)); + spmv_resources->buffer_size = required_buffer_size; } + cusparseErrChk(cusparseSpMV(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmv_resources->mat, + vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1, spmv_resources->buffer)); + cusparseErrChk(cusparseDestroyDnVec(vecY)); cusparseErrChk(cusparseDestroyDnVec(vecX)); - cusparseErrChk(cusparseDestroySpMat(matA)); - cusparseErrChk(cusparseDestroy(handle)); - - prod.DtHTransfer(); } - -template class CSysMatrix; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. +template void CSysMatrix::HtDTransfer(bool trigger) const; +template void CSysMatrix::GPUMatrixVectorProduct(const CSysVector& vec, + CSysVector& prod, CGeometry* geometry, + const CConfig* config) const; + +#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION) +template void CSysMatrix::HtDTransfer(bool trigger) const; +template void CSysMatrix::GPUMatrixVectorProduct(const CSysVector& vec, + CSysVector& prod, CGeometry* geometry, + const CConfig* config) const; +#endif diff --git a/Common/src/linear_algebra/CSysPreconditionerGPU.cu b/Common/src/linear_algebra/CSysPreconditionerGPU.cu new file mode 100644 index 00000000000..adee93264eb --- /dev/null +++ b/Common/src/linear_algebra/CSysPreconditionerGPU.cu @@ -0,0 +1,109 @@ +/*! + * \file CSysPreconditionerGPU.cu + * \brief CUDA/GPU skeleton implementations for matrix-based preconditioners. + * \author Jesse Li + * \version 8.5.0 "Harrier" + * + * SU2 Project Website: https://su2code.github.io + * + * The SU2 Project is maintained by the SU2 Foundation + * (http://su2foundation.org) + * + * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md) + * + * SU2 is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * SU2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with SU2. If not, see . + */ + +#include "../../include/linear_algebra/CSysMatrix.inl" +#include "../../include/linear_algebra/GPUComms.cuh" + +namespace { + +template +__global__ void ApplyJacobiPreconditionerKernel(const ScalarType* invM, const ScalarType* vec, ScalarType* prod, + unsigned long nPointDomain, unsigned long nVar) { + const auto iPoint = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (iPoint >= nPointDomain) return; + + const auto block = &invM[iPoint * nVar * nVar]; + const auto rhs = &vec[iPoint * nVar]; + auto out = &prod[iPoint * nVar]; + + for (auto iVar = 0ul; iVar < nVar; ++iVar) { + ScalarType sum = ScalarType(0); + for (auto jVar = 0ul; jVar < nVar; ++jVar) { + sum += block[iVar * nVar + jVar] * rhs[jVar]; + } + out[iVar] = sum; + } +} + +} // namespace + +template +void CSysMatrix::BuildJacobiPreconditionerGPU() { + SU2_ZONE_SCOPED + + if (nVar != nEqn) { + SU2_MPI::Error("CUDA Jacobi preconditioner requires square blocks.", CURRENT_FUNCTION); + } + + if (invM == nullptr) { + SU2_MPI::Error("CUDA Jacobi preconditioner was requested without host inverse block storage.", CURRENT_FUNCTION); + } + + if (d_invM == nullptr) { + d_invM = GPUMemoryAllocation::gpu_alloc(nPointDomain * nVar * nEqn * sizeof(ScalarType)); + } + + for (auto iPoint = 0ul; iPoint < nPointDomain; ++iPoint) { + InverseDiagonalBlock(iPoint, &(invM[iPoint * nVar * nVar])); + } + + gpuErrChk(cudaMemcpy(d_invM, invM, nPointDomain * nVar * nVar * sizeof(ScalarType), cudaMemcpyHostToDevice)); +} + +template +void CSysMatrix::ComputeJacobiPreconditionerGPU(const CSysVector& vec, + CSysVector& prod, CGeometry* geometry, + const CConfig* config) const { + (void)geometry; + (void)config; + + SU2_ZONE_SCOPED + + if (d_invM == nullptr) { + SU2_MPI::Error("CUDA Jacobi preconditioner used before BuildJacobiPreconditionerGPU.", CURRENT_FUNCTION); + } + + constexpr unsigned threadsPerBlock = 128; + const auto blocks = static_cast((nPointDomain + threadsPerBlock - 1) / threadsPerBlock); + ApplyJacobiPreconditionerKernel<<>>(d_invM, vec.GetDevicePointer(), prod.GetDevicePointer(), + nPointDomain, nVar); + gpuErrChk(cudaPeekAtLastError()); +} + +template void CSysMatrix::BuildJacobiPreconditionerGPU(); +template void CSysMatrix::ComputeJacobiPreconditionerGPU(const CSysVector& vec, + CSysVector& prod, + CGeometry* geometry, + const CConfig* config) const; + +#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION) +template void CSysMatrix::BuildJacobiPreconditionerGPU(); +template void CSysMatrix::ComputeJacobiPreconditionerGPU(const CSysVector& vec, + CSysVector& prod, + CGeometry* geometry, + const CConfig* config) const; +#endif diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp index cd100f311ab..3eba623f051 100644 --- a/Common/src/linear_algebra/CSysSolve.cpp +++ b/Common/src/linear_algebra/CSysSolve.cpp @@ -421,6 +421,17 @@ unsigned long CSysSolve::FGMRES_LinSolver(const CSysVectorGetCUDA()) { +#ifdef HAVE_CUDA + return FGMRES_LinSolver_GPU(b, x, mat_vec, precond, tol, m, residual, monitoring, config); +#else + SU2_MPI::Error( + "\nError in launching FGMRES solver\nENABLE_CUDA is set to YES\nPlease compile with CUDA options enabled " + "in Meson to access GPU Functions", + CURRENT_FUNCTION); +#endif + } + const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE); const bool flexible = !precond.IsIdentity(); /*--- If we call the solver outside of a parallel region, but the number of threads allows, diff --git a/Common/src/linear_algebra/CSysSolveGPU.cu b/Common/src/linear_algebra/CSysSolveGPU.cu new file mode 100644 index 00000000000..cb1ead10d89 --- /dev/null +++ b/Common/src/linear_algebra/CSysSolveGPU.cu @@ -0,0 +1,242 @@ +/*! + * \file CSysSolveGPU.cu + * \brief CUDA/GPU skeleton implementations for Krylov solvers. + * \author Jesse Li + * \version 8.5.0 "Harrier" + * + * SU2 Project Website: https://su2code.github.io + * + * The SU2 Project is maintained by the SU2 Foundation + * (http://su2foundation.org) + * + * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md) + * + * SU2 is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * SU2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with SU2. If not, see . + */ + +#include "../../include/linear_algebra/CSysSolve.hpp" +#include "../../include/linear_algebra/CMatrixVectorProduct.hpp" +#include "../../include/linear_algebra/CPreconditioner.hpp" +#include "../../include/linear_algebra/GPUComms.cuh" + +#include +#include + +void SU2_GPU_BeginSolverBLASContext(); +void SU2_GPU_EndSolverBLASContext(); + +namespace { + +class CGPUSolverBLASContextGuard { + public: + CGPUSolverBLASContextGuard() { SU2_GPU_BeginSolverBLASContext(); } + + ~CGPUSolverBLASContextGuard() { SU2_GPU_EndSolverBLASContext(); } + + CGPUSolverBLASContextGuard(const CGPUSolverBLASContextGuard&) = delete; + CGPUSolverBLASContextGuard& operator=(const CGPUSolverBLASContextGuard&) = delete; +}; + +} // namespace + +template +unsigned long CSysSolve::FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x, + const ProductType& mat_vec, const PrecondType& precond, + ScalarType tol, unsigned long m, ScalarType& residual, + bool monitoring, const CConfig* config) const { + SU2_ZONE_SCOPED + + const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE); + const bool flexible = !precond.IsIdentity(); + + if (m < 1) { + SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION); + } + + if (m > 1000) { + SU2_MPI::Error("FGMRES subspace is too large (>1000).", CURRENT_FUNCTION); + } + + CGPUSolverBLASContextGuard blas_context; + if (V.size() <= m || (flexible && Z.size() <= m)) { + BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS { + V.resize(m + 1); + for (auto& w : V) w.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr); + if (flexible) { + Z.resize(m + 1); + for (auto& z : Z) z.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr); + } + } + END_SU2_OMP_SAFE_GLOBAL_ACCESS + } + + su2vector g(m + 1), sn(m + 1), cs(m + 1), y(m); + g = ScalarType(0); + sn = ScalarType(0); + cs = ScalarType(0); + y = ScalarType(0); + + su2matrix H(m + 1, m); + H = ScalarType(0); + + /*--- b and x enter from the host side. The solver owns all subsequent + * synchronization for vectors it updates with GPU primitives. ---*/ + b.HtDTransfer(); + if (xIsZero) { + x.GPUSetVal(ScalarType(0)); + } else { + x.HtDTransfer(); + } + + ScalarType norm0 = b.GPUNorm(); + + if (!xIsZero) { + mat_vec(x, V[0]); + V[0].GPUAxpy(ScalarType(-1), b); + } else { + V[0].GPUCopy(b); + V[0].GPUScale(ScalarType(-1)); + } + + ScalarType beta = xIsZero ? norm0 : V[0].GPUNorm(); + + SU2_OMP_MASTER + ResetDeflation(); + END_SU2_OMP_MASTER + + if (tol_type == LinearToleranceType::RELATIVE) norm0 = beta; + + if (beta < tol * norm0 || beta < eps) { + if (masterRank) { + SU2_OMP_MASTER + cout << "CSysSolve::FGMRES_GPU(): system solved by initial guess." << endl; + END_SU2_OMP_MASTER + } + residual = beta; + x.DtHTransfer(); + return 0; + } + + V[0].GPUScale(ScalarType(-1) / beta); + g[0] = beta; + + unsigned long i = 0; + if (monitoring && masterRank) { + SU2_OMP_MASTER { + WriteHeader("FGMRES_GPU", tol, beta); + WriteHistory(i, beta / norm0); + } + END_SU2_OMP_MASTER + } + + for (i = 0; i < m; i++) { + if (beta < tol * norm0) break; + + /*--- mat_vec consumes and produces device-resident vectors on the CUDA path. ---*/ + if (flexible) { + precond(V[i], Z[i]); + mat_vec(Z[i], V[i + 1]); + } else { + mat_vec(V[i], V[i + 1]); + } + + /*--- Classical Gram-Schmidt twice, matching the CPU implementation's + * numerical structure while using GPU dot/axpy/norm primitives. ---*/ + for (unsigned long k = 0; k <= i; k++) { + H(k, i) = V[k].GPUDot(V[i + 1]); + V[i + 1].GPUAxpy(-H(k, i), V[k]); + } + + for (unsigned long k = 0; k <= i; k++) { + const ScalarType dh = V[k].GPUDot(V[i + 1]); + H(k, i) += dh; + V[i + 1].GPUAxpy(-dh, V[k]); + } + + const ScalarType nrm = V[i + 1].GPUNorm(); + if (nrm <= ScalarType(0) || nrm != nrm) { + H(i + 1, i) = ScalarType(0); + if (masterRank) { + SU2_OMP_MASTER + cout << "WARNING: FGMRES GPU orthogonalization failed, linear solver diverged." << endl; + END_SU2_OMP_MASTER + } + break; + } + + H(i + 1, i) = nrm; + V[i + 1].GPUScale(ScalarType(1) / nrm); + + for (unsigned long k = 0; k < i; k++) ApplyGivens(sn[k], cs[k], H(k, i), H(k + 1, i)); + GenerateGivens(H(i, i), H(i + 1, i), sn[i], cs[i]); + ApplyGivens(sn[i], cs[i], g[i], g[i + 1]); + + beta = fabs(g[i + 1]); + + if (monitoring && masterRank && ((i + 1) % monitorFreq == 0)) { + SU2_OMP_MASTER + WriteHistory(i + 1, beta / norm0); + END_SU2_OMP_MASTER + } + } + + SolveReduced(i, H, g, y); + + const auto& basis = flexible ? Z : V; + + for (unsigned long k = 0; k < i; k++) { + x.GPUAxpy(y[k], basis[k]); + } + + x.DtHTransfer(); + + if (monitoring && config->GetComm_Level() == COMM_FULL) { + if (masterRank) { + SU2_OMP_MASTER + WriteFinalResidual("FGMRES_GPU", i, beta / norm0); + END_SU2_OMP_MASTER + } + + if (recomputeRes) { + mat_vec(x, V[0]); + V[0].GPUAxpy(ScalarType(-1), b); + const ScalarType res = V[0].GPUNorm(); + + if (fabs(res - beta) > tol * 10) { + if (masterRank) { + SU2_OMP_MASTER + WriteWarning(beta, res, tol); + END_SU2_OMP_MASTER + } + } + } + } + + residual = beta / norm0; + return i; +} + +/*--- Explicit instantiations for the GPU solver helper. + * Keep these aligned with the scalar types instantiated for CSysSolve. ---*/ +template unsigned long CSysSolve::FGMRES_LinSolver_GPU( + const CSysVector& b, CSysVector& x, + const CMatrixVectorProduct& mat_vec, const CPreconditioner& precond, + su2mixedfloat tol, unsigned long m, su2mixedfloat& residual, bool monitoring, const CConfig* config) const; + +#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION) +template unsigned long CSysSolve::FGMRES_LinSolver_GPU( + const CSysVector& b, CSysVector& x, + const CMatrixVectorProduct& mat_vec, const CPreconditioner& precond, + passivedouble tol, unsigned long m, passivedouble& residual, bool monitoring, const CConfig* config) const; +#endif diff --git a/Common/src/linear_algebra/CSysVectorGPU.cu b/Common/src/linear_algebra/CSysVectorGPU.cu index 94ec17bb88f..d69e9fcf234 100644 --- a/Common/src/linear_algebra/CSysVectorGPU.cu +++ b/Common/src/linear_algebra/CSysVectorGPU.cu @@ -27,6 +27,86 @@ #include "../../include/linear_algebra/CSysVector.hpp" #include "../../include/linear_algebra/GPUComms.cuh" +#include +#include + +namespace { + +constexpr unsigned GPU_VEC_OP_BLOCK_SIZE = 256; + +template +__global__ void GPUCopyKernel(const ScalarType* src, ScalarType* dst, unsigned long nElm) { + const unsigned long idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < nElm) dst[idx] = src[idx]; +} + +template +__global__ void GPUScaleKernel(ScalarType alpha, ScalarType* vec, unsigned long nElm) { + const unsigned long idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < nElm) vec[idx] *= alpha; +} + +template +__global__ void GPUAxpyKernel(ScalarType alpha, const ScalarType* x, ScalarType* y, unsigned long nElm) { + const unsigned long idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < nElm) y[idx] += alpha * x[idx]; +} + +inline dim3 MakeVectorGrid(unsigned long nElm) { + const auto grid = (nElm + GPU_VEC_OP_BLOCK_SIZE - 1) / GPU_VEC_OP_BLOCK_SIZE; + return dim3(static_cast(grid), 1, 1); +} + +thread_local cublasHandle_t active_solver_blas_handle = nullptr; +thread_local unsigned active_solver_blas_depth = 0; + +cublasHandle_t GetBlasHandle(const char* creation_error, bool& owns_handle) { + if (active_solver_blas_handle != nullptr) { + owns_handle = false; + return active_solver_blas_handle; + } + + cublasHandle_t handle = nullptr; + owns_handle = true; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { + SU2_MPI::Error(creation_error, CURRENT_FUNCTION); + } + return handle; +} + +void ReleaseBlasHandle(cublasHandle_t handle, bool owns_handle, const char* destruction_error) { + if (owns_handle && handle != nullptr && cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS) { + SU2_MPI::Error(destruction_error, CURRENT_FUNCTION); + } +} + +} // namespace + +void SU2_GPU_BeginSolverBLASContext() { + if (active_solver_blas_depth == 0) { + cublasHandle_t handle = nullptr; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { + SU2_MPI::Error("cuBLAS handle creation failed for the GPU linear solver context.", CURRENT_FUNCTION); + } + active_solver_blas_handle = handle; + } + ++active_solver_blas_depth; +} + +void SU2_GPU_EndSolverBLASContext() { + if (active_solver_blas_depth == 0) { + SU2_MPI::Error("GPU linear solver BLAS context ended without a matching begin.", CURRENT_FUNCTION); + } + + --active_solver_blas_depth; + if (active_solver_blas_depth == 0) { + auto status = cublasDestroy(active_solver_blas_handle); + active_solver_blas_handle = nullptr; + if (status != CUBLAS_STATUS_SUCCESS) { + SU2_MPI::Error("cuBLAS handle destruction failed for the GPU linear solver context.", CURRENT_FUNCTION); + } + } +} template void CSysVector::HtDTransfer(bool trigger) const @@ -46,4 +126,98 @@ void CSysVector::GPUSetVal(ScalarType val, bool trigger) const if(trigger) gpuErrChk(cudaMemset((void*)(d_vec_val), val, (sizeof(ScalarType)*nElm))); } -template class CSysVector; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. +template +void CSysVector::GPUCopy(const CSysVector& src) const { + if (nElm == 0) return; + + GPUCopyKernel<<>>(src.GetDevicePointer(), GetDevicePointer(), nElm); + gpuErrChk(cudaPeekAtLastError()); +} + +template +void CSysVector::GPUScale(ScalarType alpha) const { + if (nElm == 0) return; + + GPUScaleKernel<<>>(alpha, GetDevicePointer(), nElm); + gpuErrChk(cudaPeekAtLastError()); +} + +template +void CSysVector::GPUAxpy(ScalarType alpha, const CSysVector& x) const { + if (nElm == 0) return; + + GPUAxpyKernel<<>>(alpha, x.GetDevicePointer(), GetDevicePointer(), + nElm); + gpuErrChk(cudaPeekAtLastError()); +} + +template +ScalarType CSysVector::GPUDot(const CSysVector& other) const { + bool owns_handle = false; + cublasHandle_t handle = GetBlasHandle("cuBLAS handle creation failed in CSysVector::GPUDot.", owns_handle); + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + ScalarType local_dot = ScalarType(0); + + if constexpr (std::is_same_v) { + status = cublasSdot(handle, static_cast(nElmDomain), GetDevicePointer(), 1, other.GetDevicePointer(), 1, + &local_dot); + } else if constexpr (std::is_same_v) { + status = cublasDdot(handle, static_cast(nElmDomain), GetDevicePointer(), 1, other.GetDevicePointer(), 1, + &local_dot); + } else { + ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot."); + SU2_MPI::Error("Unsupported ScalarType in CSysVector::GPUDot.", CURRENT_FUNCTION); + return ScalarType(0); + } + + if (status != CUBLAS_STATUS_SUCCESS) { + ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot."); + SU2_MPI::Error("cuBLAS dot failed in CSysVector::GPUDot.", CURRENT_FUNCTION); + return ScalarType(0); + } + + ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot."); + + ScalarType global_dot = ScalarType(0); + const auto mpi_type = (sizeof(ScalarType) < sizeof(double)) ? MPI_FLOAT : MPI_DOUBLE; + SelectMPIWrapper::W::Allreduce(&local_dot, &global_dot, 1, mpi_type, MPI_SUM, SU2_MPI::GetComm()); + + return global_dot; +} + +template +ScalarType CSysVector::GPUNorm() const { + return sqrt(GPUDot(*this)); +} + +template void CSysVector::HtDTransfer(bool trigger) const; +template void CSysVector::DtHTransfer(bool trigger) const; +template void CSysVector::GPUSetVal(su2mixedfloat val, bool trigger) const; +template void CSysVector::GPUCopy(const CSysVector& src) const; +template void CSysVector::GPUScale(su2mixedfloat alpha) const; +template void CSysVector::GPUAxpy(su2mixedfloat alpha, const CSysVector& x) const; +template su2mixedfloat CSysVector::GPUDot(const CSysVector& other) const; +template su2mixedfloat CSysVector::GPUNorm() const; + +#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION) +template void CSysVector::HtDTransfer(bool trigger) const; +template void CSysVector::DtHTransfer(bool trigger) const; +template void CSysVector::GPUSetVal(passivedouble val, bool trigger) const; +template void CSysVector::GPUCopy(const CSysVector& src) const; +template void CSysVector::GPUScale(passivedouble alpha) const; +template void CSysVector::GPUAxpy(passivedouble alpha, const CSysVector& x) const; +template passivedouble CSysVector::GPUDot(const CSysVector& other) const; +template passivedouble CSysVector::GPUNorm() const; +#endif + +#ifdef CODI_REVERSE_TYPE +template void CSysVector::HtDTransfer(bool trigger) const; +template void CSysVector::DtHTransfer(bool trigger) const; +template void CSysVector::GPUSetVal(su2double val, bool trigger) const; +template void CSysVector::GPUCopy(const CSysVector& src) const; +template void CSysVector::GPUScale(su2double alpha) const; +template void CSysVector::GPUAxpy(su2double alpha, const CSysVector& x) const; +template su2double CSysVector::GPUDot(const CSysVector& other) const; +template su2double CSysVector::GPUNorm() const; +#endif diff --git a/Common/src/linear_algebra/meson.build b/Common/src/linear_algebra/meson.build index 7b880b29c1e..29bb63cd680 100644 --- a/Common/src/linear_algebra/meson.build +++ b/Common/src/linear_algebra/meson.build @@ -6,5 +6,5 @@ common_src += files(['CSysSolve_b.cpp', 'blas_structure.cpp']) if get_option('enable-cuda') - common_src += files(['CSysMatrixGPU.cu', 'CSysVectorGPU.cu',]) + common_src += files(['CSysMatrixGPU.cu', 'CSysVectorGPU.cu', 'CSysSolveGPU.cu', 'CSysPreconditionerGPU.cu',]) endif diff --git a/Common/src/meson.build b/Common/src/meson.build index f385c4a32ed..710e764303f 100644 --- a/Common/src/meson.build +++ b/Common/src/meson.build @@ -6,6 +6,27 @@ common_src =files(['graph_coloring_structure.cpp', '../include/parallelization/mpi_structure.cpp', '../include/parallelization/omp_structure.cpp']) +common_cuda_cpp_args = [] +foreach arg : su2_cpp_args + if arg.startswith('-D') or arg.startswith('-U') + common_cuda_cpp_args += arg + endif +endforeach + +common_cuda_rev_args = common_cuda_cpp_args +foreach arg : codi_rev_args + if arg.startswith('-D') or arg.startswith('-U') + common_cuda_rev_args += arg + endif +endforeach + +common_cuda_for_args = common_cuda_cpp_args +foreach arg : codi_for_args + if arg.startswith('-D') or arg.startswith('-U') + common_cuda_for_args += arg + endif +endforeach + subdir('linear_algebra') subdir('toolboxes') subdir('geometry') @@ -24,7 +45,8 @@ if get_option('enable-normal') common_src, install : false, dependencies : su2_deps, - cpp_args: [default_warning_flags, su2_cpp_args]) + cpp_args: [default_warning_flags, su2_cpp_args], + cuda_args: common_cuda_cpp_args) common_dep = declare_dependency(link_with: common, include_directories : common_include) @@ -36,7 +58,8 @@ if get_option('enable-autodiff') common_src, install : false, dependencies : [su2_deps, codi_dep], - cpp_args: [default_warning_flags, su2_cpp_args, codi_rev_args]) + cpp_args: [default_warning_flags, su2_cpp_args, codi_rev_args], + cuda_args: common_cuda_rev_args) commonAD_dep = declare_dependency(link_with: commonAD, include_directories : common_include) @@ -49,7 +72,8 @@ if get_option('enable-directdiff') common_src, install : false, dependencies : [su2_deps, codi_dep], - cpp_args: [default_warning_flags, su2_cpp_args, codi_for_args]) + cpp_args: [default_warning_flags, su2_cpp_args, codi_for_args], + cuda_args: common_cuda_for_args) commonDD_dep = declare_dependency(link_with: commonDD, include_directories : common_include) diff --git a/meson.build b/meson.build index d43e2b031d8..43786c100e9 100644 --- a/meson.build +++ b/meson.build @@ -20,13 +20,18 @@ python = pymod.find_installation() if get_option('enable-cuda') add_languages('cuda') add_global_arguments('-arch=sm_86', language : 'cuda') - cuda_deps = [meson.get_compiler('cuda').find_library('cusparse', required : true)] + cuda_deps = [ + meson.get_compiler('cuda').find_library('cusparse', required : true), + meson.get_compiler('cuda').find_library('cublas', required : true), + ] else cuda_deps = [] endif su2_cpp_args = [] su2_deps = [declare_dependency(include_directories: 'externals/CLI11')] + cuda_deps +codi_rev_args = [] +codi_for_args = [] default_warning_flags = [] if build_machine.system() != 'windows'