diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp
index e4fc7cf159f..4eff5f947fb 100644
--- a/Common/include/linear_algebra/CPreconditioner.hpp
+++ b/Common/include/linear_algebra/CPreconditioner.hpp
@@ -77,6 +77,18 @@ class CPreconditioner {
 template <class ScalarType>
 CPreconditioner<ScalarType>::~CPreconditioner() {}
 
+/*!
+ * \class CIdentityPreconditioner
+ * \brief No-op preconditioner used when Krylov solvers run without preconditioning.
+ */
+template <class ScalarType>
+class CIdentityPreconditioner final : public CPreconditioner<ScalarType> {
+ public:
+  inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override { v = u; }
+
+  inline bool IsIdentity() const override { return true; }
+};
+
 /*!
  * \class CJacobiPreconditioner
  * \brief Specialization of preconditioner that uses CSysMatrix class.
@@ -332,6 +344,9 @@ CPreconditioner<ScalarType>* CPreconditioner<ScalarType>::Create(ENUM_LINEAR_SOL
   CPreconditioner<ScalarType>* prec = nullptr;
 
   switch (kind) {
+    case NO_PRECONDITIONER:
+      prec = new CIdentityPreconditioner<ScalarType>();
+      break;
     case JACOBI:
       prec = new CJacobiPreconditioner<ScalarType>(jacobian, geometry, config);
       break;
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
index ecb26959a06..6368670587e 100644
--- a/Common/include/linear_algebra/CSysMatrix.hpp
+++ b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -108,6 +108,16 @@ struct CSysMatrixComms {
                        MPI_QUANTITIES commType = MPI_QUANTITIES::SOLUTION_MATRIX);
 };
 
+/*!
+ * \brief Opaque storage for CUDA/cuSPARSE resources used by the GPU SpMV path.
+ */
+struct CudaSpMVResources;
+
+/*!
+ * \brief Release cached CUDA/cuSPARSE resources used by the GPU SpMV path.
+ */
+void ReleaseCudaSpMVResources(CudaSpMVResources*& resources);
+
 /*!
  * \class CSysMatrix
  * \ingroup SpLinSys
@@ -148,8 +158,10 @@ class CSysMatrix {
   ScalarType* d_matrix;           /*!< \brief Device Pointer to store the matrix values on the GPU. */
   const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
   const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
+  ScalarType* d_invM = nullptr;   /*!< \brief Device inverse diagonal blocks for the Jacobi preconditioner. */
   bool useCuda = false;           /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
                                      Mainly used to conditionally free GPU memory in the class destructor. */
+  mutable CudaSpMVResources* spmv_resources = nullptr; /*!< \brief Cached cuSPARSE resources for GPU SpMV. */
 
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -924,6 +936,13 @@ class CSysMatrix {
    */
   void BuildJacobiPreconditioner();
 
+  /*!
+   * \brief Build the Jacobi preconditioner on the GPU/device side.
+   * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
+   *       The actual implementation belongs in CSysMatrixGPU.cu.
+   */
+  void BuildJacobiPreconditionerGPU();
+
   /*!
    * \brief Multiply CSysVector by the preconditioner
    * \param[in] vec - CSysVector to be multiplied by the preconditioner.
@@ -934,6 +953,14 @@ class CSysMatrix {
   void ComputeJacobiPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
                                    const CConfig* config) const;
 
+  /*!
+   * \brief Apply the Jacobi preconditioner on the GPU/device side.
+   * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
+   *       The actual implementation belongs in CSysMatrixGPU.cu.
+   */
+  void ComputeJacobiPreconditionerGPU(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
+                                      CGeometry* geometry, const CConfig* config) const;
+
   /*!
    * \brief Build the ILU preconditioner.
    */
diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp
index 2ea3cbf7df3..67033992372 100644
--- a/Common/include/linear_algebra/CSysSolve.hpp
+++ b/Common/include/linear_algebra/CSysSolve.hpp
@@ -304,6 +304,16 @@ class CSysSolve {
                                       ScalarType& residual, bool monitoring, const CConfig* config, FgcrodrMode mode,
                                       unsigned long custom_m) const;
 
+  /*!
+   * \brief CUDA/GPU implementation of the Flexible GMRES solver.
+   * \note This helper exists so the public solver interface can remain unchanged while the
+   *       implementation is dispatched internally based on the runtime CUDA setting.
+   *       The actual implementation lives in CSysSolveGPU.cu.
+   */
+  unsigned long FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x, const ProductType& mat_vec,
+                                     const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual,
+                                     bool monitoring, const CConfig* config) const;
+
   /*!
    * \brief Creates the inner solver for nested preconditioning if the settings allow it.
    * \returns True if the inner solver can be used.
diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp
index 1498b549bbb..decffe26d29 100644
--- a/Common/include/linear_algebra/CSysVector.hpp
+++ b/Common/include/linear_algebra/CSysVector.hpp
@@ -240,6 +240,45 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
    */
   void GPUSetVal(ScalarType val, bool trigger = true) const;
 
+  /*!
+   * \brief Copy another vector into this vector on the device.
+   * \note This is an explicit GPU helper for Krylov solver implementations that keep the
+   *       iteration state resident on the device. The implementation belongs in
+   *       CSysVectorGPU.cu.
+   * \param[in] src - Source vector.
+   */
+  void GPUCopy(const CSysVector& src) const;
+
+  /*!
+   * \brief Scale this vector on the device.
+   * \note Explicit GPU helper for solver-side vector operations.
+   * \param[in] alpha - Scalar multiplier.
+   */
+  void GPUScale(ScalarType alpha) const;
+
+  /*!
+   * \brief Perform the AXPY operation on the device: this := this + alpha * x.
+   * \note Explicit GPU helper for solver-side vector operations.
+   * \param[in] alpha - Scalar multiplier.
+   * \param[in] x - Input vector.
+   */
+  void GPUAxpy(ScalarType alpha, const CSysVector& x) const;
+
+  /*!
+   * \brief Dot product between this vector and another vector on the device.
+   * \note Explicit GPU helper for solver-side reductions.
+   * \param[in] other - Input vector.
+   * \return Dot product result.
+   */
+  ScalarType GPUDot(const CSysVector& other) const;
+
+  /*!
+   * \brief L2 norm of this vector on the device.
+   * \note Explicit GPU helper for solver-side reductions.
+   * \return L2 norm result.
+   */
+  ScalarType GPUNorm() const;
+
   /*!
    * \brief return device pointer that points to the CSysVector values in GPU memory
    */
diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh
index 13854381877..eb727477f21 100644
--- a/Common/include/linear_algebra/GPUComms.cuh
+++ b/Common/include/linear_algebra/GPUComms.cuh
@@ -25,6 +25,11 @@
 * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
 */
 
+#pragma once
+
+#ifndef SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
+#define SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
+
 #include<cuda_runtime.h>
 #include<iostream>
 
@@ -51,3 +56,5 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 }
 
 #define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+
+#endif  // SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp
index 4c521c29239..65ba6dfa1ab 100644
--- a/Common/include/option_structure.hpp
+++ b/Common/include/option_structure.hpp
@@ -2517,6 +2517,7 @@ enum ENUM_LINEAR_SOLVER_PREC {
   PASTIX_ILU=10,  /*!< \brief PaStiX ILU(k) preconditioner. */
   PASTIX_LU_P,    /*!< \brief PaStiX LU as preconditioner. */
   PASTIX_LDLT_P,  /*!< \brief PaStiX LDLT as preconditioner. */
+  NO_PRECONDITIONER=20, /*!< \brief No preconditioner. */
 };
 static const MapType<std::string, ENUM_LINEAR_SOLVER_PREC> Linear_Solver_Prec_Map = {
   MakePair("JACOBI", JACOBI)
@@ -2526,6 +2527,7 @@ static const MapType<std::string, ENUM_LINEAR_SOLVER_PREC> Linear_Solver_Prec_Ma
   MakePair("PASTIX_ILU", PASTIX_ILU)
   MakePair("PASTIX_LU", PASTIX_LU_P)
   MakePair("PASTIX_LDLT", PASTIX_LDLT_P)
+  MakePair("NONE", NO_PRECONDITIONER)
 };
 
 /*!
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
index 00ef51c2023..fafa093734e 100644
--- a/Common/src/linear_algebra/CSysMatrix.cpp
+++ b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -54,6 +54,7 @@ CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G
   col_ind_ilu = nullptr;
 
   invM = nullptr;
+  d_invM = nullptr;
 
 #ifdef USE_MKL
   MatrixMatrixProductJitter = nullptr;
@@ -67,6 +68,8 @@ template <class ScalarType>
 CSysMatrix<ScalarType>::~CSysMatrix() {
   SU2_ZONE_SCOPED
 
+  ReleaseCudaSpMVResources(spmv_resources);
+
   delete[] omp_partitions;
   MemoryAllocation::aligned_free(ILU_matrix);
   MemoryAllocation::aligned_free(matrix);
@@ -76,6 +79,7 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
     GPUMemoryAllocation::gpu_free(d_matrix);
     GPUMemoryAllocation::gpu_free(d_row_ptr);
     GPUMemoryAllocation::gpu_free(d_col_ind);
+    GPUMemoryAllocation::gpu_free(d_invM);
   }
 
 #ifdef USE_MKL
@@ -86,6 +90,10 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
 #endif
 }
 
+#ifndef HAVE_CUDA
+void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) { resources = nullptr; }
+#endif
+
 template <class ScalarType>
 void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npointdomain, unsigned short nvar,
                                         unsigned short neqn, bool EdgeConnect, CGeometry* geometry,
@@ -196,6 +204,10 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
 
   if (diag_needed) allocAndInit(invM, nPointDomain * nVar * nEqn);
 
+  if (useCuda && diag_needed) {
+    d_invM = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(nPointDomain * nVar * nEqn * sizeof(ScalarType));
+  }
+
   /*--- Thread parallel initialization. ---*/
 
   int num_threads = omp_get_max_threads();
@@ -672,6 +684,19 @@ void CSysMatrix<ScalarType>::MatrixVectorProduct(const CSysVector<ScalarType>& v
 template <class ScalarType>
 void CSysMatrix<ScalarType>::BuildJacobiPreconditioner() {
   SU2_ZONE_SCOPED
+
+  if (useCuda) {
+#ifdef HAVE_CUDA
+    BuildJacobiPreconditionerGPU();
+    return;
+#else
+    SU2_MPI::Error(
+        "\nError in building Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options "
+        "enabled in Meson to access GPU Functions",
+        CURRENT_FUNCTION);
+#endif
+  }
+
   /*--- Build Jacobi preconditioner (M = D), compute and store the inverses of the diagonal blocks. ---*/
   SU2_OMP_FOR_DYN(omp_heavy_size)
   for (unsigned long iPoint = 0; iPoint < nPointDomain; iPoint++)
@@ -684,6 +709,19 @@ void CSysMatrix<ScalarType>::ComputeJacobiPreconditioner(const CSysVector<Scalar
                                                          CSysVector<ScalarType>& prod, CGeometry* geometry,
                                                          const CConfig* config) const {
   SU2_ZONE_SCOPED
+
+  if (config->GetCUDA()) {
+#ifdef HAVE_CUDA
+    ComputeJacobiPreconditionerGPU(vec, prod, geometry, config);
+    return;
+#else
+    SU2_MPI::Error(
+        "\nError in applying Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options "
+        "enabled in Meson to access GPU Functions",
+        CURRENT_FUNCTION);
+#endif
+  }
+
   /*--- Apply Jacobi preconditioner, y = D^{-1} * x, the inverse of the diagonal is already known. ---*/
   SU2_OMP_BARRIER
   SU2_OMP_FOR_DYN(omp_heavy_size)
diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu
index a3f1a77ca40..b8bb16ce5a9 100644
--- a/Common/src/linear_algebra/CSysMatrixGPU.cu
+++ b/Common/src/linear_algebra/CSysMatrixGPU.cu
@@ -55,6 +55,38 @@ inline cusparseIndexType_t GetCusparseIndexType() {
   }
 }
 
+struct CudaSpMVResources {
+  cusparseHandle_t handle = nullptr;
+  cusparseConstSpMatDescr_t mat = nullptr;
+  size_t buffer_size = 0;
+  void* buffer = nullptr;
+};
+
+void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) {
+  if (resources == nullptr) {
+    return;
+  }
+
+  if (resources->buffer != nullptr) {
+    gpuErrChk(cudaFree(resources->buffer));
+    resources->buffer = nullptr;
+    resources->buffer_size = 0;
+  }
+
+  if (resources->mat != nullptr) {
+    cusparseErrChk(cusparseDestroySpMat(resources->mat));
+    resources->mat = nullptr;
+  }
+
+  if (resources->handle != nullptr) {
+    cusparseErrChk(cusparseDestroy(resources->handle));
+    resources->handle = nullptr;
+  }
+
+  delete resources;
+  resources = nullptr;
+}
+
 template <class ScalarType>
 constexpr cudaDataType GetCudaDataType() {
   if constexpr (std::is_same<ScalarType, float>::value) {
@@ -83,8 +115,6 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
   ScalarType* d_vec = vec.GetDevicePointer();
   ScalarType* d_prod = prod.GetDevicePointer();
 
-  vec.HtDTransfer();
-
   const auto indexType = GetCusparseIndexType();
   const auto valueType = GetCudaDataType<ScalarType>();
 
@@ -100,42 +130,56 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
   const ScalarType alpha = 1.0;
   const ScalarType beta = 0.0;
 
-  cusparseHandle_t handle = nullptr;
-  cusparseConstSpMatDescr_t matA = nullptr;
-  cusparseDnVecDescr_t vecX = nullptr;
-  cusparseDnVecDescr_t vecY = nullptr;
+  if (spmv_resources == nullptr) {
+    spmv_resources = new CudaSpMVResources;
 
-  cusparseErrChk(cusparseCreate(&handle));
+    cusparseErrChk(cusparseCreate(&spmv_resources->handle));
 
-  cusparseErrChk(cusparseCreateConstBsr(&matA, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, d_col_ind, d_matrix,
-                                        indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, valueType, CUSPARSE_ORDER_ROW));
+    cusparseErrChk(cusparseCreateConstBsr(&spmv_resources->mat, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr,
+                                          d_col_ind, d_matrix, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO,
+                                          valueType, CUSPARSE_ORDER_ROW));
+  }
+
+  cusparseDnVecDescr_t vecX = nullptr;
+  cusparseDnVecDescr_t vecY = nullptr;
 
   cusparseErrChk(cusparseCreateDnVec(&vecX, xSize, d_vec, valueType));
   cusparseErrChk(cusparseCreateDnVec(&vecY, ySize, d_prod, valueType));
 
-  size_t bufferSize = 0;
-  void* dBuffer = nullptr;
+  size_t required_buffer_size = 0;
 
-  cusparseErrChk(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
-                                         valueType, CUSPARSE_SPMV_BSR_ALG1, &bufferSize));
+  cusparseErrChk(cusparseSpMV_bufferSize(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+                                         spmv_resources->mat, vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1,
+                                         &required_buffer_size));
 
-  if (bufferSize > 0) {
-    gpuErrChk(cudaMalloc(&dBuffer, bufferSize));
-  }
+  if (required_buffer_size > spmv_resources->buffer_size) {
+    if (spmv_resources->buffer != nullptr) {
+      gpuErrChk(cudaFree(spmv_resources->buffer));
+    }
 
-  cusparseErrChk(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, valueType,
-                              CUSPARSE_SPMV_BSR_ALG1, dBuffer));
+    if (required_buffer_size > 0) {
+      gpuErrChk(cudaMalloc(&spmv_resources->buffer, required_buffer_size));
+    } else {
+      spmv_resources->buffer = nullptr;
+    }
 
-  if (dBuffer != nullptr) {
-    gpuErrChk(cudaFree(dBuffer));
+    spmv_resources->buffer_size = required_buffer_size;
   }
 
+  cusparseErrChk(cusparseSpMV(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmv_resources->mat,
+                              vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1, spmv_resources->buffer));
+
   cusparseErrChk(cusparseDestroyDnVec(vecY));
   cusparseErrChk(cusparseDestroyDnVec(vecX));
-  cusparseErrChk(cusparseDestroySpMat(matA));
-  cusparseErrChk(cusparseDestroy(handle));
-
-  prod.DtHTransfer();
 }
-
-template class CSysMatrix<su2mixedfloat>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
+template void CSysMatrix<su2mixedfloat>::HtDTransfer(bool trigger) const;
+template void CSysMatrix<su2mixedfloat>::GPUMatrixVectorProduct(const CSysVector<su2mixedfloat>& vec,
+                                                                CSysVector<su2mixedfloat>& prod, CGeometry* geometry,
+                                                                const CConfig* config) const;
+
+#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
+template void CSysMatrix<passivedouble>::HtDTransfer(bool trigger) const;
+template void CSysMatrix<passivedouble>::GPUMatrixVectorProduct(const CSysVector<passivedouble>& vec,
+                                                                CSysVector<passivedouble>& prod, CGeometry* geometry,
+                                                                const CConfig* config) const;
+#endif
diff --git a/Common/src/linear_algebra/CSysPreconditionerGPU.cu b/Common/src/linear_algebra/CSysPreconditionerGPU.cu
new file mode 100644
index 00000000000..adee93264eb
--- /dev/null
+++ b/Common/src/linear_algebra/CSysPreconditionerGPU.cu
@@ -0,0 +1,109 @@
+/*!
+ * \file CSysPreconditionerGPU.cu
+ * \brief CUDA/GPU skeleton implementations for matrix-based preconditioners.
+ * \author Jesse Li
+ * \version 8.5.0 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../../include/linear_algebra/CSysMatrix.inl"
+#include "../../include/linear_algebra/GPUComms.cuh"
+
+namespace {
+
+template <class ScalarType>
+__global__ void ApplyJacobiPreconditionerKernel(const ScalarType* invM, const ScalarType* vec, ScalarType* prod,
+                                                unsigned long nPointDomain, unsigned long nVar) {
+  const auto iPoint = static_cast<unsigned long>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (iPoint >= nPointDomain) return;
+
+  const auto block = &invM[iPoint * nVar * nVar];
+  const auto rhs = &vec[iPoint * nVar];
+  auto out = &prod[iPoint * nVar];
+
+  for (auto iVar = 0ul; iVar < nVar; ++iVar) {
+    ScalarType sum = ScalarType(0);
+    for (auto jVar = 0ul; jVar < nVar; ++jVar) {
+      sum += block[iVar * nVar + jVar] * rhs[jVar];
+    }
+    out[iVar] = sum;
+  }
+}
+
+}  // namespace
+
+template <class ScalarType>
+void CSysMatrix<ScalarType>::BuildJacobiPreconditionerGPU() {
+  SU2_ZONE_SCOPED
+
+  if (nVar != nEqn) {
+    SU2_MPI::Error("CUDA Jacobi preconditioner requires square blocks.", CURRENT_FUNCTION);
+  }
+
+  if (invM == nullptr) {
+    SU2_MPI::Error("CUDA Jacobi preconditioner was requested without host inverse block storage.", CURRENT_FUNCTION);
+  }
+
+  if (d_invM == nullptr) {
+    d_invM = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(nPointDomain * nVar * nEqn * sizeof(ScalarType));
+  }
+
+  for (auto iPoint = 0ul; iPoint < nPointDomain; ++iPoint) {
+    InverseDiagonalBlock(iPoint, &(invM[iPoint * nVar * nVar]));
+  }
+
+  gpuErrChk(cudaMemcpy(d_invM, invM, nPointDomain * nVar * nVar * sizeof(ScalarType), cudaMemcpyHostToDevice));
+}
+
+template <class ScalarType>
+void CSysMatrix<ScalarType>::ComputeJacobiPreconditionerGPU(const CSysVector<ScalarType>& vec,
+                                                            CSysVector<ScalarType>& prod, CGeometry* geometry,
+                                                            const CConfig* config) const {
+  (void)geometry;
+  (void)config;
+
+  SU2_ZONE_SCOPED
+
+  if (d_invM == nullptr) {
+    SU2_MPI::Error("CUDA Jacobi preconditioner used before BuildJacobiPreconditionerGPU.", CURRENT_FUNCTION);
+  }
+
+  constexpr unsigned threadsPerBlock = 128;
+  const auto blocks = static_cast<unsigned>((nPointDomain + threadsPerBlock - 1) / threadsPerBlock);
+  ApplyJacobiPreconditionerKernel<<<blocks, threadsPerBlock>>>(d_invM, vec.GetDevicePointer(), prod.GetDevicePointer(),
+                                                               nPointDomain, nVar);
+  gpuErrChk(cudaPeekAtLastError());
+}
+
+template void CSysMatrix<su2mixedfloat>::BuildJacobiPreconditionerGPU();
+template void CSysMatrix<su2mixedfloat>::ComputeJacobiPreconditionerGPU(const CSysVector<su2mixedfloat>& vec,
+                                                                        CSysVector<su2mixedfloat>& prod,
+                                                                        CGeometry* geometry,
+                                                                        const CConfig* config) const;
+
+#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
+template void CSysMatrix<passivedouble>::BuildJacobiPreconditionerGPU();
+template void CSysMatrix<passivedouble>::ComputeJacobiPreconditionerGPU(const CSysVector<passivedouble>& vec,
+                                                                        CSysVector<passivedouble>& prod,
+                                                                        CGeometry* geometry,
+                                                                        const CConfig* config) const;
+#endif
diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index cd100f311ab..3eba623f051 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -421,6 +421,17 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
                                                       const CConfig* config) const {
   SU2_ZONE_SCOPED
 
+  if (config->GetCUDA()) {
+#ifdef HAVE_CUDA
+    return FGMRES_LinSolver_GPU(b, x, mat_vec, precond, tol, m, residual, monitoring, config);
+#else
+    SU2_MPI::Error(
+        "\nError in launching FGMRES solver\nENABLE_CUDA is set to YES\nPlease compile with CUDA options enabled "
+        "in Meson to access GPU Functions",
+        CURRENT_FUNCTION);
+#endif
+  }
+
   const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
   const bool flexible = !precond.IsIdentity();
   /*--- If we call the solver outside of a parallel region, but the number of threads allows,
diff --git a/Common/src/linear_algebra/CSysSolveGPU.cu b/Common/src/linear_algebra/CSysSolveGPU.cu
new file mode 100644
index 00000000000..cb1ead10d89
--- /dev/null
+++ b/Common/src/linear_algebra/CSysSolveGPU.cu
@@ -0,0 +1,242 @@
+/*!
+ * \file CSysSolveGPU.cu
+ * \brief CUDA/GPU skeleton implementations for Krylov solvers.
+ * \author Jesse Li
+ * \version 8.5.0 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2026, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../../include/linear_algebra/CSysSolve.hpp"
+#include "../../include/linear_algebra/CMatrixVectorProduct.hpp"
+#include "../../include/linear_algebra/CPreconditioner.hpp"
+#include "../../include/linear_algebra/GPUComms.cuh"
+
+#include <cmath>
+#include <iostream>
+
+void SU2_GPU_BeginSolverBLASContext();
+void SU2_GPU_EndSolverBLASContext();
+
+namespace {
+
+class CGPUSolverBLASContextGuard {
+ public:
+  CGPUSolverBLASContextGuard() { SU2_GPU_BeginSolverBLASContext(); }
+
+  ~CGPUSolverBLASContextGuard() { SU2_GPU_EndSolverBLASContext(); }
+
+  CGPUSolverBLASContextGuard(const CGPUSolverBLASContextGuard&) = delete;
+  CGPUSolverBLASContextGuard& operator=(const CGPUSolverBLASContextGuard&) = delete;
+};
+
+}  // namespace
+
+template <class ScalarType>
+unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x,
+                                                          const ProductType& mat_vec, const PrecondType& precond,
+                                                          ScalarType tol, unsigned long m, ScalarType& residual,
+                                                          bool monitoring, const CConfig* config) const {
+  SU2_ZONE_SCOPED
+
+  const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
+  const bool flexible = !precond.IsIdentity();
+
+  if (m < 1) {
+    SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION);
+  }
+
+  if (m > 1000) {
+    SU2_MPI::Error("FGMRES subspace is too large (>1000).", CURRENT_FUNCTION);
+  }
+
+  CGPUSolverBLASContextGuard blas_context;
+  if (V.size() <= m || (flexible && Z.size() <= m)) {
+    BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
+      V.resize(m + 1);
+      for (auto& w : V) w.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
+      if (flexible) {
+        Z.resize(m + 1);
+        for (auto& z : Z) z.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
+      }
+    }
+    END_SU2_OMP_SAFE_GLOBAL_ACCESS
+  }
+
+  su2vector<ScalarType> g(m + 1), sn(m + 1), cs(m + 1), y(m);
+  g = ScalarType(0);
+  sn = ScalarType(0);
+  cs = ScalarType(0);
+  y = ScalarType(0);
+
+  su2matrix<ScalarType> H(m + 1, m);
+  H = ScalarType(0);
+
+  /*--- b and x enter from the host side. The solver owns all subsequent
+   * synchronization for vectors it updates with GPU primitives. ---*/
+  b.HtDTransfer();
+  if (xIsZero) {
+    x.GPUSetVal(ScalarType(0));
+  } else {
+    x.HtDTransfer();
+  }
+
+  ScalarType norm0 = b.GPUNorm();
+
+  if (!xIsZero) {
+    mat_vec(x, V[0]);
+    V[0].GPUAxpy(ScalarType(-1), b);
+  } else {
+    V[0].GPUCopy(b);
+    V[0].GPUScale(ScalarType(-1));
+  }
+
+  ScalarType beta = xIsZero ? norm0 : V[0].GPUNorm();
+
+  SU2_OMP_MASTER
+  ResetDeflation();
+  END_SU2_OMP_MASTER
+
+  if (tol_type == LinearToleranceType::RELATIVE) norm0 = beta;
+
+  if (beta < tol * norm0 || beta < eps) {
+    if (masterRank) {
+      SU2_OMP_MASTER
+      cout << "CSysSolve::FGMRES_GPU(): system solved by initial guess." << endl;
+      END_SU2_OMP_MASTER
+    }
+    residual = beta;
+    x.DtHTransfer();
+    return 0;
+  }
+
+  V[0].GPUScale(ScalarType(-1) / beta);
+  g[0] = beta;
+
+  unsigned long i = 0;
+  if (monitoring && masterRank) {
+    SU2_OMP_MASTER {
+      WriteHeader("FGMRES_GPU", tol, beta);
+      WriteHistory(i, beta / norm0);
+    }
+    END_SU2_OMP_MASTER
+  }
+
+  for (i = 0; i < m; i++) {
+    if (beta < tol * norm0) break;
+
+    /*--- mat_vec consumes and produces device-resident vectors on the CUDA path. ---*/
+    if (flexible) {
+      precond(V[i], Z[i]);
+      mat_vec(Z[i], V[i + 1]);
+    } else {
+      mat_vec(V[i], V[i + 1]);
+    }
+
+    /*--- Classical Gram-Schmidt twice, matching the CPU implementation's
+     * numerical structure while using GPU dot/axpy/norm primitives. ---*/
+    for (unsigned long k = 0; k <= i; k++) {
+      H(k, i) = V[k].GPUDot(V[i + 1]);
+      V[i + 1].GPUAxpy(-H(k, i), V[k]);
+    }
+
+    for (unsigned long k = 0; k <= i; k++) {
+      const ScalarType dh = V[k].GPUDot(V[i + 1]);
+      H(k, i) += dh;
+      V[i + 1].GPUAxpy(-dh, V[k]);
+    }
+
+    const ScalarType nrm = V[i + 1].GPUNorm();
+    if (nrm <= ScalarType(0) || nrm != nrm) {
+      H(i + 1, i) = ScalarType(0);
+      if (masterRank) {
+        SU2_OMP_MASTER
+        cout << "WARNING: FGMRES GPU orthogonalization failed, linear solver diverged." << endl;
+        END_SU2_OMP_MASTER
+      }
+      break;
+    }
+
+    H(i + 1, i) = nrm;
+    V[i + 1].GPUScale(ScalarType(1) / nrm);
+
+    for (unsigned long k = 0; k < i; k++) ApplyGivens(sn[k], cs[k], H(k, i), H(k + 1, i));
+    GenerateGivens(H(i, i), H(i + 1, i), sn[i], cs[i]);
+    ApplyGivens(sn[i], cs[i], g[i], g[i + 1]);
+
+    beta = fabs(g[i + 1]);
+
+    if (monitoring && masterRank && ((i + 1) % monitorFreq == 0)) {
+      SU2_OMP_MASTER
+      WriteHistory(i + 1, beta / norm0);
+      END_SU2_OMP_MASTER
+    }
+  }
+
+  SolveReduced(i, H, g, y);
+
+  const auto& basis = flexible ? Z : V;
+
+  for (unsigned long k = 0; k < i; k++) {
+    x.GPUAxpy(y[k], basis[k]);
+  }
+
+  x.DtHTransfer();
+
+  if (monitoring && config->GetComm_Level() == COMM_FULL) {
+    if (masterRank) {
+      SU2_OMP_MASTER
+      WriteFinalResidual("FGMRES_GPU", i, beta / norm0);
+      END_SU2_OMP_MASTER
+    }
+
+    if (recomputeRes) {
+      mat_vec(x, V[0]);
+      V[0].GPUAxpy(ScalarType(-1), b);
+      const ScalarType res = V[0].GPUNorm();
+
+      if (fabs(res - beta) > tol * 10) {
+        if (masterRank) {
+          SU2_OMP_MASTER
+          WriteWarning(beta, res, tol);
+          END_SU2_OMP_MASTER
+        }
+      }
+    }
+  }
+
+  residual = beta / norm0;
+  return i;
+}
+
+/*--- Explicit instantiations for the GPU solver helper.
+ * Keep these aligned with the scalar types instantiated for CSysSolve. ---*/
+template unsigned long CSysSolve<su2mixedfloat>::FGMRES_LinSolver_GPU(
+    const CSysVector<su2mixedfloat>& b, CSysVector<su2mixedfloat>& x,
+    const CMatrixVectorProduct<su2mixedfloat>& mat_vec, const CPreconditioner<su2mixedfloat>& precond,
+    su2mixedfloat tol, unsigned long m, su2mixedfloat& residual, bool monitoring, const CConfig* config) const;
+
+#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
+template unsigned long CSysSolve<passivedouble>::FGMRES_LinSolver_GPU(
+    const CSysVector<passivedouble>& b, CSysVector<passivedouble>& x,
+    const CMatrixVectorProduct<passivedouble>& mat_vec, const CPreconditioner<passivedouble>& precond,
+    passivedouble tol, unsigned long m, passivedouble& residual, bool monitoring, const CConfig* config) const;
+#endif
diff --git a/Common/src/linear_algebra/CSysVectorGPU.cu b/Common/src/linear_algebra/CSysVectorGPU.cu
index 94ec17bb88f..d69e9fcf234 100644
--- a/Common/src/linear_algebra/CSysVectorGPU.cu
+++ b/Common/src/linear_algebra/CSysVectorGPU.cu
@@ -27,6 +27,86 @@
 
 #include "../../include/linear_algebra/CSysVector.hpp"
 #include "../../include/linear_algebra/GPUComms.cuh"
+#include <cublas_v2.h>
+#include <type_traits>
+
+namespace {
+
+constexpr unsigned GPU_VEC_OP_BLOCK_SIZE = 256;
+
+template <class ScalarType>
+__global__ void GPUCopyKernel(const ScalarType* src, ScalarType* dst, unsigned long nElm) {
+  const unsigned long idx = static_cast<unsigned long>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (idx < nElm) dst[idx] = src[idx];
+}
+
+template <class ScalarType>
+__global__ void GPUScaleKernel(ScalarType alpha, ScalarType* vec, unsigned long nElm) {
+  const unsigned long idx = static_cast<unsigned long>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (idx < nElm) vec[idx] *= alpha;
+}
+
+template <class ScalarType>
+__global__ void GPUAxpyKernel(ScalarType alpha, const ScalarType* x, ScalarType* y, unsigned long nElm) {
+  const unsigned long idx = static_cast<unsigned long>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (idx < nElm) y[idx] += alpha * x[idx];
+}
+
+inline dim3 MakeVectorGrid(unsigned long nElm) {
+  const auto grid = (nElm + GPU_VEC_OP_BLOCK_SIZE - 1) / GPU_VEC_OP_BLOCK_SIZE;
+  return dim3(static_cast<unsigned>(grid), 1, 1);
+}
+
+thread_local cublasHandle_t active_solver_blas_handle = nullptr;
+thread_local unsigned active_solver_blas_depth = 0;
+
+cublasHandle_t GetBlasHandle(const char* creation_error, bool& owns_handle) {
+  if (active_solver_blas_handle != nullptr) {
+    owns_handle = false;
+    return active_solver_blas_handle;
+  }
+
+  cublasHandle_t handle = nullptr;
+  owns_handle = true;
+  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
+    SU2_MPI::Error(creation_error, CURRENT_FUNCTION);
+  }
+  return handle;
+}
+
+void ReleaseBlasHandle(cublasHandle_t handle, bool owns_handle, const char* destruction_error) {
+  if (owns_handle && handle != nullptr && cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS) {
+    SU2_MPI::Error(destruction_error, CURRENT_FUNCTION);
+  }
+}
+
+}  // namespace
+
+void SU2_GPU_BeginSolverBLASContext() {
+  if (active_solver_blas_depth == 0) {
+    cublasHandle_t handle = nullptr;
+    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
+      SU2_MPI::Error("cuBLAS handle creation failed for the GPU linear solver context.", CURRENT_FUNCTION);
+    }
+    active_solver_blas_handle = handle;
+  }
+  ++active_solver_blas_depth;
+}
+
+void SU2_GPU_EndSolverBLASContext() {
+  if (active_solver_blas_depth == 0) {
+    SU2_MPI::Error("GPU linear solver BLAS context ended without a matching begin.", CURRENT_FUNCTION);
+  }
+
+  --active_solver_blas_depth;
+  if (active_solver_blas_depth == 0) {
+    auto status = cublasDestroy(active_solver_blas_handle);
+    active_solver_blas_handle = nullptr;
+    if (status != CUBLAS_STATUS_SUCCESS) {
+      SU2_MPI::Error("cuBLAS handle destruction failed for the GPU linear solver context.", CURRENT_FUNCTION);
+    }
+  }
+}
 
 template<class ScalarType>
 void CSysVector<ScalarType>::HtDTransfer(bool trigger) const
@@ -46,4 +126,98 @@ void CSysVector<ScalarType>::GPUSetVal(ScalarType val, bool trigger) const
    if(trigger) gpuErrChk(cudaMemset((void*)(d_vec_val), val, (sizeof(ScalarType)*nElm)));
 }
 
-template class CSysVector<su2double>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
+template <class ScalarType>
+void CSysVector<ScalarType>::GPUCopy(const CSysVector& src) const {
+  if (nElm == 0) return;
+
+  GPUCopyKernel<<<MakeVectorGrid(nElm), GPU_VEC_OP_BLOCK_SIZE>>>(src.GetDevicePointer(), GetDevicePointer(), nElm);
+  gpuErrChk(cudaPeekAtLastError());
+}
+
+template <class ScalarType>
+void CSysVector<ScalarType>::GPUScale(ScalarType alpha) const {
+  if (nElm == 0) return;
+
+  GPUScaleKernel<<<MakeVectorGrid(nElm), GPU_VEC_OP_BLOCK_SIZE>>>(alpha, GetDevicePointer(), nElm);
+  gpuErrChk(cudaPeekAtLastError());
+}
+
+template <class ScalarType>
+void CSysVector<ScalarType>::GPUAxpy(ScalarType alpha, const CSysVector& x) const {
+  if (nElm == 0) return;
+
+  GPUAxpyKernel<<<MakeVectorGrid(nElm), GPU_VEC_OP_BLOCK_SIZE>>>(alpha, x.GetDevicePointer(), GetDevicePointer(),
+                                                                 nElm);
+  gpuErrChk(cudaPeekAtLastError());
+}
+
+template <class ScalarType>
+ScalarType CSysVector<ScalarType>::GPUDot(const CSysVector& other) const {
+  bool owns_handle = false;
+  cublasHandle_t handle = GetBlasHandle("cuBLAS handle creation failed in CSysVector::GPUDot.", owns_handle);
+  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
+
+  ScalarType local_dot = ScalarType(0);
+
+  if constexpr (std::is_same_v<ScalarType, float>) {
+    status = cublasSdot(handle, static_cast<int>(nElmDomain), GetDevicePointer(), 1, other.GetDevicePointer(), 1,
+                        &local_dot);
+  } else if constexpr (std::is_same_v<ScalarType, double>) {
+    status = cublasDdot(handle, static_cast<int>(nElmDomain), GetDevicePointer(), 1, other.GetDevicePointer(), 1,
+                        &local_dot);
+  } else {
+    ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot.");
+    SU2_MPI::Error("Unsupported ScalarType in CSysVector::GPUDot.", CURRENT_FUNCTION);
+    return ScalarType(0);
+  }
+
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot.");
+    SU2_MPI::Error("cuBLAS dot failed in CSysVector::GPUDot.", CURRENT_FUNCTION);
+    return ScalarType(0);
+  }
+
+  ReleaseBlasHandle(handle, owns_handle, "cuBLAS handle destruction failed in CSysVector::GPUDot.");
+
+  ScalarType global_dot = ScalarType(0);
+  const auto mpi_type = (sizeof(ScalarType) < sizeof(double)) ? MPI_FLOAT : MPI_DOUBLE;
+  SelectMPIWrapper<ScalarType>::W::Allreduce(&local_dot, &global_dot, 1, mpi_type, MPI_SUM, SU2_MPI::GetComm());
+
+  return global_dot;
+}
+
+template <class ScalarType>
+ScalarType CSysVector<ScalarType>::GPUNorm() const {
+  return sqrt(GPUDot(*this));
+}
+
+template void CSysVector<su2mixedfloat>::HtDTransfer(bool trigger) const;
+template void CSysVector<su2mixedfloat>::DtHTransfer(bool trigger) const;
+template void CSysVector<su2mixedfloat>::GPUSetVal(su2mixedfloat val, bool trigger) const;
+template void CSysVector<su2mixedfloat>::GPUCopy(const CSysVector<su2mixedfloat>& src) const;
+template void CSysVector<su2mixedfloat>::GPUScale(su2mixedfloat alpha) const;
+template void CSysVector<su2mixedfloat>::GPUAxpy(su2mixedfloat alpha, const CSysVector<su2mixedfloat>& x) const;
+template su2mixedfloat CSysVector<su2mixedfloat>::GPUDot(const CSysVector<su2mixedfloat>& other) const;
+template su2mixedfloat CSysVector<su2mixedfloat>::GPUNorm() const;
+
+#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
+template void CSysVector<passivedouble>::HtDTransfer(bool trigger) const;
+template void CSysVector<passivedouble>::DtHTransfer(bool trigger) const;
+template void CSysVector<passivedouble>::GPUSetVal(passivedouble val, bool trigger) const;
+template void CSysVector<passivedouble>::GPUCopy(const CSysVector<passivedouble>& src) const;
+template void CSysVector<passivedouble>::GPUScale(passivedouble alpha) const;
+template void CSysVector<passivedouble>::GPUAxpy(passivedouble alpha, const CSysVector<passivedouble>& x) const;
+template passivedouble CSysVector<passivedouble>::GPUDot(const CSysVector<passivedouble>& other) const;
+template passivedouble CSysVector<passivedouble>::GPUNorm() const;
+#endif
+
+#ifdef CODI_REVERSE_TYPE
+template void CSysVector<su2double>::HtDTransfer(bool trigger) const;
+template void CSysVector<su2double>::DtHTransfer(bool trigger) const;
+template void CSysVector<su2double>::GPUSetVal(su2double val, bool trigger) const;
+template void CSysVector<su2double>::GPUCopy(const CSysVector<su2double>& src) const;
+template void CSysVector<su2double>::GPUScale(su2double alpha) const;
+template void CSysVector<su2double>::GPUAxpy(su2double alpha, const CSysVector<su2double>& x) const;
+template su2double CSysVector<su2double>::GPUDot(const CSysVector<su2double>& other) const;
+template su2double CSysVector<su2double>::GPUNorm() const;
+#endif
diff --git a/Common/src/linear_algebra/meson.build b/Common/src/linear_algebra/meson.build
index 7b880b29c1e..29bb63cd680 100644
--- a/Common/src/linear_algebra/meson.build
+++ b/Common/src/linear_algebra/meson.build
@@ -6,5 +6,5 @@ common_src += files(['CSysSolve_b.cpp',
                      'blas_structure.cpp'])
 
   if get_option('enable-cuda')
-    common_src += files(['CSysMatrixGPU.cu', 'CSysVectorGPU.cu',])
+    common_src += files(['CSysMatrixGPU.cu', 'CSysVectorGPU.cu', 'CSysSolveGPU.cu', 'CSysPreconditionerGPU.cu',])
 endif
diff --git a/Common/src/meson.build b/Common/src/meson.build
index f385c4a32ed..710e764303f 100644
--- a/Common/src/meson.build
+++ b/Common/src/meson.build
@@ -6,6 +6,27 @@ common_src =files(['graph_coloring_structure.cpp',
            '../include/parallelization/mpi_structure.cpp',
            '../include/parallelization/omp_structure.cpp'])
 
+common_cuda_cpp_args = []
+foreach arg : su2_cpp_args
+  if arg.startswith('-D') or arg.startswith('-U')
+    common_cuda_cpp_args += arg
+  endif
+endforeach
+
+common_cuda_rev_args = common_cuda_cpp_args
+foreach arg : codi_rev_args
+  if arg.startswith('-D') or arg.startswith('-U')
+    common_cuda_rev_args += arg
+  endif
+endforeach
+
+common_cuda_for_args = common_cuda_cpp_args
+foreach arg : codi_for_args
+  if arg.startswith('-D') or arg.startswith('-U')
+    common_cuda_for_args += arg
+  endif
+endforeach
+
 subdir('linear_algebra')
 subdir('toolboxes')
 subdir('geometry')
@@ -24,7 +45,8 @@ if get_option('enable-normal')
                           common_src,
                           install : false,
                           dependencies : su2_deps,
-                          cpp_args: [default_warning_flags, su2_cpp_args])
+                          cpp_args: [default_warning_flags, su2_cpp_args],
+                          cuda_args: common_cuda_cpp_args)
 
   common_dep = declare_dependency(link_with: common,
                                   include_directories : common_include)
@@ -36,7 +58,8 @@ if get_option('enable-autodiff')
                             common_src,
                             install : false,
                             dependencies : [su2_deps, codi_dep],
-                            cpp_args: [default_warning_flags, su2_cpp_args, codi_rev_args])
+                            cpp_args: [default_warning_flags, su2_cpp_args, codi_rev_args],
+                            cuda_args: common_cuda_rev_args)
 
   commonAD_dep = declare_dependency(link_with: commonAD,
                                     include_directories : common_include)
@@ -49,7 +72,8 @@ if get_option('enable-directdiff')
                             common_src,
                             install : false,
                             dependencies : [su2_deps, codi_dep],
-                            cpp_args: [default_warning_flags, su2_cpp_args, codi_for_args])
+                            cpp_args: [default_warning_flags, su2_cpp_args, codi_for_args],
+                            cuda_args: common_cuda_for_args)
 
   commonDD_dep = declare_dependency(link_with: commonDD,
                                     include_directories : common_include)
diff --git a/meson.build b/meson.build
index d43e2b031d8..43786c100e9 100644
--- a/meson.build
+++ b/meson.build
@@ -20,13 +20,18 @@ python = pymod.find_installation()
 if get_option('enable-cuda')
   add_languages('cuda')
   add_global_arguments('-arch=sm_86', language : 'cuda')
-  cuda_deps = [meson.get_compiler('cuda').find_library('cusparse', required : true)]
+  cuda_deps = [
+    meson.get_compiler('cuda').find_library('cusparse', required : true),
+    meson.get_compiler('cuda').find_library('cublas', required : true),
+  ]
 else
   cuda_deps = []
 endif
 
 su2_cpp_args = []
 su2_deps     = [declare_dependency(include_directories: 'externals/CLI11')] + cuda_deps
+codi_rev_args = []
+codi_for_args = []
 
 default_warning_flags = []
 if build_machine.system() != 'windows'