diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 90d3e7b..8ec2c21 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -13,6 +13,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest] + cuda: ["12.4.0", "12.5.0", "12.6.0", "12.8.0", "12.9.0", "13.0.0", "13.1.0"] runs-on: ${{ matrix.os }} steps: @@ -21,7 +22,8 @@ jobs: - uses: Jimver/cuda-toolkit@v0.2.30 id: cuda-toolkit with: - cuda: "13.1.0" + cuda: ${{ matrix.cuda }} + linux-local-args: '["--toolkit"]' - name: CUDA info run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fac171..2edf592 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,10 +8,11 @@ project(cupdlpx LANGUAGES C CXX CUDA) set(CUPDLPX_VERSION_MAJOR 0) set(CUPDLPX_VERSION_MINOR 2) -set(CUPDLPX_VERSION_PATCH 7) +set(CUPDLPX_VERSION_PATCH 8) set(CUPDLPX_VERSION "${CUPDLPX_VERSION_MAJOR}.${CUPDLPX_VERSION_MINOR}.${CUPDLPX_VERSION_PATCH}") add_compile_definitions(CUPDLPX_VERSION="${CUPDLPX_VERSION}") +add_compile_definitions(CUSPARSE_ENABLE_EXPERIMENTAL_API) if (WIN32) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) diff --git a/README.md b/README.md index ff5ba0e..919e8e0 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,9 @@ Our work is presented in two papers: * **GPU:** NVIDIA GPU with CUDA 12.4+. * **Build Tools:** CMake (≥ 3.20), GCC, NVCC. +> **SpMV backend** is selected automatically at compile time based on cuSPARSE version: +> - `cusparseSpMV` — CUDA 12.4 – 13.1 (cuSPARSE < 12.7.3) +> - `cusparseSpMVOp` — CUDA 13.1 Update 1+ (cuSPARSE ≥ 12.7.3) ### Build from Source Clone the repository and compile the project using CMake. diff --git a/internal/cusparse_compat.h b/internal/cusparse_compat.h new file mode 100644 index 0000000..9eaecb2 --- /dev/null +++ b/internal/cusparse_compat.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +// cusparseSpMVOp_bufferSize was introduced in cuSPARSE 12.7.3 (CUDA 13.1 Update 1). +// CUSPARSE_VERSION encoding: major*1000 + minor*100 + patch. +#if defined(CUSPARSE_VERSION) && CUSPARSE_VERSION >= 12703 +#define CUPDLPX_HAS_SPMVOP 1 +#else +#define CUPDLPX_HAS_SPMVOP 0 +#endif + +#if !CUPDLPX_HAS_SPMVOP +// The SpMVOp types were added to cusparse.h before the functions +// (e.g. CUDA 13.1 base has the types but not the functions). +// Only provide fallback typedefs for cuSPARSE versions that lack them entirely. +#if !defined(CUSPARSE_VERSION) || CUSPARSE_VERSION < 12700 +typedef void *cusparseSpMVOpDescr_t; +typedef void *cusparseSpMVOpPlan_t; +#endif +#endif diff --git a/internal/internal_types.h b/internal/internal_types.h index 0934222..1302430 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -17,6 +17,7 @@ limitations under the License. #pragma once #include "cupdlpx_types.h" +#include "cusparse_compat.h" #include #include #include @@ -114,19 +115,7 @@ typedef struct cusparseHandle_t sparse_handle; cublasHandle_t blas_handle; - size_t spmv_buffer_size; - size_t primal_spmv_buffer_size; - size_t dual_spmv_buffer_size; - void *primal_spmv_buffer; - void *dual_spmv_buffer; - void *spmv_buffer; - - cusparseSpMatDescr_t matA; - cusparseSpMatDescr_t matAt; - cusparseDnVecDescr_t vec_primal_sol; - cusparseDnVecDescr_t vec_dual_sol; - cusparseDnVecDescr_t vec_primal_prod; - cusparseDnVecDescr_t vec_dual_prod; + void *spmv_ctx; double *ones_primal_d; double *ones_dual_d; diff --git a/internal/utils.h b/internal/utils.h index ce8d674..c31fd62 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -16,6 +16,7 @@ limitations under the License. #pragma once +#include "cusparse_compat.h" #include "internal_types.h" #include #include @@ -63,9 +64,6 @@ extern "C" #define THREADS_PER_BLOCK 256 - extern const double HOST_ONE; - extern const double HOST_ZERO; - void *safe_malloc(size_t size); void *safe_calloc(size_t num, size_t size); @@ -79,6 +77,45 @@ extern "C" int max_iterations, double tolerance); + bool cupdlpx_use_spmvop_by_default(void); + + void cupdlpx_spmv_buffer_size(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + size_t *buffer_size); + + void cupdlpx_spmv_prepare(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + void **descr, + void **plan); + + void cupdlpx_spmv_release(void *descr, void *plan); + + void cupdlpx_spmv_execute(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + void *plan); + + void *cupdlpx_spmv_ctx_create(cusparseHandle_t sparse_handle, + const cu_sparse_matrix_csr_t *A, + const cu_sparse_matrix_csr_t *AT, + const double *ax_x_init, + double *ax_y_init, + const double *atx_x_init, + double *atx_y_init); + + void cupdlpx_spmv_ctx_destroy(void *ctx); + + void cupdlpx_spmv_Ax(cusparseHandle_t sparse_handle, void *ctx, const double *x, double *y); + + void cupdlpx_spmv_ATx(cusparseHandle_t sparse_handle, void *ctx, const double *x, double *y); + void compute_interaction_and_movement(pdhg_solver_state_t *solver_state, double *interaction, double *movement); bool should_do_adaptive_restart(pdhg_solver_state_t *solver_state, diff --git a/pyproject.toml b/pyproject.toml index 1d3ada0..35f3ebf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "cupdlpx" -version = "0.2.7" +version = "0.2.8" description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)" readme = "README.md" license = { text = "Apache-2.0" } diff --git a/python/README.md b/python/README.md index af8a8de..47d0a66 100644 --- a/python/README.md +++ b/python/README.md @@ -18,6 +18,11 @@ It provides a high-level, Pythonic API for constructing, modifying, and solving - An NVIDIA GPU with CUDA support (≥12.4 required) - A C/C++ toolchain with GCC and NVCC +> **SpMV backend** is selected automatically at compile time based on cuSPARSE version: +> - `cusparseSpMV` — CUDA 12.4 – 13.1 (cuSPARSE < 12.7.3) +> - `cusparseSpMVOp` — CUDA 13.1 Update 1+ (cuSPARSE ≥ 12.7.3) + + ### Install Install from PyPI: @@ -262,4 +267,4 @@ or ```python m.setWarmStart(primal=None, dual=None) -``` \ No newline at end of file +``` diff --git a/python/cupdlpx/PDLP.py b/python/cupdlpx/PDLP.py index 4e21730..0dfa1f0 100644 --- a/python/cupdlpx/PDLP.py +++ b/python/cupdlpx/PDLP.py @@ -60,4 +60,4 @@ # presolve "Presolve": "presolve", "MatrixZeroTol": "matrix_zero_tol", -} \ No newline at end of file +} diff --git a/src/feasibility_polish.cu b/src/feasibility_polish.cu index 5f07e54..879d241 100644 --- a/src/feasibility_polish.cu +++ b/src/feasibility_polish.cu @@ -341,6 +341,14 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(const pdhg_solve primal_state->relative_objective_gap = 0.0; primal_state->objective_gap = 0.0; + primal_state->spmv_ctx = cupdlpx_spmv_ctx_create(primal_state->sparse_handle, + primal_state->constraint_matrix, + primal_state->constraint_matrix_t, + primal_state->pdhg_primal_solution, + primal_state->primal_product, + primal_state->pdhg_dual_solution, + primal_state->dual_product); + return primal_state; } @@ -372,6 +380,7 @@ void primal_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->dual_residual); SAFE_CUDA_FREE(state->delta_primal_solution); SAFE_CUDA_FREE(state->delta_dual_solution); + cupdlpx_spmv_ctx_destroy(state->spmv_ctx); free(state); } @@ -473,6 +482,15 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(const pdhg_solver_ dual_state->absolute_primal_residual = 0.0; dual_state->relative_objective_gap = 0.0; dual_state->objective_gap = 0.0; + + dual_state->spmv_ctx = cupdlpx_spmv_ctx_create(dual_state->sparse_handle, + dual_state->constraint_matrix, + dual_state->constraint_matrix_t, + dual_state->pdhg_primal_solution, + dual_state->primal_product, + dual_state->pdhg_dual_solution, + dual_state->dual_product); + return dual_state; } @@ -514,6 +532,7 @@ void dual_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->dual_residual); SAFE_CUDA_FREE(state->delta_primal_solution); SAFE_CUDA_FREE(state->delta_dual_solution); + cupdlpx_spmv_ctx_destroy(state->spmv_ctx); free(state); } diff --git a/src/solver.cu b/src/solver.cu index 7135d25..7ad9301 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -586,87 +586,6 @@ static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_ state->step_size = 0.0; state->is_this_major_iteration = false; - size_t primal_spmv_buffer_size; - size_t dual_spmv_buffer_size; - - CUSPARSE_CHECK(cusparseCreateCsr(&state->matA, - state->num_constraints, - state->num_variables, - state->constraint_matrix->num_nonzeros, - state->constraint_matrix->row_ptr, - state->constraint_matrix->col_ind, - state->constraint_matrix->val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F)); - CUDA_CHECK(cudaGetLastError()); - - CUSPARSE_CHECK(cusparseCreateCsr(&state->matAt, - state->num_variables, - state->num_constraints, - state->constraint_matrix_t->num_nonzeros, - state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, - state->constraint_matrix_t->val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F)); - CUDA_CHECK(cudaGetLastError()); - - CUSPARSE_CHECK( - cusparseCreateDnVec(&state->vec_primal_sol, state->num_variables, state->pdhg_primal_solution, CUDA_R_64F)); - CUSPARSE_CHECK( - cusparseCreateDnVec(&state->vec_dual_sol, state->num_constraints, state->pdhg_dual_solution, CUDA_R_64F)); - CUSPARSE_CHECK( - cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &primal_spmv_buffer_size)); - - CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &dual_spmv_buffer_size)); - CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); - - CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, dual_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); - CUDA_CHECK(cudaMalloc(&state->d_primal_step_size, sizeof(double))); CUDA_CHECK(cudaMalloc(&state->d_dual_step_size, sizeof(double))); CUDA_CHECK(cudaMalloc(&state->d_inner_count, sizeof(int))); @@ -696,6 +615,14 @@ static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_ CUSPARSE_CHECK(cusparseSetStream(state->sparse_handle, state->stream)); CUBLAS_CHECK(cublasSetStream(state->blas_handle, state->stream)); + state->spmv_ctx = cupdlpx_spmv_ctx_create(state->sparse_handle, + state->constraint_matrix, + state->constraint_matrix_t, + state->pdhg_primal_solution, + state->primal_product, + state->pdhg_dual_solution, + state->dual_product); + free(ones_dual_h); if (params->verbose) { @@ -946,19 +873,7 @@ void compute_next_primal_solution(pdhg_solver_state_t *state, const double reflection_coefficient, bool is_major) { - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->current_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->current_dual_solution, state->dual_product); double step = state->step_size / state->primal_weight; @@ -1003,19 +918,7 @@ void compute_next_dual_solution(pdhg_solver_state_t *state, const double reflection_coefficient, bool is_major) { - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->reflected_primal_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); + cupdlpx_spmv_Ax(state->sparse_handle, state->spmv_ctx, state->reflected_primal_solution, state->primal_product); double step = state->step_size * state->primal_weight; @@ -1158,19 +1061,7 @@ static void compute_fixed_point_error(pdhg_solver_state_t *state) state->num_variables, state->num_constraints); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->delta_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->delta_dual_solution, state->dual_product); double interaction, movement; @@ -1203,26 +1094,14 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) return; } - if (state->matA) - CUSPARSE_CHECK(cusparseDestroySpMat(state->matA)); - if (state->matAt) - CUSPARSE_CHECK(cusparseDestroySpMat(state->matAt)); - if (state->vec_primal_sol) - CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_sol)); - if (state->vec_dual_sol) - CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_sol)); - if (state->vec_primal_prod) - CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_prod)); - if (state->vec_dual_prod) - CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); - if (state->primal_spmv_buffer) - CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); - if (state->dual_spmv_buffer) - CUDA_CHECK(cudaFree(state->dual_spmv_buffer)); + if (state->spmv_ctx) + cupdlpx_spmv_ctx_destroy(state->spmv_ctx); if (state->sparse_handle) CUSPARSE_CHECK(cusparseDestroy(state->sparse_handle)); if (state->blas_handle) CUBLAS_CHECK(cublasDestroy(state->blas_handle)); + if (state->stream) + CUDA_CHECK(cudaStreamDestroy(state->stream)); if (state->variable_lower_bound) CUDA_CHECK(cudaFree(state->variable_lower_bound)); @@ -1335,19 +1214,7 @@ static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state, co cupdlpx_result_t *results = (cupdlpx_result_t *)safe_calloc(1, sizeof(cupdlpx_result_t)); // Compute reduced cost - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->pdhg_dual_solution, state->dual_product); compute_and_rescale_reduced_cost_kernel<<num_blocks_primal, THREADS_PER_BLOCK, 0, state->stream>>>( state->dual_slack, diff --git a/src/spmv_backend.cu b/src/spmv_backend.cu new file mode 100644 index 0000000..4b7618a --- /dev/null +++ b/src/spmv_backend.cu @@ -0,0 +1,258 @@ +/* +Copyright 2025 Haihao Lu + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "utils.h" + +static const double HOST_ONE = 1.0; +static const double HOST_ZERO = 0.0; + +typedef struct +{ + cusparseSpMatDescr_t matA; + cusparseSpMatDescr_t matAT; + cusparseDnVecDescr_t vec_ax_x; + cusparseDnVecDescr_t vec_ax_y; + cusparseDnVecDescr_t vec_atx_x; + cusparseDnVecDescr_t vec_atx_y; + void *ax_buffer; + void *atx_buffer; + void *ax_descr; + void *ax_plan; + void *atx_descr; + void *atx_plan; +} cupdlpx_spmv_ctx_t; + +bool cupdlpx_use_spmvop_by_default(void) +{ + return CUPDLPX_HAS_SPMVOP; +} + +void cupdlpx_spmv_buffer_size(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + size_t *buffer_size) +{ +#if CUPDLPX_HAS_SPMVOP + CUSPARSE_CHECK(cusparseSpMVOp_bufferSize( + sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mat, vec_x, vec_y, vec_y, CUDA_R_64F, buffer_size)); +#else + CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + mat, + vec_x, + &HOST_ZERO, + vec_y, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + buffer_size)); +#endif +} + +void cupdlpx_spmv_prepare(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + void **descr, + void **plan) +{ +#if CUPDLPX_HAS_SPMVOP + cusparseSpMVOpDescr_t local_descr = NULL; + cusparseSpMVOpPlan_t local_plan = NULL; + CUSPARSE_CHECK(cusparseSpMVOp_createDescr( + sparse_handle, &local_descr, CUSPARSE_OPERATION_NON_TRANSPOSE, mat, vec_x, vec_y, vec_y, CUDA_R_64F, buffer)); + CUSPARSE_CHECK(cusparseSpMVOp_createPlan(sparse_handle, local_descr, &local_plan, NULL, 0)); + *descr = (void *)local_descr; + *plan = (void *)local_plan; +#else + (void)descr; + (void)plan; + CUSPARSE_CHECK(cusparseSpMV_preprocess(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + mat, + vec_x, + &HOST_ZERO, + vec_y, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + buffer)); +#endif +} + +void cupdlpx_spmv_release(void *descr, void *plan) +{ +#if CUPDLPX_HAS_SPMVOP + if (descr) + { + CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr((cusparseSpMVOpDescr_t)descr)); + } + if (plan) + { + CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan((cusparseSpMVOpPlan_t)plan)); + } +#else + (void)descr; + (void)plan; +#endif +} + +void cupdlpx_spmv_execute(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + void *plan) +{ +#if CUPDLPX_HAS_SPMVOP + (void)mat; + (void)buffer; + CUSPARSE_CHECK( + cusparseSpMVOp(sparse_handle, (cusparseSpMVOpPlan_t)plan, &HOST_ONE, &HOST_ZERO, vec_x, vec_y, vec_y)); +#else + (void)plan; + CUSPARSE_CHECK(cusparseSpMV(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + mat, + vec_x, + &HOST_ZERO, + vec_y, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + buffer)); +#endif +} + +void *cupdlpx_spmv_ctx_create(cusparseHandle_t sparse_handle, + const cu_sparse_matrix_csr_t *A, + const cu_sparse_matrix_csr_t *AT, + const double *ax_x_init, + double *ax_y_init, + const double *atx_x_init, + double *atx_y_init) +{ + cupdlpx_spmv_ctx_t *ctx = (cupdlpx_spmv_ctx_t *)safe_calloc(1, sizeof(cupdlpx_spmv_ctx_t)); + size_t ax_buffer_size = 0; + size_t atx_buffer_size = 0; + + CUSPARSE_CHECK(cusparseCreateCsr(&ctx->matA, + A->num_rows, + A->num_cols, + A->num_nonzeros, + A->row_ptr, + A->col_ind, + A->val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + + CUSPARSE_CHECK(cusparseCreateCsr(&ctx->matAT, + AT->num_rows, + AT->num_cols, + AT->num_nonzeros, + AT->row_ptr, + AT->col_ind, + AT->val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + + CUSPARSE_CHECK(cusparseCreateDnVec(&ctx->vec_ax_x, A->num_cols, (void *)ax_x_init, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&ctx->vec_ax_y, A->num_rows, ax_y_init, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&ctx->vec_atx_x, AT->num_cols, (void *)atx_x_init, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&ctx->vec_atx_y, AT->num_rows, atx_y_init, CUDA_R_64F)); + + cupdlpx_spmv_buffer_size(sparse_handle, ctx->matA, ctx->vec_ax_x, ctx->vec_ax_y, &ax_buffer_size); + cupdlpx_spmv_buffer_size(sparse_handle, ctx->matAT, ctx->vec_atx_x, ctx->vec_atx_y, &atx_buffer_size); + CUDA_CHECK(cudaMalloc(&ctx->ax_buffer, ax_buffer_size)); + CUDA_CHECK(cudaMalloc(&ctx->atx_buffer, atx_buffer_size)); + + cupdlpx_spmv_prepare( + sparse_handle, ctx->matA, ctx->vec_ax_x, ctx->vec_ax_y, ctx->ax_buffer, &ctx->ax_descr, &ctx->ax_plan); + cupdlpx_spmv_prepare( + sparse_handle, ctx->matAT, ctx->vec_atx_x, ctx->vec_atx_y, ctx->atx_buffer, &ctx->atx_descr, &ctx->atx_plan); + + return (void *)ctx; +} + +void cupdlpx_spmv_ctx_destroy(void *ctx_void) +{ + cupdlpx_spmv_ctx_t *ctx = (cupdlpx_spmv_ctx_t *)ctx_void; + if (ctx == NULL) + { + return; + } + + cupdlpx_spmv_release(ctx->ax_descr, ctx->ax_plan); + cupdlpx_spmv_release(ctx->atx_descr, ctx->atx_plan); + + if (ctx->ax_buffer) + { + CUDA_CHECK(cudaFree(ctx->ax_buffer)); + } + if (ctx->atx_buffer) + { + CUDA_CHECK(cudaFree(ctx->atx_buffer)); + } + + if (ctx->vec_ax_x) + { + CUSPARSE_CHECK(cusparseDestroyDnVec(ctx->vec_ax_x)); + } + if (ctx->vec_ax_y) + { + CUSPARSE_CHECK(cusparseDestroyDnVec(ctx->vec_ax_y)); + } + if (ctx->vec_atx_x) + { + CUSPARSE_CHECK(cusparseDestroyDnVec(ctx->vec_atx_x)); + } + if (ctx->vec_atx_y) + { + CUSPARSE_CHECK(cusparseDestroyDnVec(ctx->vec_atx_y)); + } + if (ctx->matA) + { + CUSPARSE_CHECK(cusparseDestroySpMat(ctx->matA)); + } + if (ctx->matAT) + { + CUSPARSE_CHECK(cusparseDestroySpMat(ctx->matAT)); + } + + free(ctx); +} + +void cupdlpx_spmv_Ax(cusparseHandle_t sparse_handle, void *ctx_void, const double *x, double *y) +{ + cupdlpx_spmv_ctx_t *ctx = (cupdlpx_spmv_ctx_t *)ctx_void; + CUSPARSE_CHECK(cusparseDnVecSetValues(ctx->vec_ax_x, (void *)x)); + CUSPARSE_CHECK(cusparseDnVecSetValues(ctx->vec_ax_y, y)); + cupdlpx_spmv_execute(sparse_handle, ctx->matA, ctx->vec_ax_x, ctx->vec_ax_y, ctx->ax_buffer, ctx->ax_plan); +} + +void cupdlpx_spmv_ATx(cusparseHandle_t sparse_handle, void *ctx_void, const double *x, double *y) +{ + cupdlpx_spmv_ctx_t *ctx = (cupdlpx_spmv_ctx_t *)ctx_void; + CUSPARSE_CHECK(cusparseDnVecSetValues(ctx->vec_atx_x, (void *)x)); + CUSPARSE_CHECK(cusparseDnVecSetValues(ctx->vec_atx_y, y)); + cupdlpx_spmv_execute(sparse_handle, ctx->matAT, ctx->vec_atx_x, ctx->vec_atx_y, ctx->atx_buffer, ctx->atx_plan); +} diff --git a/src/utils.cu b/src/utils.cu index 3b1d9b2..5f9f303 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -25,9 +25,6 @@ limitations under the License. std::mt19937 gen(1); std::normal_distribution dist(0.0, 1.0); -const double HOST_ONE = 1.0; -const double HOST_ZERO = 0.0; - void *safe_malloc(size_t size) { void *ptr = malloc(size); @@ -73,8 +70,8 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, int max_iterations, double tolerance) { - int m = A->num_rows; - int n = A->num_cols; + const int m = A->num_rows; + const int n = A->num_cols; double *eigenvector_d, *next_eigenvector_d, *dual_product_d; CUDA_CHECK(cudaMalloc(&eigenvector_d, m * sizeof(double))); @@ -92,7 +89,6 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, double sigma_max_sq = 1.0; const double one = 1.0; - const double zero = 0.0; cusparseSpMatDescr_t matA, matAT; CUSPARSE_CHECK(cusparseCreateCsr(&matA, @@ -123,36 +119,25 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, CUSPARSE_CHECK(cusparseCreateDnVec(&vecNextEigen, m, next_eigenvector_d, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&vecDual, n, dual_product_d, CUDA_R_64F)); + void *descrAT = NULL; + void *descrA = NULL; + void *planAT = NULL; + void *planA = NULL; + void *dBufferAT = NULL; void *dBufferA = NULL; size_t bufferSizeAT = 0, bufferSizeA = 0; - CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matAT, - vecNextEigen, - &zero, - vecDual, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &bufferSizeAT)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matA, - vecDual, - &zero, - vecEigen, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &bufferSizeA)); + cupdlpx_spmv_buffer_size(sparse_handle, matAT, vecNextEigen, vecDual, &bufferSizeAT); + cupdlpx_spmv_buffer_size(sparse_handle, matA, vecDual, vecEigen, &bufferSizeA); CUDA_CHECK(cudaMalloc(&dBufferAT, bufferSizeAT)); CUDA_CHECK(cudaMalloc(&dBufferA, bufferSizeA)); + cupdlpx_spmv_prepare(sparse_handle, matAT, vecNextEigen, vecDual, dBufferAT, &descrAT, &planAT); + cupdlpx_spmv_prepare(sparse_handle, matA, vecDual, vecEigen, dBufferA, &descrA, &planA); + for (int i = 0; i < max_iterations; ++i) { - CUDA_CHECK(cudaMemcpy(next_eigenvector_d, eigenvector_d, m * sizeof(double), cudaMemcpyDeviceToDevice)); double eigenvector_norm; CUBLAS_CHECK(cublasDnrm2_v2_64(blas_handle, m, next_eigenvector_d, 1, &eigenvector_norm)); @@ -160,27 +145,8 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, double inv_eigenvector_norm = 1.0 / eigenvector_norm; CUBLAS_CHECK(cublasDscal(blas_handle, m, &inv_eigenvector_norm, next_eigenvector_d, 1)); - CUSPARSE_CHECK(cusparseSpMV(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matAT, - vecNextEigen, - &zero, - vecDual, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - dBufferAT)); - - CUSPARSE_CHECK(cusparseSpMV(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matA, - vecDual, - &zero, - vecEigen, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - dBufferA)); + cupdlpx_spmv_execute(sparse_handle, matAT, vecNextEigen, vecDual, dBufferAT, planAT); + cupdlpx_spmv_execute(sparse_handle, matA, vecDual, vecEigen, dBufferA, planA); CUBLAS_CHECK(cublasDdot(blas_handle, m, next_eigenvector_d, 1, eigenvector_d, 1, &sigma_max_sq)); @@ -197,6 +163,8 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, CUDA_CHECK(cudaFree(dBufferAT)); CUDA_CHECK(cudaFree(dBufferA)); + cupdlpx_spmv_release(descrAT, planAT); + cupdlpx_spmv_release(descrA, planA); CUSPARSE_CHECK(cusparseDestroySpMat(matA)); CUSPARSE_CHECK(cusparseDestroySpMat(matAT)); CUSPARSE_CHECK(cusparseDestroyDnVec(vecEigen)); @@ -341,7 +309,6 @@ void set_default_parameters(pdhg_parameters_t *params) params->termination_evaluation_frequency = 200; params->feasibility_polishing = false; params->reflection_coefficient = 1.0; - params->sv_max_iter = 5000; params->sv_tol = 1e-4; @@ -506,6 +473,7 @@ void print_initial_info(const pdhg_parameters_t *params, lp_problem_t *problem) printf(" time_limit : %.2f sec\n", params->termination_criteria.time_sec_limit); printf(" eps_opt : %.1e\n", params->termination_criteria.eps_optimal_relative); printf(" eps_feas : %.1e\n", params->termination_criteria.eps_feasible_relative); + printf(" spmv_backend : %s (auto)\n", cupdlpx_use_spmvop_by_default() ? "cusparseSpMVOp" : "cusparseSpMV"); if (params->optimality_norm != default_params.optimality_norm) { printf(" optimality_norm : %s\n", params->optimality_norm == NORM_TYPE_L_INF ? "L_inf" : "L2"); @@ -781,32 +749,8 @@ static double get_vector_sum(cublasHandle_t handle, int n, double *ones_d, const void compute_residual(pdhg_solver_state_t *state, norm_type_t optimality_norm) { - cusparseDnVecSetValues(state->vec_primal_sol, state->pdhg_primal_solution); - cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution); - cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product); - cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv_Ax(state->sparse_handle, state->spmv_ctx, state->pdhg_primal_solution, state->primal_product); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->pdhg_dual_solution, state->dual_product); compute_residual_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK, 0, state->stream>>>( state->primal_residual, @@ -906,32 +850,8 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) double dual_ray_inf_norm = get_vector_inf_norm(state->blas_handle, state->num_constraints, state->delta_dual_solution); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->delta_primal_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->delta_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv_Ax(state->sparse_handle, state->spmv_ctx, state->delta_primal_solution, state->primal_product); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->delta_dual_solution, state->dual_product); CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, @@ -1312,19 +1232,7 @@ void compute_primal_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm) { - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->pdhg_primal_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); + cupdlpx_spmv_Ax(state->sparse_handle, state->spmv_ctx, state->pdhg_primal_solution, state->primal_product); compute_primal_feas_polish_residual_kernel<<num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>( state->primal_residual, @@ -1365,19 +1273,7 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm) { - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->pdhg_dual_solution, state->dual_product); compute_dual_feas_polish_residual_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( state->dual_residual,