From b10a9c34282adb2dea8a354c961da1363d662d22 Mon Sep 17 00:00:00 2001 From: Zedong Peng Date: Tue, 24 Feb 2026 23:03:43 -0500 Subject: [PATCH 01/12] working SpMVop --- CMakeLists.txt | 1 + README.md | 2 + include/cupdlpx_types.h | 1 + internal/internal_types.h | 6 + internal/utils.h | 8 + python/README.md | 3 +- python/cupdlpx/PDLP.py | 3 +- python_bindings/_core_bindings.cpp | 2 + src/cli.c | 7 + src/solver.cu | 201 ++++++++++++-------- src/utils.cu | 287 ++++++++++++++++++----------- 11 files changed, 335 insertions(+), 186 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e35150..8f8afcc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,7 @@ set(CUPDLPX_VERSION_PATCH 5) set(CUPDLPX_VERSION "${CUPDLPX_VERSION_MAJOR}.${CUPDLPX_VERSION_MINOR}.${CUPDLPX_VERSION_PATCH}") add_compile_definitions(CUPDLPX_VERSION="${CUPDLPX_VERSION}") +add_compile_definitions(CUSPARSE_ENABLE_EXPERIMENTAL_API) if (WIN32) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) diff --git a/README.md b/README.md index 4118fb3..d6d4f60 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,7 @@ After building the project, the `./build/cupdlpx` binary can be invoked from the | `--eval_freq` | `int` | Termination evaluation frequency | `200` | | `--sv_max_iter` | `int` | Max iterations for singular value estimation | `5000` | | `--sv_tol` | `float` | Tolerance for singular value estimation | `1e-4` | +| `--no_spmvop` | `flag` | Disable `cusparseSpMVOp` and use classic `cusparseSpMV` path | `SpMVOp enabled` | | `--no_presolve` | `flag` | Disable presolve | `enabled` | | `-f`,`--feasibility_polishing` |`flag` | Run the polishing loop | `false` | | `--eps_feas_polish` | `double` | Relative tolerance for polishing | `1e-6` | @@ -109,6 +110,7 @@ The solver generates three text files in the specified . The f ### Python Interface The `cupdlpx` Python package supports building and solving LPs directly with `NumPy` and `SciPy`. +To control the SpMV backend in Python, set `UseSpMVOp` (alias of `use_spmvop`). Documentation and examples are available in the [Python API Guide](python/README.md). ### Julia Interface diff --git a/include/cupdlpx_types.h b/include/cupdlpx_types.h index d874206..8befb0a 100644 --- a/include/cupdlpx_types.h +++ b/include/cupdlpx_types.h @@ -99,6 +99,7 @@ extern "C" double reflection_coefficient; bool feasibility_polishing; norm_type_t optimality_norm; + bool use_spmvop; bool presolve; double matrix_zero_tol; } pdhg_parameters_t; diff --git a/internal/internal_types.h b/internal/internal_types.h index 9894c93..12bc06a 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -110,6 +110,7 @@ typedef struct double last_trial_fixed_point_error; int inner_count; int *d_inner_count; + bool use_spmvop; cusparseHandle_t sparse_handle; cublasHandle_t blas_handle; @@ -127,6 +128,11 @@ typedef struct cusparseDnVecDescr_t vec_primal_prod; cusparseDnVecDescr_t vec_dual_prod; + cusparseSpMVOpDescr_t primal_spmv_descr; + cusparseSpMVOpPlan_t primal_spmv_plan; + cusparseSpMVOpDescr_t dual_spmv_descr; + cusparseSpMVOpPlan_t dual_spmv_plan; + double *ones_primal_d; double *ones_dual_d; diff --git a/internal/utils.h b/internal/utils.h index ce8d674..e5a9599 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -76,6 +76,7 @@ extern "C" cublasHandle_t blas_handle, const cu_sparse_matrix_csr_t *A, const cu_sparse_matrix_csr_t *AT, + bool use_spmvop, int max_iterations, double tolerance); @@ -132,6 +133,13 @@ extern "C" void set_default_parameters(pdhg_parameters_t *params); + void cupdlpx_spmv(pdhg_solver_state_t *state, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + cusparseSpMVOpPlan_t plan); + #ifdef __cplusplus } diff --git a/python/README.md b/python/README.md index af8a8de..8096589 100644 --- a/python/README.md +++ b/python/README.md @@ -143,6 +143,7 @@ Below is a list of commonly used parameters, their internal keys, and descriptio | `OutputFlag`, `LogToConsole` | `verbose` | bool | `False` | Enable (`True`) or disable (`False`) console logging output. | | `TermCheckFreq` | `termination_evaluation_frequency` | int | `200` | Frequency (in iterations) at which termination conditions are evaluated. | | `OptimalityNorm` | `optimality_norm` | string | `"l2"` | Norm for optimality criteria. Use `"l2"` for L2 norm or `"linf"` for infinity norm. | +| `UseSpMVOp` | `use_spmvop` | bool | `True` | Use `cusparseSpMVOp` path (`True`) or fallback to classic `cusparseSpMV` (`False`). | | `OptimalityTol` | `eps_optimal_relative` | float | `1e-4` | Relative tolerance for optimality gap. Solver stops if the relative primal-dual gap ≤ this value. | | `FeasibilityTol` | `eps_feasible_relative` | float | `1e-4` | Relative feasibility tolerance for primal/dual residuals. | | `RuizIters` | `l_inf_ruiz_iterations` | int | `10` | Number of iterations for L∞ Ruiz scaling. Improves numerical conditioning. | @@ -262,4 +263,4 @@ or ```python m.setWarmStart(primal=None, dual=None) -``` \ No newline at end of file +``` diff --git a/python/cupdlpx/PDLP.py b/python/cupdlpx/PDLP.py index 4e21730..6533f85 100644 --- a/python/cupdlpx/PDLP.py +++ b/python/cupdlpx/PDLP.py @@ -54,10 +54,11 @@ "FeasibilityPolishingTol": "eps_feas_polish_relative", # termination criteria "OptimalityNorm": "optimality_norm", + "UseSpMVOp": "use_spmvop", # singular value estimation (power method) "SVMaxIter": "sv_max_iter", "SVTol": "sv_tol", # presolve "Presolve": "presolve", "MatrixZeroTol": "matrix_zero_tol", -} \ No newline at end of file +} diff --git a/python_bindings/_core_bindings.cpp b/python_bindings/_core_bindings.cpp index 0329d96..be9e54a 100644 --- a/python_bindings/_core_bindings.cpp +++ b/python_bindings/_core_bindings.cpp @@ -270,6 +270,7 @@ static py::dict get_default_params_py() // Termination criteria norm d["optimality_norm"] = (p.optimality_norm == NORM_TYPE_L_INF) ? "linf" : "l2"; + d["use_spmvop"] = p.use_spmvop; // power method for singular value estimation d["sv_max_iter"] = p.sv_max_iter; d["sv_tol"] = p.sv_tol; @@ -354,6 +355,7 @@ static void parse_params_from_python(py::object params_obj, pdhg_parameters_t *p // Termination criteria norm get_norm("optimality_norm", p->optimality_norm); + getb("use_spmvop", p->use_spmvop); // power method for singular value estimation geti("sv_max_iter", p->sv_max_iter); getf("sv_tol", p->sv_tol); diff --git a/src/cli.c b/src/cli.c index 5bfa4a6..82fd875 100644 --- a/src/cli.c +++ b/src/cli.c @@ -209,6 +209,9 @@ void print_usage(const char *prog_name) fprintf(stderr, " --matrix_zero_tol . " "Zero tolerance in constraint matrix.\n"); + fprintf(stderr, + " --no_spmvop " + "Use classic cusparseSpMV instead of cusparseSpMVOp (default: SpMVOp).\n"); } int main(int argc, char *argv[]) @@ -234,6 +237,7 @@ int main(int argc, char *argv[]) {"opt_norm", required_argument, 0, 1014}, {"no_presolve", no_argument, 0, 1015}, {"matrix_zero_tol", required_argument, 0, 1016}, + {"no_spmvop", no_argument, 0, 1017}, {0, 0, 0, 0}}; int opt; @@ -310,6 +314,9 @@ int main(int argc, char *argv[]) case 1016: // --matrix_zero_tol params.matrix_zero_tol = atof(optarg); break; + case 1017: // --no_spmvop + params.use_spmvop = false; + break; case '?': // Unknown option return 1; } diff --git a/src/solver.cu b/src/solver.cu index 896f19d..dff997f 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -599,6 +599,7 @@ static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_ state->last_trial_fixed_point_error = INFINITY; state->step_size = 0.0; state->is_this_major_iteration = false; + state->use_spmvop = params->use_spmvop; size_t primal_spmv_buffer_size; size_t dual_spmv_buffer_size; @@ -636,50 +637,107 @@ static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_ CUSPARSE_CHECK( cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &primal_spmv_buffer_size)); - - CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &dual_spmv_buffer_size)); + if (state->use_spmvop) + { + CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(state->sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + state->matA, + state->vec_primal_sol, + state->vec_primal_prod, + state->vec_primal_prod, + CUDA_R_64F, + &primal_spmv_buffer_size)); + + CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(state->sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + state->matAt, + state->vec_dual_sol, + state->vec_dual_prod, + state->vec_dual_prod, + CUDA_R_64F, + &dual_spmv_buffer_size)); + } + else + { + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + state->matA, + state->vec_primal_sol, + &HOST_ZERO, + state->vec_primal_prod, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + &primal_spmv_buffer_size)); + + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + state->matAt, + state->vec_dual_sol, + &HOST_ZERO, + state->vec_dual_prod, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + &dual_spmv_buffer_size)); + } CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); + if (state->use_spmvop) + { + CUSPARSE_CHECK(cusparseSpMVOp_createDescr(state->sparse_handle, + &state->primal_spmv_descr, + CUSPARSE_OPERATION_NON_TRANSPOSE, + state->matA, + state->vec_primal_sol, + state->vec_primal_prod, + state->vec_primal_prod, + CUDA_R_64F, + state->primal_spmv_buffer)); + CUSPARSE_CHECK(cusparseSpMVOp_createPlan( + state->sparse_handle, state->primal_spmv_descr, &state->primal_spmv_plan, NULL, 0)); + } + else + { + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + state->matA, + state->vec_primal_sol, + &HOST_ZERO, + state->vec_primal_prod, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + state->primal_spmv_buffer)); + } CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, dual_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + if (state->use_spmvop) + { + CUSPARSE_CHECK(cusparseSpMVOp_createDescr(state->sparse_handle, + &state->dual_spmv_descr, + CUSPARSE_OPERATION_NON_TRANSPOSE, + state->matAt, + state->vec_dual_sol, + state->vec_dual_prod, + state->vec_dual_prod, + CUDA_R_64F, + state->dual_spmv_buffer)); + CUSPARSE_CHECK( + cusparseSpMVOp_createPlan(state->sparse_handle, state->dual_spmv_descr, &state->dual_spmv_plan, NULL, 0)); + } + else + { + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + state->matAt, + state->vec_dual_sol, + &HOST_ZERO, + state->vec_dual_prod, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + state->dual_spmv_buffer)); + } CUDA_CHECK(cudaMalloc(&state->d_primal_step_size, sizeof(double))); CUDA_CHECK(cudaMalloc(&state->d_dual_step_size, sizeof(double))); @@ -963,16 +1021,8 @@ static void compute_next_primal_solution(pdhg_solver_state_t *state, CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->current_dual_solution)); CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv( + state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); double step = state->step_size / state->primal_weight; @@ -1020,16 +1070,12 @@ static void compute_next_dual_solution(pdhg_solver_state_t *state, CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->reflected_primal_solution)); CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); + cupdlpx_spmv(state, + state->matA, + state->vec_primal_sol, + state->vec_primal_prod, + state->primal_spmv_buffer, + state->primal_spmv_plan); double step = state->step_size * state->primal_weight; @@ -1144,6 +1190,7 @@ static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, c state->blas_handle, state->constraint_matrix, state->constraint_matrix_t, + params->use_spmvop, params->sv_max_iter, params->sv_tol); state->step_size = 0.998 / max_sv; @@ -1175,16 +1222,8 @@ static void compute_fixed_point_error(pdhg_solver_state_t *state) CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->delta_dual_solution)); CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv( + state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); double interaction, movement; @@ -1229,6 +1268,14 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_prod)); if (state->vec_dual_prod) CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); + if (state->primal_spmv_descr) + CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(state->primal_spmv_descr)); + if (state->primal_spmv_plan) + CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(state->primal_spmv_plan)); + if (state->dual_spmv_descr) + CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(state->dual_spmv_descr)); + if (state->dual_spmv_plan) + CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(state->dual_spmv_plan)); if (state->primal_spmv_buffer) CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); if (state->dual_spmv_buffer) @@ -1352,16 +1399,8 @@ static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state, co CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution)); CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv( + state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); compute_and_rescale_reduced_cost_kernel<<num_blocks_primal, THREADS_PER_BLOCK, 0, state->stream>>>( state->dual_slack, diff --git a/src/utils.cu b/src/utils.cu index c5133d9..7eaa835 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -28,6 +28,32 @@ std::normal_distribution dist(0.0, 1.0); const double HOST_ONE = 1.0; const double HOST_ZERO = 0.0; +void cupdlpx_spmv(pdhg_solver_state_t *state, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + cusparseSpMVOpPlan_t plan) +{ + if (state->use_spmvop) + { + CUSPARSE_CHECK(cusparseSpMVOp(state->sparse_handle, plan, &HOST_ONE, &HOST_ZERO, vec_x, vec_y, vec_y)); + } + else + { + CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + mat, + vec_x, + &HOST_ZERO, + vec_y, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + buffer)); + } +} + void *safe_malloc(size_t size) { void *ptr = malloc(size); @@ -70,6 +96,7 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, cublasHandle_t blas_handle, const cu_sparse_matrix_csr_t *A, const cu_sparse_matrix_csr_t *AT, + bool use_spmvop, int max_iterations, double tolerance) { @@ -123,33 +150,106 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, CUSPARSE_CHECK(cusparseCreateDnVec(&vecNextEigen, m, next_eigenvector_d, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&vecDual, n, dual_product_d, CUDA_R_64F)); + cusparseSpMVOpDescr_t descrAT = NULL, descrA = NULL; + cusparseSpMVOpPlan_t planAT = NULL, planA = NULL; + void *dBufferAT = NULL; void *dBufferA = NULL; size_t bufferSizeAT = 0, bufferSizeA = 0; - CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matAT, - vecNextEigen, - &zero, - vecDual, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &bufferSizeAT)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matA, - vecDual, - &zero, - vecEigen, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &bufferSizeA)); + if (use_spmvop) + { + CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + matAT, + vecNextEigen, + vecDual, + vecDual, + CUDA_R_64F, + &bufferSizeAT)); + CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + matA, + vecDual, + vecEigen, + vecEigen, + CUDA_R_64F, + &bufferSizeA)); + } + else + { + CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + matAT, + vecNextEigen, + &zero, + vecDual, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + &bufferSizeAT)); + CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + matA, + vecDual, + &zero, + vecEigen, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + &bufferSizeA)); + } CUDA_CHECK(cudaMalloc(&dBufferAT, bufferSizeAT)); CUDA_CHECK(cudaMalloc(&dBufferA, bufferSizeA)); + if (use_spmvop) + { + CUSPARSE_CHECK(cusparseSpMVOp_createDescr(sparse_handle, + &descrAT, + CUSPARSE_OPERATION_NON_TRANSPOSE, + matAT, + vecNextEigen, + vecDual, + vecDual, + CUDA_R_64F, + dBufferAT)); + CUSPARSE_CHECK(cusparseSpMVOp_createPlan(sparse_handle, descrAT, &planAT, NULL, 0)); + + CUSPARSE_CHECK(cusparseSpMVOp_createDescr(sparse_handle, + &descrA, + CUSPARSE_OPERATION_NON_TRANSPOSE, + matA, + vecDual, + vecEigen, + vecEigen, + CUDA_R_64F, + dBufferA)); + CUSPARSE_CHECK(cusparseSpMVOp_createPlan(sparse_handle, descrA, &planA, NULL, 0)); + } + else + { + CUSPARSE_CHECK(cusparseSpMV_preprocess(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + matAT, + vecNextEigen, + &zero, + vecDual, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + dBufferAT)); + CUSPARSE_CHECK(cusparseSpMV_preprocess(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + matA, + vecDual, + &zero, + vecEigen, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + dBufferA)); + } + for (int i = 0; i < max_iterations; ++i) { @@ -160,27 +260,34 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, double inv_eigenvector_norm = 1.0 / eigenvector_norm; CUBLAS_CHECK(cublasDscal(blas_handle, m, &inv_eigenvector_norm, next_eigenvector_d, 1)); - CUSPARSE_CHECK(cusparseSpMV(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matAT, - vecNextEigen, - &zero, - vecDual, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - dBufferAT)); - - CUSPARSE_CHECK(cusparseSpMV(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matA, - vecDual, - &zero, - vecEigen, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - dBufferA)); + if (use_spmvop) + { + CUSPARSE_CHECK(cusparseSpMVOp(sparse_handle, planAT, &one, &zero, vecNextEigen, vecDual, vecDual)); + CUSPARSE_CHECK(cusparseSpMVOp(sparse_handle, planA, &one, &zero, vecDual, vecEigen, vecEigen)); + } + else + { + CUSPARSE_CHECK(cusparseSpMV(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + matAT, + vecNextEigen, + &zero, + vecDual, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + dBufferAT)); + CUSPARSE_CHECK(cusparseSpMV(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + matA, + vecDual, + &zero, + vecEigen, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + dBufferA)); + } CUBLAS_CHECK(cublasDdot(blas_handle, m, next_eigenvector_d, 1, eigenvector_d, 1, &sigma_max_sq)); @@ -197,6 +304,14 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, CUDA_CHECK(cudaFree(dBufferAT)); CUDA_CHECK(cudaFree(dBufferA)); + if (descrAT) + CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(descrAT)); + if (planAT) + CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(planAT)); + if (descrA) + CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(descrA)); + if (planA) + CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(planA)); CUSPARSE_CHECK(cusparseDestroySpMat(matA)); CUSPARSE_CHECK(cusparseDestroySpMat(matAT)); CUSPARSE_CHECK(cusparseDestroyDnVec(vecEigen)); @@ -341,6 +456,7 @@ void set_default_parameters(pdhg_parameters_t *params) params->termination_evaluation_frequency = 200; params->feasibility_polishing = false; params->reflection_coefficient = 1.0; + params->use_spmvop = true; params->sv_max_iter = 5000; params->sv_tol = 1e-4; @@ -521,6 +637,7 @@ void print_initial_info(const pdhg_parameters_t *params, lp_problem_t *problem) PRINT_DIFF_INT( "evaluation_freq", params->termination_evaluation_frequency, default_params.termination_evaluation_frequency); PRINT_DIFF_BOOL("feasibility_polishing", params->feasibility_polishing, default_params.feasibility_polishing); + PRINT_DIFF_BOOL("use_spmvop", params->use_spmvop, default_params.use_spmvop); PRINT_DIFF_DBL("eps_feas_polish_relative", params->termination_criteria.eps_feas_polish_relative, default_params.termination_criteria.eps_feas_polish_relative); @@ -786,27 +903,15 @@ void compute_residual(pdhg_solver_state_t *state, norm_type_t optimality_norm) cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product); cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product); - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv(state, + state->matA, + state->vec_primal_sol, + state->vec_primal_prod, + state->primal_spmv_buffer, + state->primal_spmv_plan); + + cupdlpx_spmv( + state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); compute_residual_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK, 0, state->stream>>>( state->primal_residual, @@ -911,27 +1016,15 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); - - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv(state, + state->matA, + state->vec_primal_sol, + state->vec_primal_prod, + state->primal_spmv_buffer, + state->primal_spmv_plan); + + cupdlpx_spmv( + state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, @@ -1315,16 +1408,12 @@ void compute_primal_feas_polish_residual(pdhg_solver_state_t *state, CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->pdhg_primal_solution)); CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); + cupdlpx_spmv(state, + state->matA, + state->vec_primal_sol, + state->vec_primal_prod, + state->primal_spmv_buffer, + state->primal_spmv_plan); compute_primal_feas_polish_residual_kernel<<num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>( state->primal_residual, @@ -1368,16 +1457,8 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution)); CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); + cupdlpx_spmv( + state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); compute_dual_feas_polish_residual_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( state->dual_residual, From dae4dd5e990ea8850436a1b7cf8e0189ac8736ee Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Mon, 9 Mar 2026 14:45:34 -0400 Subject: [PATCH 02/12] update --- README.md | 3 +- include/cupdlpx_types.h | 1 - internal/cusparse_compat.h | 22 ++++ internal/internal_types.h | 2 +- internal/utils.h | 30 ++++- python/README.md | 4 +- python/cupdlpx/PDLP.py | 1 - python_bindings/_core_bindings.cpp | 2 - src/cli.c | 7 -- src/solver.cu | 128 ++++---------------- src/spmv_backend.cu | 140 ++++++++++++++++++++++ src/utils.cu | 180 +++-------------------------- 12 files changed, 229 insertions(+), 291 deletions(-) create mode 100644 internal/cusparse_compat.h create mode 100644 src/spmv_backend.cu diff --git a/README.md b/README.md index d6d4f60..c1939d8 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Our work is presented in two papers: * **GPU:** NVIDIA GPU with CUDA 12.4+. * **Build Tools:** CMake (≥ 3.20), GCC, NVCC. +> **SpMV backend** is selected automatically based on cuSPARSE version (`cusparseSpMV` for < 13.1 Update 1; `cusparseSpMVOp` for >= 13.1 Update 1). ### Build from Source Clone the repository and compile the project using CMake. @@ -94,7 +95,6 @@ After building the project, the `./build/cupdlpx` binary can be invoked from the | `--eval_freq` | `int` | Termination evaluation frequency | `200` | | `--sv_max_iter` | `int` | Max iterations for singular value estimation | `5000` | | `--sv_tol` | `float` | Tolerance for singular value estimation | `1e-4` | -| `--no_spmvop` | `flag` | Disable `cusparseSpMVOp` and use classic `cusparseSpMV` path | `SpMVOp enabled` | | `--no_presolve` | `flag` | Disable presolve | `enabled` | | `-f`,`--feasibility_polishing` |`flag` | Run the polishing loop | `false` | | `--eps_feas_polish` | `double` | Relative tolerance for polishing | `1e-6` | @@ -110,7 +110,6 @@ The solver generates three text files in the specified . The f ### Python Interface The `cupdlpx` Python package supports building and solving LPs directly with `NumPy` and `SciPy`. -To control the SpMV backend in Python, set `UseSpMVOp` (alias of `use_spmvop`). Documentation and examples are available in the [Python API Guide](python/README.md). ### Julia Interface diff --git a/include/cupdlpx_types.h b/include/cupdlpx_types.h index 8befb0a..d874206 100644 --- a/include/cupdlpx_types.h +++ b/include/cupdlpx_types.h @@ -99,7 +99,6 @@ extern "C" double reflection_coefficient; bool feasibility_polishing; norm_type_t optimality_norm; - bool use_spmvop; bool presolve; double matrix_zero_tol; } pdhg_parameters_t; diff --git a/internal/cusparse_compat.h b/internal/cusparse_compat.h new file mode 100644 index 0000000..78cbbcd --- /dev/null +++ b/internal/cusparse_compat.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#if defined(CUSPARSE_VER_MAJOR) && defined(CUSPARSE_VER_MINOR) && defined(CUSPARSE_VER_PATCH) +#define CUPDLPX_CUSPARSE_GTE_13_1U1 \ + ((CUSPARSE_VER_MAJOR > 13) || \ + (CUSPARSE_VER_MAJOR == 13 && \ + (CUSPARSE_VER_MINOR > 1 || (CUSPARSE_VER_MINOR == 1 && CUSPARSE_VER_PATCH >= 1)))) +#elif defined(CUSPARSE_VERSION) +// Fallback encoding assumption: major*1000 + minor*10 + patch +#define CUPDLPX_CUSPARSE_GTE_13_1U1 (CUSPARSE_VERSION >= 13101) +#else +#define CUPDLPX_CUSPARSE_GTE_13_1U1 0 +#endif + +#define CUPDLPX_HAS_SPMVOP CUPDLPX_CUSPARSE_GTE_13_1U1 + +#if !CUPDLPX_HAS_SPMVOP +typedef void *cusparseSpMVOpDescr_t; +typedef void *cusparseSpMVOpPlan_t; +#endif diff --git a/internal/internal_types.h b/internal/internal_types.h index 12bc06a..e753b74 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -17,6 +17,7 @@ limitations under the License. #pragma once #include "cupdlpx_types.h" +#include "cusparse_compat.h" #include #include #include @@ -110,7 +111,6 @@ typedef struct double last_trial_fixed_point_error; int inner_count; int *d_inner_count; - bool use_spmvop; cusparseHandle_t sparse_handle; cublasHandle_t blas_handle; diff --git a/internal/utils.h b/internal/utils.h index e5a9599..8b2aca9 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -17,6 +17,7 @@ limitations under the License. #pragma once #include "internal_types.h" +#include "cusparse_compat.h" #include #include #include @@ -63,9 +64,6 @@ extern "C" #define THREADS_PER_BLOCK 256 - extern const double HOST_ONE; - extern const double HOST_ZERO; - void *safe_malloc(size_t size); void *safe_calloc(size_t num, size_t size); @@ -76,10 +74,34 @@ extern "C" cublasHandle_t blas_handle, const cu_sparse_matrix_csr_t *A, const cu_sparse_matrix_csr_t *AT, - bool use_spmvop, int max_iterations, double tolerance); + bool cupdlpx_use_spmvop_by_default(void); + + void cupdlpx_spmv_buffer_size(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + size_t *buffer_size); + + void cupdlpx_spmv_prepare(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + cusparseSpMVOpDescr_t *descr, + cusparseSpMVOpPlan_t *plan); + + void cupdlpx_spmv_release(cusparseSpMVOpDescr_t descr, cusparseSpMVOpPlan_t plan); + + void cupdlpx_spmv_execute(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + cusparseSpMVOpPlan_t plan); + void compute_interaction_and_movement(pdhg_solver_state_t *solver_state, double *interaction, double *movement); bool should_do_adaptive_restart(pdhg_solver_state_t *solver_state, diff --git a/python/README.md b/python/README.md index 8096589..4b53778 100644 --- a/python/README.md +++ b/python/README.md @@ -18,6 +18,9 @@ It provides a high-level, Pythonic API for constructing, modifying, and solving - An NVIDIA GPU with CUDA support (≥12.4 required) - A C/C++ toolchain with GCC and NVCC +> **SpMV backend** is selected automatically based on cuSPARSE version (`cusparseSpMV` for < 13.1 Update 1; `cusparseSpMVOp` for >= 13.1 Update 1). + + ### Install Install from PyPI: @@ -143,7 +146,6 @@ Below is a list of commonly used parameters, their internal keys, and descriptio | `OutputFlag`, `LogToConsole` | `verbose` | bool | `False` | Enable (`True`) or disable (`False`) console logging output. | | `TermCheckFreq` | `termination_evaluation_frequency` | int | `200` | Frequency (in iterations) at which termination conditions are evaluated. | | `OptimalityNorm` | `optimality_norm` | string | `"l2"` | Norm for optimality criteria. Use `"l2"` for L2 norm or `"linf"` for infinity norm. | -| `UseSpMVOp` | `use_spmvop` | bool | `True` | Use `cusparseSpMVOp` path (`True`) or fallback to classic `cusparseSpMV` (`False`). | | `OptimalityTol` | `eps_optimal_relative` | float | `1e-4` | Relative tolerance for optimality gap. Solver stops if the relative primal-dual gap ≤ this value. | | `FeasibilityTol` | `eps_feasible_relative` | float | `1e-4` | Relative feasibility tolerance for primal/dual residuals. | | `RuizIters` | `l_inf_ruiz_iterations` | int | `10` | Number of iterations for L∞ Ruiz scaling. Improves numerical conditioning. | diff --git a/python/cupdlpx/PDLP.py b/python/cupdlpx/PDLP.py index 6533f85..0dfa1f0 100644 --- a/python/cupdlpx/PDLP.py +++ b/python/cupdlpx/PDLP.py @@ -54,7 +54,6 @@ "FeasibilityPolishingTol": "eps_feas_polish_relative", # termination criteria "OptimalityNorm": "optimality_norm", - "UseSpMVOp": "use_spmvop", # singular value estimation (power method) "SVMaxIter": "sv_max_iter", "SVTol": "sv_tol", diff --git a/python_bindings/_core_bindings.cpp b/python_bindings/_core_bindings.cpp index be9e54a..0329d96 100644 --- a/python_bindings/_core_bindings.cpp +++ b/python_bindings/_core_bindings.cpp @@ -270,7 +270,6 @@ static py::dict get_default_params_py() // Termination criteria norm d["optimality_norm"] = (p.optimality_norm == NORM_TYPE_L_INF) ? "linf" : "l2"; - d["use_spmvop"] = p.use_spmvop; // power method for singular value estimation d["sv_max_iter"] = p.sv_max_iter; d["sv_tol"] = p.sv_tol; @@ -355,7 +354,6 @@ static void parse_params_from_python(py::object params_obj, pdhg_parameters_t *p // Termination criteria norm get_norm("optimality_norm", p->optimality_norm); - getb("use_spmvop", p->use_spmvop); // power method for singular value estimation geti("sv_max_iter", p->sv_max_iter); getf("sv_tol", p->sv_tol); diff --git a/src/cli.c b/src/cli.c index 82fd875..5bfa4a6 100644 --- a/src/cli.c +++ b/src/cli.c @@ -209,9 +209,6 @@ void print_usage(const char *prog_name) fprintf(stderr, " --matrix_zero_tol . " "Zero tolerance in constraint matrix.\n"); - fprintf(stderr, - " --no_spmvop " - "Use classic cusparseSpMV instead of cusparseSpMVOp (default: SpMVOp).\n"); } int main(int argc, char *argv[]) @@ -237,7 +234,6 @@ int main(int argc, char *argv[]) {"opt_norm", required_argument, 0, 1014}, {"no_presolve", no_argument, 0, 1015}, {"matrix_zero_tol", required_argument, 0, 1016}, - {"no_spmvop", no_argument, 0, 1017}, {0, 0, 0, 0}}; int opt; @@ -314,9 +310,6 @@ int main(int argc, char *argv[]) case 1016: // --matrix_zero_tol params.matrix_zero_tol = atof(optarg); break; - case 1017: // --no_spmvop - params.use_spmvop = false; - break; case '?': // Unknown option return 1; } diff --git a/src/solver.cu b/src/solver.cu index dff997f..1cc3fa4 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -599,7 +599,6 @@ static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_ state->last_trial_fixed_point_error = INFINITY; state->step_size = 0.0; state->is_this_major_iteration = false; - state->use_spmvop = params->use_spmvop; size_t primal_spmv_buffer_size; size_t dual_spmv_buffer_size; @@ -637,107 +636,27 @@ static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_ CUSPARSE_CHECK( cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); - if (state->use_spmvop) - { - CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - state->matA, - state->vec_primal_sol, - state->vec_primal_prod, - state->vec_primal_prod, - CUDA_R_64F, - &primal_spmv_buffer_size)); - - CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - state->matAt, - state->vec_dual_sol, - state->vec_dual_prod, - state->vec_dual_prod, - CUDA_R_64F, - &dual_spmv_buffer_size)); - } - else - { - CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &primal_spmv_buffer_size)); - - CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &dual_spmv_buffer_size)); - } + cupdlpx_spmv_buffer_size( + state->sparse_handle, state->matA, state->vec_primal_sol, state->vec_primal_prod, &primal_spmv_buffer_size); + cupdlpx_spmv_buffer_size( + state->sparse_handle, state->matAt, state->vec_dual_sol, state->vec_dual_prod, &dual_spmv_buffer_size); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); - if (state->use_spmvop) - { - CUSPARSE_CHECK(cusparseSpMVOp_createDescr(state->sparse_handle, - &state->primal_spmv_descr, - CUSPARSE_OPERATION_NON_TRANSPOSE, - state->matA, - state->vec_primal_sol, - state->vec_primal_prod, - state->vec_primal_prod, - CUDA_R_64F, - state->primal_spmv_buffer)); - CUSPARSE_CHECK(cusparseSpMVOp_createPlan( - state->sparse_handle, state->primal_spmv_descr, &state->primal_spmv_plan, NULL, 0)); - } - else - { - CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matA, - state->vec_primal_sol, - &HOST_ZERO, - state->vec_primal_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->primal_spmv_buffer)); - } + cupdlpx_spmv_prepare(state->sparse_handle, + state->matA, + state->vec_primal_sol, + state->vec_primal_prod, + state->primal_spmv_buffer, + &state->primal_spmv_descr, + &state->primal_spmv_plan); CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, dual_spmv_buffer_size)); - if (state->use_spmvop) - { - CUSPARSE_CHECK(cusparseSpMVOp_createDescr(state->sparse_handle, - &state->dual_spmv_descr, - CUSPARSE_OPERATION_NON_TRANSPOSE, - state->matAt, - state->vec_dual_sol, - state->vec_dual_prod, - state->vec_dual_prod, - CUDA_R_64F, - state->dual_spmv_buffer)); - CUSPARSE_CHECK( - cusparseSpMVOp_createPlan(state->sparse_handle, state->dual_spmv_descr, &state->dual_spmv_plan, NULL, 0)); - } - else - { - CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - state->matAt, - state->vec_dual_sol, - &HOST_ZERO, - state->vec_dual_prod, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - state->dual_spmv_buffer)); - } + cupdlpx_spmv_prepare(state->sparse_handle, + state->matAt, + state->vec_dual_sol, + state->vec_dual_prod, + state->dual_spmv_buffer, + &state->dual_spmv_descr, + &state->dual_spmv_plan); CUDA_CHECK(cudaMalloc(&state->d_primal_step_size, sizeof(double))); CUDA_CHECK(cudaMalloc(&state->d_dual_step_size, sizeof(double))); @@ -1190,7 +1109,6 @@ static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, c state->blas_handle, state->constraint_matrix, state->constraint_matrix_t, - params->use_spmvop, params->sv_max_iter, params->sv_tol); state->step_size = 0.998 / max_sv; @@ -1268,14 +1186,8 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_prod)); if (state->vec_dual_prod) CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); - if (state->primal_spmv_descr) - CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(state->primal_spmv_descr)); - if (state->primal_spmv_plan) - CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(state->primal_spmv_plan)); - if (state->dual_spmv_descr) - CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(state->dual_spmv_descr)); - if (state->dual_spmv_plan) - CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(state->dual_spmv_plan)); + cupdlpx_spmv_release(state->primal_spmv_descr, state->primal_spmv_plan); + cupdlpx_spmv_release(state->dual_spmv_descr, state->dual_spmv_plan); if (state->primal_spmv_buffer) CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); if (state->dual_spmv_buffer) diff --git a/src/spmv_backend.cu b/src/spmv_backend.cu new file mode 100644 index 0000000..4aab4fd --- /dev/null +++ b/src/spmv_backend.cu @@ -0,0 +1,140 @@ +/* +Copyright 2025 Haihao Lu + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "utils.h" + +static const double HOST_ONE = 1.0; +static const double HOST_ZERO = 0.0; + +bool cupdlpx_use_spmvop_by_default(void) +{ + return CUPDLPX_HAS_SPMVOP; +} + +void cupdlpx_spmv_buffer_size(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + size_t *buffer_size) +{ +#if CUPDLPX_HAS_SPMVOP + CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + mat, + vec_x, + vec_y, + vec_y, + CUDA_R_64F, + buffer_size)); +#else + CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + mat, + vec_x, + &HOST_ZERO, + vec_y, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + buffer_size)); +#endif +} + +void cupdlpx_spmv_prepare(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + cusparseSpMVOpDescr_t *descr, + cusparseSpMVOpPlan_t *plan) +{ +#if CUPDLPX_HAS_SPMVOP + CUSPARSE_CHECK(cusparseSpMVOp_createDescr(sparse_handle, + descr, + CUSPARSE_OPERATION_NON_TRANSPOSE, + mat, + vec_x, + vec_y, + vec_y, + CUDA_R_64F, + buffer)); + CUSPARSE_CHECK(cusparseSpMVOp_createPlan(sparse_handle, *descr, plan, NULL, 0)); +#else + (void)descr; + (void)plan; + CUSPARSE_CHECK(cusparseSpMV_preprocess(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + mat, + vec_x, + &HOST_ZERO, + vec_y, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + buffer)); +#endif +} + +void cupdlpx_spmv_release(cusparseSpMVOpDescr_t descr, cusparseSpMVOpPlan_t plan) +{ +#if CUPDLPX_HAS_SPMVOP + if (descr) + { + CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(descr)); + } + if (plan) + { + CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(plan)); + } +#else + (void)descr; + (void)plan; +#endif +} + +void cupdlpx_spmv_execute(cusparseHandle_t sparse_handle, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + cusparseSpMVOpPlan_t plan) +{ +#if CUPDLPX_HAS_SPMVOP + CUSPARSE_CHECK(cusparseSpMVOp(sparse_handle, plan, &HOST_ONE, &HOST_ZERO, vec_x, vec_y, vec_y)); +#else + (void)plan; + CUSPARSE_CHECK(cusparseSpMV(sparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, + mat, + vec_x, + &HOST_ZERO, + vec_y, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + buffer)); +#endif +} + +void cupdlpx_spmv(pdhg_solver_state_t *state, + cusparseSpMatDescr_t mat, + cusparseDnVecDescr_t vec_x, + cusparseDnVecDescr_t vec_y, + void *buffer, + cusparseSpMVOpPlan_t plan) +{ + cupdlpx_spmv_execute(state->sparse_handle, mat, vec_x, vec_y, buffer, plan); +} diff --git a/src/utils.cu b/src/utils.cu index 7eaa835..1d5c9c7 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -25,35 +25,6 @@ limitations under the License. std::mt19937 gen(1); std::normal_distribution dist(0.0, 1.0); -const double HOST_ONE = 1.0; -const double HOST_ZERO = 0.0; - -void cupdlpx_spmv(pdhg_solver_state_t *state, - cusparseSpMatDescr_t mat, - cusparseDnVecDescr_t vec_x, - cusparseDnVecDescr_t vec_y, - void *buffer, - cusparseSpMVOpPlan_t plan) -{ - if (state->use_spmvop) - { - CUSPARSE_CHECK(cusparseSpMVOp(state->sparse_handle, plan, &HOST_ONE, &HOST_ZERO, vec_x, vec_y, vec_y)); - } - else - { - CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &HOST_ONE, - mat, - vec_x, - &HOST_ZERO, - vec_y, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - buffer)); - } -} - void *safe_malloc(size_t size) { void *ptr = malloc(size); @@ -96,12 +67,11 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, cublasHandle_t blas_handle, const cu_sparse_matrix_csr_t *A, const cu_sparse_matrix_csr_t *AT, - bool use_spmvop, int max_iterations, double tolerance) { - int m = A->num_rows; - int n = A->num_cols; + const int m = A->num_rows; + const int n = A->num_cols; double *eigenvector_d, *next_eigenvector_d, *dual_product_d; CUDA_CHECK(cudaMalloc(&eigenvector_d, m * sizeof(double))); @@ -119,7 +89,6 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, double sigma_max_sq = 1.0; const double one = 1.0; - const double zero = 0.0; cusparseSpMatDescr_t matA, matAT; CUSPARSE_CHECK(cusparseCreateCsr(&matA, @@ -150,109 +119,25 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, CUSPARSE_CHECK(cusparseCreateDnVec(&vecNextEigen, m, next_eigenvector_d, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&vecDual, n, dual_product_d, CUDA_R_64F)); - cusparseSpMVOpDescr_t descrAT = NULL, descrA = NULL; - cusparseSpMVOpPlan_t planAT = NULL, planA = NULL; + cusparseSpMVOpDescr_t descrAT = NULL; + cusparseSpMVOpDescr_t descrA = NULL; + cusparseSpMVOpPlan_t planAT = NULL; + cusparseSpMVOpPlan_t planA = NULL; void *dBufferAT = NULL; void *dBufferA = NULL; size_t bufferSizeAT = 0, bufferSizeA = 0; - if (use_spmvop) - { - CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - matAT, - vecNextEigen, - vecDual, - vecDual, - CUDA_R_64F, - &bufferSizeAT)); - CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - matA, - vecDual, - vecEigen, - vecEigen, - CUDA_R_64F, - &bufferSizeA)); - } - else - { - CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matAT, - vecNextEigen, - &zero, - vecDual, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &bufferSizeAT)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matA, - vecDual, - &zero, - vecEigen, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - &bufferSizeA)); - } + cupdlpx_spmv_buffer_size(sparse_handle, matAT, vecNextEigen, vecDual, &bufferSizeAT); + cupdlpx_spmv_buffer_size(sparse_handle, matA, vecDual, vecEigen, &bufferSizeA); CUDA_CHECK(cudaMalloc(&dBufferAT, bufferSizeAT)); CUDA_CHECK(cudaMalloc(&dBufferA, bufferSizeA)); - if (use_spmvop) - { - CUSPARSE_CHECK(cusparseSpMVOp_createDescr(sparse_handle, - &descrAT, - CUSPARSE_OPERATION_NON_TRANSPOSE, - matAT, - vecNextEigen, - vecDual, - vecDual, - CUDA_R_64F, - dBufferAT)); - CUSPARSE_CHECK(cusparseSpMVOp_createPlan(sparse_handle, descrAT, &planAT, NULL, 0)); - - CUSPARSE_CHECK(cusparseSpMVOp_createDescr(sparse_handle, - &descrA, - CUSPARSE_OPERATION_NON_TRANSPOSE, - matA, - vecDual, - vecEigen, - vecEigen, - CUDA_R_64F, - dBufferA)); - CUSPARSE_CHECK(cusparseSpMVOp_createPlan(sparse_handle, descrA, &planA, NULL, 0)); - } - else - { - CUSPARSE_CHECK(cusparseSpMV_preprocess(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matAT, - vecNextEigen, - &zero, - vecDual, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - dBufferAT)); - CUSPARSE_CHECK(cusparseSpMV_preprocess(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matA, - vecDual, - &zero, - vecEigen, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - dBufferA)); - } + cupdlpx_spmv_prepare(sparse_handle, matAT, vecNextEigen, vecDual, dBufferAT, &descrAT, &planAT); + cupdlpx_spmv_prepare(sparse_handle, matA, vecDual, vecEigen, dBufferA, &descrA, &planA); for (int i = 0; i < max_iterations; ++i) { - CUDA_CHECK(cudaMemcpy(next_eigenvector_d, eigenvector_d, m * sizeof(double), cudaMemcpyDeviceToDevice)); double eigenvector_norm; CUBLAS_CHECK(cublasDnrm2_v2_64(blas_handle, m, next_eigenvector_d, 1, &eigenvector_norm)); @@ -260,34 +145,8 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, double inv_eigenvector_norm = 1.0 / eigenvector_norm; CUBLAS_CHECK(cublasDscal(blas_handle, m, &inv_eigenvector_norm, next_eigenvector_d, 1)); - if (use_spmvop) - { - CUSPARSE_CHECK(cusparseSpMVOp(sparse_handle, planAT, &one, &zero, vecNextEigen, vecDual, vecDual)); - CUSPARSE_CHECK(cusparseSpMVOp(sparse_handle, planA, &one, &zero, vecDual, vecEigen, vecEigen)); - } - else - { - CUSPARSE_CHECK(cusparseSpMV(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matAT, - vecNextEigen, - &zero, - vecDual, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - dBufferAT)); - CUSPARSE_CHECK(cusparseSpMV(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - matA, - vecDual, - &zero, - vecEigen, - CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, - dBufferA)); - } + cupdlpx_spmv_execute(sparse_handle, matAT, vecNextEigen, vecDual, dBufferAT, planAT); + cupdlpx_spmv_execute(sparse_handle, matA, vecDual, vecEigen, dBufferA, planA); CUBLAS_CHECK(cublasDdot(blas_handle, m, next_eigenvector_d, 1, eigenvector_d, 1, &sigma_max_sq)); @@ -304,14 +163,8 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, CUDA_CHECK(cudaFree(dBufferAT)); CUDA_CHECK(cudaFree(dBufferA)); - if (descrAT) - CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(descrAT)); - if (planAT) - CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(planAT)); - if (descrA) - CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(descrA)); - if (planA) - CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(planA)); + cupdlpx_spmv_release(descrAT, planAT); + cupdlpx_spmv_release(descrA, planA); CUSPARSE_CHECK(cusparseDestroySpMat(matA)); CUSPARSE_CHECK(cusparseDestroySpMat(matAT)); CUSPARSE_CHECK(cusparseDestroyDnVec(vecEigen)); @@ -456,8 +309,6 @@ void set_default_parameters(pdhg_parameters_t *params) params->termination_evaluation_frequency = 200; params->feasibility_polishing = false; params->reflection_coefficient = 1.0; - params->use_spmvop = true; - params->sv_max_iter = 5000; params->sv_tol = 1e-4; @@ -622,6 +473,8 @@ void print_initial_info(const pdhg_parameters_t *params, lp_problem_t *problem) printf(" time_limit : %.2f sec\n", params->termination_criteria.time_sec_limit); printf(" eps_opt : %.1e\n", params->termination_criteria.eps_optimal_relative); printf(" eps_feas : %.1e\n", params->termination_criteria.eps_feasible_relative); + printf(" spmv_backend : %s (auto)\n", + cupdlpx_use_spmvop_by_default() ? "cusparseSpMVOp" : "cusparseSpMV"); if (params->optimality_norm != default_params.optimality_norm) { printf(" optimality_norm : %s\n", params->optimality_norm == NORM_TYPE_L_INF ? "L_inf" : "L2"); @@ -637,7 +490,6 @@ void print_initial_info(const pdhg_parameters_t *params, lp_problem_t *problem) PRINT_DIFF_INT( "evaluation_freq", params->termination_evaluation_frequency, default_params.termination_evaluation_frequency); PRINT_DIFF_BOOL("feasibility_polishing", params->feasibility_polishing, default_params.feasibility_polishing); - PRINT_DIFF_BOOL("use_spmvop", params->use_spmvop, default_params.use_spmvop); PRINT_DIFF_DBL("eps_feas_polish_relative", params->termination_criteria.eps_feas_polish_relative, default_params.termination_criteria.eps_feas_polish_relative); From 57bb5080dc4bd1457962305a79ff6b9c80a5f3a2 Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Sun, 15 Mar 2026 12:52:08 -0400 Subject: [PATCH 03/12] improve SpMVOp via cupdlpx_spmv_ctx_t --- internal/cusparse_compat.h | 16 ++-- internal/internal_types.h | 19 +---- internal/utils.h | 29 ++++--- src/solver.cu | 135 ++++++++---------------------- src/spmv_backend.cu | 163 +++++++++++++++++++++++++++++++++---- src/utils.cu | 56 +++---------- 6 files changed, 216 insertions(+), 202 deletions(-) diff --git a/internal/cusparse_compat.h b/internal/cusparse_compat.h index 78cbbcd..fbb5b69 100644 --- a/internal/cusparse_compat.h +++ b/internal/cusparse_compat.h @@ -2,20 +2,14 @@ #include -#if defined(CUSPARSE_VER_MAJOR) && defined(CUSPARSE_VER_MINOR) && defined(CUSPARSE_VER_PATCH) -#define CUPDLPX_CUSPARSE_GTE_13_1U1 \ - ((CUSPARSE_VER_MAJOR > 13) || \ - (CUSPARSE_VER_MAJOR == 13 && \ - (CUSPARSE_VER_MINOR > 1 || (CUSPARSE_VER_MINOR == 1 && CUSPARSE_VER_PATCH >= 1)))) -#elif defined(CUSPARSE_VERSION) -// Fallback encoding assumption: major*1000 + minor*10 + patch -#define CUPDLPX_CUSPARSE_GTE_13_1U1 (CUSPARSE_VERSION >= 13101) +// cusparseSpMVOp_bufferSize was introduced in cuSPARSE 12.7.3 (CUDA 13.1 Update 1). +// CUSPARSE_VERSION encoding: major*1000 + minor*100 + patch. +#if defined(CUSPARSE_VERSION) && CUSPARSE_VERSION >= 12703 +#define CUPDLPX_HAS_SPMVOP 1 #else -#define CUPDLPX_CUSPARSE_GTE_13_1U1 0 +#define CUPDLPX_HAS_SPMVOP 0 #endif -#define CUPDLPX_HAS_SPMVOP CUPDLPX_CUSPARSE_GTE_13_1U1 - #if !CUPDLPX_HAS_SPMVOP typedef void *cusparseSpMVOpDescr_t; typedef void *cusparseSpMVOpPlan_t; diff --git a/internal/internal_types.h b/internal/internal_types.h index e753b74..87ba812 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -114,24 +114,7 @@ typedef struct cusparseHandle_t sparse_handle; cublasHandle_t blas_handle; - size_t spmv_buffer_size; - size_t primal_spmv_buffer_size; - size_t dual_spmv_buffer_size; - void *primal_spmv_buffer; - void *dual_spmv_buffer; - void *spmv_buffer; - - cusparseSpMatDescr_t matA; - cusparseSpMatDescr_t matAt; - cusparseDnVecDescr_t vec_primal_sol; - cusparseDnVecDescr_t vec_dual_sol; - cusparseDnVecDescr_t vec_primal_prod; - cusparseDnVecDescr_t vec_dual_prod; - - cusparseSpMVOpDescr_t primal_spmv_descr; - cusparseSpMVOpPlan_t primal_spmv_plan; - cusparseSpMVOpDescr_t dual_spmv_descr; - cusparseSpMVOpPlan_t dual_spmv_plan; + void *spmv_ctx; double *ones_primal_d; double *ones_dual_d; diff --git a/internal/utils.h b/internal/utils.h index 8b2aca9..e1715c3 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -90,17 +90,31 @@ extern "C" cusparseDnVecDescr_t vec_x, cusparseDnVecDescr_t vec_y, void *buffer, - cusparseSpMVOpDescr_t *descr, - cusparseSpMVOpPlan_t *plan); + void **descr, + void **plan); - void cupdlpx_spmv_release(cusparseSpMVOpDescr_t descr, cusparseSpMVOpPlan_t plan); + void cupdlpx_spmv_release(void *descr, void *plan); void cupdlpx_spmv_execute(cusparseHandle_t sparse_handle, cusparseSpMatDescr_t mat, cusparseDnVecDescr_t vec_x, cusparseDnVecDescr_t vec_y, void *buffer, - cusparseSpMVOpPlan_t plan); + void *plan); + + void *cupdlpx_spmv_ctx_create(cusparseHandle_t sparse_handle, + const cu_sparse_matrix_csr_t *A, + const cu_sparse_matrix_csr_t *AT, + const double *ax_x_init, + double *ax_y_init, + const double *atx_x_init, + double *atx_y_init); + + void cupdlpx_spmv_ctx_destroy(void *ctx); + + void cupdlpx_spmv_Ax(cusparseHandle_t sparse_handle, void *ctx, const double *x, double *y); + + void cupdlpx_spmv_ATx(cusparseHandle_t sparse_handle, void *ctx, const double *x, double *y); void compute_interaction_and_movement(pdhg_solver_state_t *solver_state, double *interaction, double *movement); @@ -155,13 +169,6 @@ extern "C" void set_default_parameters(pdhg_parameters_t *params); - void cupdlpx_spmv(pdhg_solver_state_t *state, - cusparseSpMatDescr_t mat, - cusparseDnVecDescr_t vec_x, - cusparseDnVecDescr_t vec_y, - void *buffer, - cusparseSpMVOpPlan_t plan); - #ifdef __cplusplus } diff --git a/src/solver.cu b/src/solver.cu index 1cc3fa4..185f1e5 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -600,64 +600,6 @@ static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_ state->step_size = 0.0; state->is_this_major_iteration = false; - size_t primal_spmv_buffer_size; - size_t dual_spmv_buffer_size; - - CUSPARSE_CHECK(cusparseCreateCsr(&state->matA, - state->num_constraints, - state->num_variables, - state->constraint_matrix->num_nonzeros, - state->constraint_matrix->row_ptr, - state->constraint_matrix->col_ind, - state->constraint_matrix->val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F)); - CUDA_CHECK(cudaGetLastError()); - - CUSPARSE_CHECK(cusparseCreateCsr(&state->matAt, - state->num_variables, - state->num_constraints, - state->constraint_matrix_t->num_nonzeros, - state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, - state->constraint_matrix_t->val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F)); - CUDA_CHECK(cudaGetLastError()); - - CUSPARSE_CHECK( - cusparseCreateDnVec(&state->vec_primal_sol, state->num_variables, state->pdhg_primal_solution, CUDA_R_64F)); - CUSPARSE_CHECK( - cusparseCreateDnVec(&state->vec_dual_sol, state->num_constraints, state->pdhg_dual_solution, CUDA_R_64F)); - CUSPARSE_CHECK( - cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); - cupdlpx_spmv_buffer_size( - state->sparse_handle, state->matA, state->vec_primal_sol, state->vec_primal_prod, &primal_spmv_buffer_size); - cupdlpx_spmv_buffer_size( - state->sparse_handle, state->matAt, state->vec_dual_sol, state->vec_dual_prod, &dual_spmv_buffer_size); - CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); - cupdlpx_spmv_prepare(state->sparse_handle, - state->matA, - state->vec_primal_sol, - state->vec_primal_prod, - state->primal_spmv_buffer, - &state->primal_spmv_descr, - &state->primal_spmv_plan); - - CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, dual_spmv_buffer_size)); - cupdlpx_spmv_prepare(state->sparse_handle, - state->matAt, - state->vec_dual_sol, - state->vec_dual_prod, - state->dual_spmv_buffer, - &state->dual_spmv_descr, - &state->dual_spmv_plan); - CUDA_CHECK(cudaMalloc(&state->d_primal_step_size, sizeof(double))); CUDA_CHECK(cudaMalloc(&state->d_dual_step_size, sizeof(double))); CUDA_CHECK(cudaMalloc(&state->d_inner_count, sizeof(int))); @@ -687,6 +629,14 @@ static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_ CUSPARSE_CHECK(cusparseSetStream(state->sparse_handle, state->stream)); CUBLAS_CHECK(cublasSetStream(state->blas_handle, state->stream)); + state->spmv_ctx = cupdlpx_spmv_ctx_create(state->sparse_handle, + state->constraint_matrix, + state->constraint_matrix_t, + state->pdhg_primal_solution, + state->primal_product, + state->pdhg_dual_solution, + state->dual_product); + free(ones_dual_h); if (params->verbose) { @@ -937,11 +887,7 @@ static void compute_next_primal_solution(pdhg_solver_state_t *state, const double reflection_coefficient, bool is_major) { - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->current_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - cupdlpx_spmv( - state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->current_dual_solution, state->dual_product); double step = state->step_size / state->primal_weight; @@ -986,15 +932,7 @@ static void compute_next_dual_solution(pdhg_solver_state_t *state, const double reflection_coefficient, bool is_major) { - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->reflected_primal_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); - - cupdlpx_spmv(state, - state->matA, - state->vec_primal_sol, - state->vec_primal_prod, - state->primal_spmv_buffer, - state->primal_spmv_plan); + cupdlpx_spmv_Ax(state->sparse_handle, state->spmv_ctx, state->reflected_primal_solution, state->primal_product); double step = state->step_size * state->primal_weight; @@ -1137,11 +1075,7 @@ static void compute_fixed_point_error(pdhg_solver_state_t *state) state->num_variables, state->num_constraints); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->delta_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - cupdlpx_spmv( - state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->delta_dual_solution, state->dual_product); double interaction, movement; @@ -1174,28 +1108,14 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) return; } - if (state->matA) - CUSPARSE_CHECK(cusparseDestroySpMat(state->matA)); - if (state->matAt) - CUSPARSE_CHECK(cusparseDestroySpMat(state->matAt)); - if (state->vec_primal_sol) - CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_sol)); - if (state->vec_dual_sol) - CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_sol)); - if (state->vec_primal_prod) - CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_prod)); - if (state->vec_dual_prod) - CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); - cupdlpx_spmv_release(state->primal_spmv_descr, state->primal_spmv_plan); - cupdlpx_spmv_release(state->dual_spmv_descr, state->dual_spmv_plan); - if (state->primal_spmv_buffer) - CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); - if (state->dual_spmv_buffer) - CUDA_CHECK(cudaFree(state->dual_spmv_buffer)); + if (state->spmv_ctx) + cupdlpx_spmv_ctx_destroy(state->spmv_ctx); if (state->sparse_handle) CUSPARSE_CHECK(cusparseDestroy(state->sparse_handle)); if (state->blas_handle) CUBLAS_CHECK(cublasDestroy(state->blas_handle)); + if (state->stream) + CUDA_CHECK(cudaStreamDestroy(state->stream)); if (state->variable_lower_bound) CUDA_CHECK(cudaFree(state->variable_lower_bound)); @@ -1308,11 +1228,7 @@ static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state, co cupdlpx_result_t *results = (cupdlpx_result_t *)safe_calloc(1, sizeof(cupdlpx_result_t)); // Compute reduced cost - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - cupdlpx_spmv( - state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->pdhg_dual_solution, state->dual_product); compute_and_rescale_reduced_cost_kernel<<num_blocks_primal, THREADS_PER_BLOCK, 0, state->stream>>>( state->dual_slack, @@ -1664,6 +1580,14 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(const pdhg_solve primal_state->relative_objective_gap = 0.0; primal_state->objective_gap = 0.0; + primal_state->spmv_ctx = cupdlpx_spmv_ctx_create(primal_state->sparse_handle, + primal_state->constraint_matrix, + primal_state->constraint_matrix_t, + primal_state->pdhg_primal_solution, + primal_state->primal_product, + primal_state->pdhg_dual_solution, + primal_state->dual_product); + return primal_state; } @@ -1695,6 +1619,7 @@ void primal_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->dual_residual); SAFE_CUDA_FREE(state->delta_primal_solution); SAFE_CUDA_FREE(state->delta_dual_solution); + cupdlpx_spmv_ctx_destroy(state->spmv_ctx); free(state); } @@ -1796,6 +1721,15 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(const pdhg_solver_ dual_state->absolute_primal_residual = 0.0; dual_state->relative_objective_gap = 0.0; dual_state->objective_gap = 0.0; + + dual_state->spmv_ctx = cupdlpx_spmv_ctx_create(dual_state->sparse_handle, + dual_state->constraint_matrix, + dual_state->constraint_matrix_t, + dual_state->pdhg_primal_solution, + dual_state->primal_product, + dual_state->pdhg_dual_solution, + dual_state->dual_product); + return dual_state; } @@ -1837,6 +1771,7 @@ void dual_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->dual_residual); SAFE_CUDA_FREE(state->delta_primal_solution); SAFE_CUDA_FREE(state->delta_dual_solution); + cupdlpx_spmv_ctx_destroy(state->spmv_ctx); free(state); } diff --git a/src/spmv_backend.cu b/src/spmv_backend.cu index 4aab4fd..d312987 100644 --- a/src/spmv_backend.cu +++ b/src/spmv_backend.cu @@ -19,6 +19,22 @@ limitations under the License. static const double HOST_ONE = 1.0; static const double HOST_ZERO = 0.0; +typedef struct +{ + cusparseSpMatDescr_t matA; + cusparseSpMatDescr_t matAT; + cusparseDnVecDescr_t vec_ax_x; + cusparseDnVecDescr_t vec_ax_y; + cusparseDnVecDescr_t vec_atx_x; + cusparseDnVecDescr_t vec_atx_y; + void *ax_buffer; + void *atx_buffer; + void *ax_descr; + void *ax_plan; + void *atx_descr; + void *atx_plan; +} cupdlpx_spmv_ctx_t; + bool cupdlpx_use_spmvop_by_default(void) { return CUPDLPX_HAS_SPMVOP; @@ -58,12 +74,14 @@ void cupdlpx_spmv_prepare(cusparseHandle_t sparse_handle, cusparseDnVecDescr_t vec_x, cusparseDnVecDescr_t vec_y, void *buffer, - cusparseSpMVOpDescr_t *descr, - cusparseSpMVOpPlan_t *plan) + void **descr, + void **plan) { #if CUPDLPX_HAS_SPMVOP + cusparseSpMVOpDescr_t local_descr = NULL; + cusparseSpMVOpPlan_t local_plan = NULL; CUSPARSE_CHECK(cusparseSpMVOp_createDescr(sparse_handle, - descr, + &local_descr, CUSPARSE_OPERATION_NON_TRANSPOSE, mat, vec_x, @@ -71,7 +89,9 @@ void cupdlpx_spmv_prepare(cusparseHandle_t sparse_handle, vec_y, CUDA_R_64F, buffer)); - CUSPARSE_CHECK(cusparseSpMVOp_createPlan(sparse_handle, *descr, plan, NULL, 0)); + CUSPARSE_CHECK(cusparseSpMVOp_createPlan(sparse_handle, local_descr, &local_plan, NULL, 0)); + *descr = (void *)local_descr; + *plan = (void *)local_plan; #else (void)descr; (void)plan; @@ -88,16 +108,16 @@ void cupdlpx_spmv_prepare(cusparseHandle_t sparse_handle, #endif } -void cupdlpx_spmv_release(cusparseSpMVOpDescr_t descr, cusparseSpMVOpPlan_t plan) +void cupdlpx_spmv_release(void *descr, void *plan) { #if CUPDLPX_HAS_SPMVOP if (descr) { - CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr(descr)); + CUSPARSE_CHECK(cusparseSpMVOp_destroyDescr((cusparseSpMVOpDescr_t)descr)); } if (plan) { - CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan(plan)); + CUSPARSE_CHECK(cusparseSpMVOp_destroyPlan((cusparseSpMVOpPlan_t)plan)); } #else (void)descr; @@ -110,10 +130,13 @@ void cupdlpx_spmv_execute(cusparseHandle_t sparse_handle, cusparseDnVecDescr_t vec_x, cusparseDnVecDescr_t vec_y, void *buffer, - cusparseSpMVOpPlan_t plan) + void *plan) { #if CUPDLPX_HAS_SPMVOP - CUSPARSE_CHECK(cusparseSpMVOp(sparse_handle, plan, &HOST_ONE, &HOST_ZERO, vec_x, vec_y, vec_y)); + (void)mat; + (void)buffer; + CUSPARSE_CHECK( + cusparseSpMVOp(sparse_handle, (cusparseSpMVOpPlan_t)plan, &HOST_ONE, &HOST_ZERO, vec_x, vec_y, vec_y)); #else (void)plan; CUSPARSE_CHECK(cusparseSpMV(sparse_handle, @@ -129,12 +152,120 @@ void cupdlpx_spmv_execute(cusparseHandle_t sparse_handle, #endif } -void cupdlpx_spmv(pdhg_solver_state_t *state, - cusparseSpMatDescr_t mat, - cusparseDnVecDescr_t vec_x, - cusparseDnVecDescr_t vec_y, - void *buffer, - cusparseSpMVOpPlan_t plan) +void *cupdlpx_spmv_ctx_create(cusparseHandle_t sparse_handle, + const cu_sparse_matrix_csr_t *A, + const cu_sparse_matrix_csr_t *AT, + const double *ax_x_init, + double *ax_y_init, + const double *atx_x_init, + double *atx_y_init) +{ + cupdlpx_spmv_ctx_t *ctx = (cupdlpx_spmv_ctx_t *)safe_calloc(1, sizeof(cupdlpx_spmv_ctx_t)); + size_t ax_buffer_size = 0; + size_t atx_buffer_size = 0; + + CUSPARSE_CHECK(cusparseCreateCsr(&ctx->matA, + A->num_rows, + A->num_cols, + A->num_nonzeros, + A->row_ptr, + A->col_ind, + A->val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + + CUSPARSE_CHECK(cusparseCreateCsr(&ctx->matAT, + AT->num_rows, + AT->num_cols, + AT->num_nonzeros, + AT->row_ptr, + AT->col_ind, + AT->val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + + CUSPARSE_CHECK(cusparseCreateDnVec(&ctx->vec_ax_x, A->num_cols, (void *)ax_x_init, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&ctx->vec_ax_y, A->num_rows, ax_y_init, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&ctx->vec_atx_x, AT->num_cols, (void *)atx_x_init, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&ctx->vec_atx_y, AT->num_rows, atx_y_init, CUDA_R_64F)); + + cupdlpx_spmv_buffer_size(sparse_handle, ctx->matA, ctx->vec_ax_x, ctx->vec_ax_y, &ax_buffer_size); + cupdlpx_spmv_buffer_size(sparse_handle, ctx->matAT, ctx->vec_atx_x, ctx->vec_atx_y, &atx_buffer_size); + CUDA_CHECK(cudaMalloc(&ctx->ax_buffer, ax_buffer_size)); + CUDA_CHECK(cudaMalloc(&ctx->atx_buffer, atx_buffer_size)); + + cupdlpx_spmv_prepare( + sparse_handle, ctx->matA, ctx->vec_ax_x, ctx->vec_ax_y, ctx->ax_buffer, &ctx->ax_descr, &ctx->ax_plan); + cupdlpx_spmv_prepare( + sparse_handle, ctx->matAT, ctx->vec_atx_x, ctx->vec_atx_y, ctx->atx_buffer, &ctx->atx_descr, &ctx->atx_plan); + + return (void *)ctx; +} + +void cupdlpx_spmv_ctx_destroy(void *ctx_void) +{ + cupdlpx_spmv_ctx_t *ctx = (cupdlpx_spmv_ctx_t *)ctx_void; + if (ctx == NULL) + { + return; + } + + cupdlpx_spmv_release(ctx->ax_descr, ctx->ax_plan); + cupdlpx_spmv_release(ctx->atx_descr, ctx->atx_plan); + + if (ctx->ax_buffer) + { + CUDA_CHECK(cudaFree(ctx->ax_buffer)); + } + if (ctx->atx_buffer) + { + CUDA_CHECK(cudaFree(ctx->atx_buffer)); + } + + if (ctx->vec_ax_x) + { + CUSPARSE_CHECK(cusparseDestroyDnVec(ctx->vec_ax_x)); + } + if (ctx->vec_ax_y) + { + CUSPARSE_CHECK(cusparseDestroyDnVec(ctx->vec_ax_y)); + } + if (ctx->vec_atx_x) + { + CUSPARSE_CHECK(cusparseDestroyDnVec(ctx->vec_atx_x)); + } + if (ctx->vec_atx_y) + { + CUSPARSE_CHECK(cusparseDestroyDnVec(ctx->vec_atx_y)); + } + if (ctx->matA) + { + CUSPARSE_CHECK(cusparseDestroySpMat(ctx->matA)); + } + if (ctx->matAT) + { + CUSPARSE_CHECK(cusparseDestroySpMat(ctx->matAT)); + } + + free(ctx); +} + +void cupdlpx_spmv_Ax(cusparseHandle_t sparse_handle, void *ctx_void, const double *x, double *y) +{ + cupdlpx_spmv_ctx_t *ctx = (cupdlpx_spmv_ctx_t *)ctx_void; + CUSPARSE_CHECK(cusparseDnVecSetValues(ctx->vec_ax_x, (void *)x)); + CUSPARSE_CHECK(cusparseDnVecSetValues(ctx->vec_ax_y, y)); + cupdlpx_spmv_execute(sparse_handle, ctx->matA, ctx->vec_ax_x, ctx->vec_ax_y, ctx->ax_buffer, ctx->ax_plan); +} + +void cupdlpx_spmv_ATx(cusparseHandle_t sparse_handle, void *ctx_void, const double *x, double *y) { - cupdlpx_spmv_execute(state->sparse_handle, mat, vec_x, vec_y, buffer, plan); + cupdlpx_spmv_ctx_t *ctx = (cupdlpx_spmv_ctx_t *)ctx_void; + CUSPARSE_CHECK(cusparseDnVecSetValues(ctx->vec_atx_x, (void *)x)); + CUSPARSE_CHECK(cusparseDnVecSetValues(ctx->vec_atx_y, y)); + cupdlpx_spmv_execute(sparse_handle, ctx->matAT, ctx->vec_atx_x, ctx->vec_atx_y, ctx->atx_buffer, ctx->atx_plan); } diff --git a/src/utils.cu b/src/utils.cu index 1d5c9c7..c2cdc45 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -119,10 +119,10 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle, CUSPARSE_CHECK(cusparseCreateDnVec(&vecNextEigen, m, next_eigenvector_d, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&vecDual, n, dual_product_d, CUDA_R_64F)); - cusparseSpMVOpDescr_t descrAT = NULL; - cusparseSpMVOpDescr_t descrA = NULL; - cusparseSpMVOpPlan_t planAT = NULL; - cusparseSpMVOpPlan_t planA = NULL; + void *descrAT = NULL; + void *descrA = NULL; + void *planAT = NULL; + void *planA = NULL; void *dBufferAT = NULL; void *dBufferA = NULL; @@ -750,20 +750,8 @@ static double get_vector_sum(cublasHandle_t handle, int n, double *ones_d, const void compute_residual(pdhg_solver_state_t *state, norm_type_t optimality_norm) { - cusparseDnVecSetValues(state->vec_primal_sol, state->pdhg_primal_solution); - cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution); - cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product); - cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product); - - cupdlpx_spmv(state, - state->matA, - state->vec_primal_sol, - state->vec_primal_prod, - state->primal_spmv_buffer, - state->primal_spmv_plan); - - cupdlpx_spmv( - state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); + cupdlpx_spmv_Ax(state->sparse_handle, state->spmv_ctx, state->pdhg_primal_solution, state->primal_product); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->pdhg_dual_solution, state->dual_product); compute_residual_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK, 0, state->stream>>>( state->primal_residual, @@ -863,20 +851,8 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) double dual_ray_inf_norm = get_vector_inf_norm(state->blas_handle, state->num_constraints, state->delta_dual_solution); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->delta_primal_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->delta_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - cupdlpx_spmv(state, - state->matA, - state->vec_primal_sol, - state->vec_primal_prod, - state->primal_spmv_buffer, - state->primal_spmv_plan); - - cupdlpx_spmv( - state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); + cupdlpx_spmv_Ax(state->sparse_handle, state->spmv_ctx, state->delta_primal_solution, state->primal_product); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->delta_dual_solution, state->dual_product); CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, @@ -1257,15 +1233,7 @@ void compute_primal_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm) { - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->pdhg_primal_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); - - cupdlpx_spmv(state, - state->matA, - state->vec_primal_sol, - state->vec_primal_prod, - state->primal_spmv_buffer, - state->primal_spmv_plan); + cupdlpx_spmv_Ax(state->sparse_handle, state->spmv_ctx, state->pdhg_primal_solution, state->primal_product); compute_primal_feas_polish_residual_kernel<<num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>( state->primal_residual, @@ -1306,11 +1274,7 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm) { - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution)); - CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); - - cupdlpx_spmv( - state, state->matAt, state->vec_dual_sol, state->vec_dual_prod, state->dual_spmv_buffer, state->dual_spmv_plan); + cupdlpx_spmv_ATx(state->sparse_handle, state->spmv_ctx, state->pdhg_dual_solution, state->dual_product); compute_dual_feas_polish_residual_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( state->dual_residual, From 53eb063a91453f7982f6cf7bf620d57288ceffae Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Sun, 15 Mar 2026 13:08:34 -0400 Subject: [PATCH 04/12] update feasibility polishing spmv --- src/feasibility_polish.cu | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/feasibility_polish.cu b/src/feasibility_polish.cu index 5f07e54..656c8e5 100644 --- a/src/feasibility_polish.cu +++ b/src/feasibility_polish.cu @@ -341,6 +341,14 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(const pdhg_solve primal_state->relative_objective_gap = 0.0; primal_state->objective_gap = 0.0; + primal_state->spmv_ctx = cupdlpx_spmv_ctx_create(primal_state->sparse_handle, + primal_state->constraint_matrix, + primal_state->constraint_matrix_t, + primal_state->pdhg_primal_solution, + primal_state->primal_product, + primal_state->pdhg_dual_solution, + primal_state->dual_product); + return primal_state; } @@ -372,6 +380,7 @@ void primal_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->dual_residual); SAFE_CUDA_FREE(state->delta_primal_solution); SAFE_CUDA_FREE(state->delta_dual_solution); + cupdlpx_spmv_ctx_destroy(state->spmv_ctx); free(state); } @@ -473,6 +482,15 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(const pdhg_solver_ dual_state->absolute_primal_residual = 0.0; dual_state->relative_objective_gap = 0.0; dual_state->objective_gap = 0.0; + + dual_state->spmv_ctx = cupdlpx_spmv_ctx_create(dual_state->sparse_handle, + dual_state->constraint_matrix, + dual_state->constraint_matrix_t, + dual_state->pdhg_primal_solution, + dual_state->primal_product, + dual_state->pdhg_dual_solution, + dual_state->dual_product); + return dual_state; } @@ -514,6 +532,7 @@ void dual_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->dual_residual); SAFE_CUDA_FREE(state->delta_primal_solution); SAFE_CUDA_FREE(state->delta_dual_solution); + cupdlpx_spmv_ctx_destroy(state->spmv_ctx); free(state); } From 7c1aca225744ed703bad9ab734a07371b82c1e8f Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Sun, 15 Mar 2026 13:14:07 -0400 Subject: [PATCH 05/12] Apply formatter --- internal/utils.h | 2 +- src/feasibility_polish.cu | 24 ++++++++++++------------ src/spmv_backend.cu | 21 ++++----------------- src/utils.cu | 3 +-- 4 files changed, 18 insertions(+), 32 deletions(-) diff --git a/internal/utils.h b/internal/utils.h index e1715c3..c31fd62 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -16,8 +16,8 @@ limitations under the License. #pragma once -#include "internal_types.h" #include "cusparse_compat.h" +#include "internal_types.h" #include #include #include diff --git a/src/feasibility_polish.cu b/src/feasibility_polish.cu index 656c8e5..879d241 100644 --- a/src/feasibility_polish.cu +++ b/src/feasibility_polish.cu @@ -342,12 +342,12 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(const pdhg_solve primal_state->objective_gap = 0.0; primal_state->spmv_ctx = cupdlpx_spmv_ctx_create(primal_state->sparse_handle, - primal_state->constraint_matrix, - primal_state->constraint_matrix_t, - primal_state->pdhg_primal_solution, - primal_state->primal_product, - primal_state->pdhg_dual_solution, - primal_state->dual_product); + primal_state->constraint_matrix, + primal_state->constraint_matrix_t, + primal_state->pdhg_primal_solution, + primal_state->primal_product, + primal_state->pdhg_dual_solution, + primal_state->dual_product); return primal_state; } @@ -484,12 +484,12 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(const pdhg_solver_ dual_state->objective_gap = 0.0; dual_state->spmv_ctx = cupdlpx_spmv_ctx_create(dual_state->sparse_handle, - dual_state->constraint_matrix, - dual_state->constraint_matrix_t, - dual_state->pdhg_primal_solution, - dual_state->primal_product, - dual_state->pdhg_dual_solution, - dual_state->dual_product); + dual_state->constraint_matrix, + dual_state->constraint_matrix_t, + dual_state->pdhg_primal_solution, + dual_state->primal_product, + dual_state->pdhg_dual_solution, + dual_state->dual_product); return dual_state; } diff --git a/src/spmv_backend.cu b/src/spmv_backend.cu index d312987..4b7618a 100644 --- a/src/spmv_backend.cu +++ b/src/spmv_backend.cu @@ -47,14 +47,8 @@ void cupdlpx_spmv_buffer_size(cusparseHandle_t sparse_handle, size_t *buffer_size) { #if CUPDLPX_HAS_SPMVOP - CUSPARSE_CHECK(cusparseSpMVOp_bufferSize(sparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - mat, - vec_x, - vec_y, - vec_y, - CUDA_R_64F, - buffer_size)); + CUSPARSE_CHECK(cusparseSpMVOp_bufferSize( + sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mat, vec_x, vec_y, vec_y, CUDA_R_64F, buffer_size)); #else CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -80,15 +74,8 @@ void cupdlpx_spmv_prepare(cusparseHandle_t sparse_handle, #if CUPDLPX_HAS_SPMVOP cusparseSpMVOpDescr_t local_descr = NULL; cusparseSpMVOpPlan_t local_plan = NULL; - CUSPARSE_CHECK(cusparseSpMVOp_createDescr(sparse_handle, - &local_descr, - CUSPARSE_OPERATION_NON_TRANSPOSE, - mat, - vec_x, - vec_y, - vec_y, - CUDA_R_64F, - buffer)); + CUSPARSE_CHECK(cusparseSpMVOp_createDescr( + sparse_handle, &local_descr, CUSPARSE_OPERATION_NON_TRANSPOSE, mat, vec_x, vec_y, vec_y, CUDA_R_64F, buffer)); CUSPARSE_CHECK(cusparseSpMVOp_createPlan(sparse_handle, local_descr, &local_plan, NULL, 0)); *descr = (void *)local_descr; *plan = (void *)local_plan; diff --git a/src/utils.cu b/src/utils.cu index 745351d..5f9f303 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -473,8 +473,7 @@ void print_initial_info(const pdhg_parameters_t *params, lp_problem_t *problem) printf(" time_limit : %.2f sec\n", params->termination_criteria.time_sec_limit); printf(" eps_opt : %.1e\n", params->termination_criteria.eps_optimal_relative); printf(" eps_feas : %.1e\n", params->termination_criteria.eps_feasible_relative); - printf(" spmv_backend : %s (auto)\n", - cupdlpx_use_spmvop_by_default() ? "cusparseSpMVOp" : "cusparseSpMV"); + printf(" spmv_backend : %s (auto)\n", cupdlpx_use_spmvop_by_default() ? "cusparseSpMVOp" : "cusparseSpMV"); if (params->optimality_norm != default_params.optimality_norm) { printf(" optimality_norm : %s\n", params->optimality_norm == NORM_TYPE_L_INF ? "L_inf" : "L2"); From 66b196077510fc02c023f6f4a3c527ca0ca11caf Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Sun, 15 Mar 2026 14:50:37 -0400 Subject: [PATCH 06/12] fix spmvop version issue --- internal/cusparse_compat.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/cusparse_compat.h b/internal/cusparse_compat.h index fbb5b69..9eaecb2 100644 --- a/internal/cusparse_compat.h +++ b/internal/cusparse_compat.h @@ -11,6 +11,11 @@ #endif #if !CUPDLPX_HAS_SPMVOP +// The SpMVOp types were added to cusparse.h before the functions +// (e.g. CUDA 13.1 base has the types but not the functions). +// Only provide fallback typedefs for cuSPARSE versions that lack them entirely. +#if !defined(CUSPARSE_VERSION) || CUSPARSE_VERSION < 12700 typedef void *cusparseSpMVOpDescr_t; typedef void *cusparseSpMVOpPlan_t; #endif +#endif From 97a78d6df097441a3863d22fa121f3d968f42702 Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Sun, 15 Mar 2026 14:57:03 -0400 Subject: [PATCH 07/12] add more CUDA versions to CI --- .github/workflows/build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 90d3e7b..a63e9d7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -13,6 +13,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest] + cuda: ["12.4.0", "12.5.0", "12.6.0", "12.8.0", "12.9.0", "13.0.0", "13.1.0"] runs-on: ${{ matrix.os }} steps: @@ -21,7 +22,7 @@ jobs: - uses: Jimver/cuda-toolkit@v0.2.30 id: cuda-toolkit with: - cuda: "13.1.0" + cuda: ${{ matrix.cuda }} - name: CUDA info run: | From 4cb718be53ee7fc713dfce274323d3ee416c0b59 Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Sun, 15 Mar 2026 15:24:16 -0400 Subject: [PATCH 08/12] CI: update cuda-toolkit setting --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a63e9d7..01cb9c6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,6 +23,8 @@ jobs: id: cuda-toolkit with: cuda: ${{ matrix.cuda }} + method: 'network' + log-file-suffix: '${{ matrix.os }}-${{ matrix.cuda }}.txt' - name: CUDA info run: | From bbe334e724d9145257bda5d143fcd0ccb33e4937 Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Sun, 15 Mar 2026 16:07:08 -0400 Subject: [PATCH 09/12] update build.yml --- .github/workflows/build.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 01cb9c6..a63e9d7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,8 +23,6 @@ jobs: id: cuda-toolkit with: cuda: ${{ matrix.cuda }} - method: 'network' - log-file-suffix: '${{ matrix.os }}-${{ matrix.cuda }}.txt' - name: CUDA info run: | From 5f6745df226772235ce3bab7e378660eb628f3ba Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Sun, 15 Mar 2026 16:29:50 -0400 Subject: [PATCH 10/12] update build.yml --- .github/workflows/build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a63e9d7..8ec2c21 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,6 +23,7 @@ jobs: id: cuda-toolkit with: cuda: ${{ matrix.cuda }} + linux-local-args: '["--toolkit"]' - name: CUDA info run: | From fb1d003ec22ec8d7203ce41116b75eb35d96d9e8 Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Mon, 16 Mar 2026 21:06:35 -0400 Subject: [PATCH 11/12] update SpMV backend in readme --- README.md | 4 +++- python/README.md | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d52e296..919e8e0 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,9 @@ Our work is presented in two papers: * **GPU:** NVIDIA GPU with CUDA 12.4+. * **Build Tools:** CMake (≥ 3.20), GCC, NVCC. -> **SpMV backend** is selected automatically based on cuSPARSE version (`cusparseSpMV` for < 13.1 Update 1; `cusparseSpMVOp` for >= 13.1 Update 1). +> **SpMV backend** is selected automatically at compile time based on cuSPARSE version: +> - `cusparseSpMV` — CUDA 12.4 – 13.1 (cuSPARSE < 12.7.3) +> - `cusparseSpMVOp` — CUDA 13.1 Update 1+ (cuSPARSE ≥ 12.7.3) ### Build from Source Clone the repository and compile the project using CMake. diff --git a/python/README.md b/python/README.md index 4b53778..47d0a66 100644 --- a/python/README.md +++ b/python/README.md @@ -18,7 +18,9 @@ It provides a high-level, Pythonic API for constructing, modifying, and solving - An NVIDIA GPU with CUDA support (≥12.4 required) - A C/C++ toolchain with GCC and NVCC -> **SpMV backend** is selected automatically based on cuSPARSE version (`cusparseSpMV` for < 13.1 Update 1; `cusparseSpMVOp` for >= 13.1 Update 1). +> **SpMV backend** is selected automatically at compile time based on cuSPARSE version: +> - `cusparseSpMV` — CUDA 12.4 – 13.1 (cuSPARSE < 12.7.3) +> - `cusparseSpMVOp` — CUDA 13.1 Update 1+ (cuSPARSE ≥ 12.7.3) ### Install From acae20408bd9e39e32d852b9e96db379eccfb63b Mon Sep 17 00:00:00 2001 From: ZedongPeng Date: Mon, 16 Mar 2026 21:06:56 -0400 Subject: [PATCH 12/12] update version to v0.2.8 --- CMakeLists.txt | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8bcf546..2edf592 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(cupdlpx LANGUAGES C CXX CUDA) set(CUPDLPX_VERSION_MAJOR 0) set(CUPDLPX_VERSION_MINOR 2) -set(CUPDLPX_VERSION_PATCH 7) +set(CUPDLPX_VERSION_PATCH 8) set(CUPDLPX_VERSION "${CUPDLPX_VERSION_MAJOR}.${CUPDLPX_VERSION_MINOR}.${CUPDLPX_VERSION_PATCH}") add_compile_definitions(CUPDLPX_VERSION="${CUPDLPX_VERSION}") diff --git a/pyproject.toml b/pyproject.toml index 1d3ada0..35f3ebf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "cupdlpx" -version = "0.2.7" +version = "0.2.8" description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)" readme = "README.md" license = { text = "Apache-2.0" }