diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ff35d191..c1aee584 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -3,7 +3,6 @@ name: Code linting on: push: branches: - - main pull_request: jobs: diff --git a/CMakeLists.txt b/CMakeLists.txt index 637b7d0c..f6d7cf40 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,11 +20,15 @@ cmake_dependent_option(GPRAT_ENABLE_TESTS "Build unit and integration tests" ${PROJECT_IS_TOP_LEVEL} "GPRAT_BUILD_CORE" OFF) cmake_dependent_option(GPRAT_ENABLE_MKL "Enable support for Intel oneMKL" ${PROJECT_IS_TOP_LEVEL} "GPRAT_BUILD_CORE" OFF) +option(GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS + "Evict data from caches before running BLAS operations" ON) option(GPRAT_ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets" ${PROJECT_IS_TOP_LEVEL}) if(GPRAT_ENABLE_FORMAT_TARGETS) + set(CMAKE_FORMAT_EXCLUDE "^external_ports/") + find_package(format QUIET) if(NOT format_FOUND) include(FetchContent) diff --git a/CMakePresets.json b/CMakePresets.json index e18ab19b..95204452 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -21,6 +21,21 @@ "deprecated": true } }, + { + "name": "vcpkg", + "hidden": true, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake", + "X_VCPKG_APPLOCAL_DEPS_INSTALL": "ON" + } + }, + { + "name": "vcpkg-win64-static", + "hidden": true, + "cacheVariables": { + "VCPKG_TARGET_TRIPLET": "x64-windows-static-md-release" + } + }, { "name": "cppcheck", "hidden": true, @@ -67,7 +82,7 @@ "description": "Note that all the flags after /W4 are required for MSVC to conform to the language standard", "hidden": true, "cacheVariables": { - "CMAKE_CXX_FLAGS": "/sdl /guard:cf /utf-8 /diagnostics:caret /w14165 /w44242 /w44254 /w44263 /w34265 /w34287 /w44296 /w44365 /w44388 /w44464 /w14545 /w14546 /w14547 /w14549 /w14555 /w34619 /w34640 /w24826 /w14905 /w14906 /w14928 /w45038 /W4 /permissive- /volatile:iso /Zc:inline /Zc:preprocessor /Zc:enumTypes /Zc:lambda /Zc:__cplusplus /Zc:externConstexpr /Zc:throwingNew /EHsc", + "CMAKE_CXX_FLAGS": "/sdl /guard:cf /utf-8 /diagnostics:caret /w14165 /w44242 /w44254 /w44263 /w34265 /w34287 /w44296 /w44365 /w44388 /w44464 /w14545 /w14546 /w14547 /w14549 /w14555 /w34619 /w34640 /w24826 /w14905 /w14906 /w14928 /w45038 /W4 /permissive- /volatile:iso /Zc:inline /Zc:preprocessor /Zc:enumTypes /Zc:lambda /Zc:__cplusplus /Zc:externConstexpr /Zc:throwingNew /EHsc /D_CRT_SECURE_NO_WARNINGS", "CMAKE_EXE_LINKER_FLAGS": "/machine:x64 /guard:cf", "CMAKE_SHARED_LINKER_FLAGS": "/machine:x64 /guard:cf" } @@ -146,7 +161,7 @@ }, { "name": "ci-windows", - "inherits": ["ci-build", "ci-win64", "ci-multi-config"] + "inherits": ["ci-build", "ci-win64", "ci-multi-config", "vcpkg", "vcpkg-win64-static"] }, { "name": "ci-ubuntu-24.04", diff --git a/README.md b/README.md index 389bc776..7c73f0e0 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ code. ## Dependencies -GPRat depends on [HPX](https://hpx-docs.stellar-group.org/latest/html/index.html) for asynchronous task-based parallelization. +GPRat depends on [HPX](https://hpx-docs.stellar-group.org/latest/html/index.html) for asynchronous task-based parallelization. Furthermore, for CPU-only BLAS computation GPRat requires [OpenBLAS](http://www.openmathlib.org/OpenBLAS/) or [MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html). A [CUDA](https://developer.nvidia.com/cuda-toolkit) installation is required for GPU-only BLAS computations. 
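The new `GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS` option above is only wired through CMake as a compile definition for `gprat_core`; the eviction itself has to live in the C++ sources. A minimal sketch of what such a guard can look like — the helper name `evict_caches` and the 64 MiB buffer size are illustrative assumptions, not code from this patch:

```cpp
// Hypothetical sketch of a cache-eviction guard, keyed on the
// GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS compile definition added above.
#include <cstddef>
#include <vector>

#ifdef GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS
inline void evict_caches()
{
    // Write to a buffer larger than the last-level cache so that tiles
    // cached by a previous BLAS call are flushed before the next timing.
    constexpr std::size_t bytes = 64 * 1024 * 1024;  // assumed LLC upper bound
    static std::vector<char> sink(bytes, 0);
    for (std::size_t i = 0; i < sink.size(); i += 64)  // one write per cache line
    {
        ++sink[i];
    }
}
#else
inline void evict_caches() { }  // no-op when the option is OFF
#endif
```

With this pattern, a benchmark calls `evict_caches()` immediately before each measured BLAS operation, so timings reflect cold-cache behavior rather than data left warm by the previous kernel.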
@@ -20,6 +20,9 @@ A script to install and setup spack for `GPRat` is provided in [`spack-repo`](sp Spack environment configurations and setup scripts for CPU and GPU use are provided in [`spack-repo/environments`](spack-repo/environments). +Since Spack is not available on Windows, we also support dependency installation using vcpkg. +For now, vcpkg builds are only tested on Windows. + ## How To Compile GPRat makes use of [CMake presets][1] to simplify the process of configuring the project. @@ -35,6 +38,7 @@ ctest --preset=dev-linux As a developer, you may create a `CMakeUserPresets.json` file at the root of the project that contains additional presets local to your machine. In addition to the build configuration `dev-linux`, there are `release-linux`, `dev-linux-gpu`, and `release-linux-gpu`. +For Windows, we have similar presets called `dev-windows` and `release-windows`. The configurations suffixed with `-gpu` build the library with CUDA. GPRat can be built with or without Python bindings. diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt index bad4b5ea..5ae3222f 100644 --- a/bindings/CMakeLists.txt +++ b/bindings/CMakeLists.txt @@ -1,5 +1,5 @@ # try finding pybind11 -set(GPRat_pybind11_VERSION 2.10.3) +set(GPRat_pybind11_VERSION 2.13.6) find_package(pybind11 ${GPRat_pybind11_VERSION} QUIET) if(pybind11_FOUND) message(STATUS "Found package pybind11.") diff --git a/bindings/gprat_py.cpp b/bindings/gprat_py.cpp index b18d2279..9efb56ce 100644 --- a/bindings/gprat_py.cpp +++ b/bindings/gprat_py.cpp @@ -1,4 +1,5 @@ -#include "gprat_c.hpp" +#include "gprat/gprat.hpp" + #include #include @@ -31,19 +32,19 @@ void init_gprat(py::module &m) // Set hyperparameters to default values in `AdamParams` class, unless // specified. Python object has full access to each hyperparameter and a // string representation `__repr__`. - py::class_<gprat_hyper::AdamParams>(m, "AdamParams") + py::class_<gprat::AdamParams>(m, "AdamParams") .def(py::init(), py::arg("learning_rate") = 0.001, py::arg("beta1") = 0.9, py::arg("beta2") = 0.999, py::arg("epsilon") = 1e-8, py::arg("opt_iter") = 0) - .def_readwrite("learning_rate", &gprat_hyper::AdamParams::learning_rate) - .def_readwrite("beta1", &gprat_hyper::AdamParams::beta1) - .def_readwrite("beta2", &gprat_hyper::AdamParams::beta2) - .def_readwrite("epsilon", &gprat_hyper::AdamParams::epsilon) - .def_readwrite("opt_iter", &gprat_hyper::AdamParams::opt_iter) - .def("__repr__", &gprat_hyper::AdamParams::repr); + .def_readwrite("learning_rate", &gprat::AdamParams::learning_rate) + .def_readwrite("beta1", &gprat::AdamParams::beta1) + .def_readwrite("beta2", &gprat::AdamParams::beta2) + .def_readwrite("epsilon", &gprat::AdamParams::epsilon) + .def_readwrite("opt_iter", &gprat::AdamParams::opt_iter) + .def("__repr__", &gprat::AdamParams::repr); // Initializes Gaussian Process with `GP` class. 
Sets default parameters for // squared exponential kernel, number of regressors and trainable, unless diff --git a/bindings/utils_py.cpp b/bindings/utils_py.cpp index 277e40ef..ab44cc5a 100644 --- a/bindings/utils_py.cpp +++ b/bindings/utils_py.cpp @@ -1,5 +1,6 @@ -#include "target.hpp" -#include "utils_c.hpp" +#include "gprat/target.hpp" +#include "gprat/utils.hpp" + #include #include @@ -32,7 +33,7 @@ void start_hpx_wrapper(std::vector args, std::size_t n_cores) } argv.push_back(nullptr); int argc = static_cast(args.size()); - utils::start_hpx_runtime(argc, argv.data()); + gprat::start_hpx_runtime(argc, argv.data()); } /** @@ -43,7 +44,7 @@ void start_hpx_wrapper(std::vector args, std::size_t n_cores) void init_utils(py::module &m) { m.def("compute_train_tiles", - &utils::compute_train_tiles, + &gprat::compute_train_tiles, py::arg("n_samples"), py::arg("n_tile_size"), R"pbdoc( @@ -58,7 +59,7 @@ void init_utils(py::module &m) )pbdoc"); m.def("compute_train_tile_size", - &utils::compute_train_tile_size, + &gprat::compute_train_tile_size, py::arg("n_samples"), py::arg("n_tiles"), R"pbdoc( @@ -73,7 +74,7 @@ void init_utils(py::module &m) )pbdoc"); m.def("compute_test_tiles", - &utils::compute_test_tiles, + &gprat::compute_test_tiles, py::arg("m_samples"), py::arg("n_tiles"), py::arg("n_tile_size"), @@ -90,7 +91,7 @@ void init_utils(py::module &m) )pbdoc"); m.def("print_vector", - &utils::print_vector, + &gprat::print_vector, py::arg("vec"), py::arg("start") = 0, py::arg("end") = -1, @@ -98,11 +99,11 @@ void init_utils(py::module &m) "Print elements of a vector with optional start, end, and separator parameters"); m.def("start_hpx", &start_hpx_wrapper, py::arg("args"), py::arg("n_cores")); // Using the wrapper function - m.def("resume_hpx", &utils::resume_hpx_runtime); - m.def("suspend_hpx", &utils::suspend_hpx_runtime); - m.def("stop_hpx", &utils::stop_hpx_runtime); + m.def("resume_hpx", &gprat::resume_hpx_runtime); + m.def("suspend_hpx", &gprat::suspend_hpx_runtime); + m.def("stop_hpx", &gprat::stop_hpx_runtime); - m.def("compiled_with_cuda", &utils::compiled_with_cuda, "Check if the code was compiled with CUDA support"); + m.def("compiled_with_cuda", &gprat::compiled_with_cuda, "Check if the code was compiled with CUDA support"); m.def("print_available_gpus", &gprat::print_available_gpus, "Print available GPUs with their properties"); m.def("gpu_count", &gprat::gpu_count, "Return the number of available GPUs"); diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index da4c96d0..1a7b4db3 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -1,18 +1,20 @@ +# Option for GPU support with CUDA, cuSolver, cuBLAS +option(GPRAT_WITH_CUDA "Enable GPU support with CUDA, cuSolver, cuBLAS" OFF) + if(GPRAT_WITH_CUDA) + set(CMAKE_CUDA_STANDARD 20) + set(CMAKE_CUDA_EXTENSIONS OFF) enable_language(CUDA) endif() -# Option for GPU support with CUDA, cuSolver, cuBLAS -option(GPRAT_WITH_CUDA "Enable GPU support with CUDA, cuSolver, cuBLAS" OFF) -# Pass variable to C++ code -add_compile_definitions(GPRAT_WITH_CUDA=$) - set(SOURCE_FILES - src/gprat_c.cpp - src/utils_c.cpp + src/gprat.cpp + src/utils.cpp + src/performance_counters.cpp src/target.cpp - src/gp_kernels.cpp - src/gp_hyperparameters.cpp + src/tile_data.cpp + src/kernels.cpp + src/hyperparameters.cpp src/cpu/gp_functions.cpp src/cpu/gp_algorithms.cpp src/cpu/gp_uncertainty.cpp @@ -54,7 +56,10 @@ target_sources(gprat_core PRIVATE ${header_files}) target_link_libraries(gprat_core PUBLIC HPX::hpx) if(GPRAT_WITH_CUDA) + 
find_package(CUDAToolkit MODULE REQUIRED) target_link_libraries(gprat_core PUBLIC CUDA::cusolver CUDA::cublas) + # Flag not working for CLANG CUDA + target_compile_features(gprat_core PUBLIC cuda_std_${CMAKE_CUDA_STANDARD}) endif() # Include directories @@ -66,16 +71,19 @@ if(GPRAT_ENABLE_MKL) # Link Intel oneMKL target_link_libraries(gprat_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core MKL::MKL MKL::mkl_sequential) + target_compile_definitions(gprat_core PUBLIC GPRAT_ENABLE_MKL) else() # Link OpenBLAS target_link_libraries(gprat_core PUBLIC ${OpenBLAS_LIB}) endif() -if(GPRAT_ENABLE_MKL) - target_compile_definitions(gprat_core PUBLIC GPRAT_ENABLE_MKL) +target_compile_definitions(gprat_core + PUBLIC GPRAT_WITH_CUDA=$) +if(GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS) + target_compile_definitions(gprat_core + PUBLIC GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS) endif() - -target_compile_features(gprat_core PUBLIC cxx_std_17) +target_compile_features(gprat_core PUBLIC cxx_std_20) set_property(TARGET gprat_core PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/core/include/cpu/adapter_cblas_fp32.hpp b/core/include/cpu/adapter_cblas_fp32.hpp deleted file mode 100644 index 9cf21915..00000000 --- a/core/include/cpu/adapter_cblas_fp32.hpp +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef CPU_ADAPTER_CBLAS_FP32_H -#define CPU_ADAPTER_CBLAS_FP32_H - -#include -#include -using vector_future = hpx::shared_future>; - -// Constants that are compatible with CBLAS -typedef enum BLAS_TRANSPOSE { Blas_no_trans = 111, Blas_trans = 112 } BLAS_TRANSPOSE; - -typedef enum BLAS_SIDE { Blas_left = 141, Blas_right = 142 } BLAS_SIDE; - -typedef enum BLAS_ALPHA { Blas_add = 1, Blas_substract = -1 } BLAS_ALPHA; - -// typedef enum BLAS_UPLO { Blas_upper = 121, -// Blas_lower = 122 } BLAS_UPLO; - -// typedef enum BLAS_ORDERING { Blas_row_major = 101, -// Blas_col_major = 102 } BLAS_ORDERING; - -// BLAS level 3 operations - -/** - * @brief FP32 In-place Cholesky decomposition of A - * @param f_A matrix to be factorized - * @param N matrix dimension - * @return factorized, lower triangular matrix f_L - */ -vector_future potrf(vector_future f_A, const int N); - -/** - * @brief FP32 In-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular - * @param f_L Cholesky factor matrix - * @param f_A right hand side matrix - * @param N first dimension - * @param M second dimension - * @return solution matrix f_X - */ -vector_future trsm(vector_future f_L, - vector_future f_A, - const int N, - const int M, - const BLAS_TRANSPOSE transpose_L, - const BLAS_SIDE side_L); - -/** - * @brief FP32 Symmetric rank-k update: A = A - B * B^T - * @param f_A Base matrix - * @param f_B Symmetric update matrix - * @param N matrix dimension - * @return updated matrix f_A - */ -vector_future syrk(vector_future f_A, vector_future f_B, const int N); - -/** - * @brief FP32 General matrix-matrix multiplication: C = C - A(^T) * B(^T) - * @param f_C Base matrix - * @param f_B Right update matrix - * @param f_A Left update matrix - * @param N first matrix dimension - * @param M second matrix dimension - * @param K third matrix dimension - * @param transpose_A transpose left matrix - * @param transpose_B transpose right matrix - * @return updated matrix f_X - */ -vector_future -gemm(vector_future f_A, - vector_future f_B, - vector_future f_C, - const int N, - const int M, - const int K, - const BLAS_TRANSPOSE transpose_A, - const BLAS_TRANSPOSE transpose_B); - -// BLAS level 2 operations - -/** - * @brief FP32 In-place solve L(^T) * x = a where L lower triangular - 
* @param f_L Cholesky factor matrix - * @param f_a right hand side vector - * @param N matrix dimension - * @param transpose_L transpose Cholesky factor - * @return solution vector f_x - */ -vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L); - -/** - * @brief FP32 General matrix-vector multiplication: b = b - A(^T) * a - * @param f_A update matrix - * @param f_a update vector - * @param f_b base vector - * @param N matrix dimension - * @param alpha add or substract update to base vector - * @param transpose_A transpose update matrix - * @return updated vector f_b - */ -vector_future gemv(vector_future f_A, - vector_future f_a, - vector_future f_b, - const int N, - const int M, - const BLAS_ALPHA alpha, - const BLAS_TRANSPOSE transpose_A); - -/** - * @brief FP32 Vector update with diagonal SYRK: r = r + diag(A^T * A) - * @param f_A update matrix - * @param f_r base vector - * @param N first matrix dimension - * @param M second matrix dimension - * @return updated vector f_r - */ -vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const int N, const int M); - -/** - * @brief FP32 Vector update with diagonal GEMM: r = r + diag(A * B) - * @param f_A first update matrix - * @param f_B second update matrix - * @param f_r base vector - * @param N first matrix dimension - * @param M second matrix dimension - * @return updated vector f_r - */ -vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M); - -// BLAS level 1 operations - -/** - * @brief FP32 AXPY: y - x - * @param f_y left vector - * @param f_x right vector - * @param N vector length - * @return y - x - */ -vector_future axpy(vector_future f_y, vector_future f_x, const int N); - -/** - * @brief FP32 Dot product: a * b - * @param f_a left vector - * @param f_b right vector - * @param N vector length - * @return f_a * f_b - */ -float dot(std::vector a, std::vector b, const int N); - -#endif // end of CPU_ADAPTER_CBLAS_FP32_H diff --git a/core/include/cpu/gp_functions.hpp b/core/include/cpu/gp_functions.hpp deleted file mode 100644 index 7079bab6..00000000 --- a/core/include/cpu/gp_functions.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#ifndef CPU_GP_FUNCTIONS_H -#define CPU_GP_FUNCTIONS_H - -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include - -namespace cpu -{ - -/** - * @brief Perform Cholesky decompositon (+Assebmly) - * - * @param training_input The training input data - * @param hyperparameters The kernel hyperparameters - * - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param n_regressors The number of regressors - * - * @return The tiled Cholesky factor - */ -std::vector> -cholesky(const std::vector &training_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int n_regressors); - -/** - * @brief Compute the predictions without uncertainties. 
- * - * @param training_input The training input data - * @param training_output The raining output data - * @param test_input The test input data - * @param hyperparameters The kernel hyperparameters - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param m_tiles The number of test tiles - * @param m_tile_size The size of each test tile - * @param n_regressors The number of regressors - * - * @return A vector containing the predictions - */ -std::vector -predict(const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors); - -/** - * @brief Compute the predictions with uncertainties. - * - * @param training_input The training input data - * @param training_output The raining output data - * @param test_input The test input data - * @param hyperparameters The kernel hyperparameters - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param m_tiles The number of test tiles - * @param m_tile_size The size of each test tile - * @param n_regressors The number of regressors - * - * @return A vector containing the prediction vector and the uncertainty vector - */ -std::vector> predict_with_uncertainty( - const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors); - -/** - * @brief Compute the predictions with full covariance matrix. - * - * @param training_input The training input data - * @param training_output The raining output data - * @param test_input The test input data - * @param hyperparameters The kernel hyperparameters - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param m_tiles The number of test tiles - * @param m_tile_size The size of each test tile - * @param n_regressors The number of regressors - * - * @return A vector containing the prediction vector and the full posterior covariance matrix - */ -std::vector> predict_with_full_cov( - const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_data, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors); - -/** - * @brief Compute loss for given data and Gaussian process model - * - * @param training_input The training input data - * @param training_output The raining output data - * @param hyperparameters The kernel hyperparameters - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param n_regressors The number of regressors - * - * @return The loss - */ -double compute_loss(const std::vector &training_input, - const std::vector &training_output, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int n_regressors); - -/** - * @brief Perform optimization for a given number of iterations - * - * @param training_input The training input data - * @param training_output The raining output data - * - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param n_regressors The number of regressors - * - * @param hyperparams The Adam optimizer hyperparameters - * @param 
hyperparameters The kernel hyperparameters - * @param trainable_params The vector containing a bool wheather to train a hyperparameter - * - * @return A vector containing the loss values of each iteration - */ -std::vector -optimize(const std::vector &training_input, - const std::vector &training_output, - int n_tiles, - int n_tile_size, - int n_regressors, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - std::vector trainable_params); - -/** - * @brief Perform a single optimization step - * - * @param training_input The training input data - * @param training_output The raining output data - * - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param n_regressors The number of regressors - * - * @param hyperparams The Adam optimizer hyperparameters - * @param hyperparameters The kernel hyperparameters - * @param trainable_params The vector containing a bool wheather to train a hyperparameter - * - * @param iter The current optimization iteration - * - * @return The loss value - */ -double optimize_step(const std::vector &training_input, - const std::vector &training_output, - int n_tiles, - int n_tile_size, - int n_regressors, - gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - std::vector trainable_params, - int iter); - -} // end of namespace cpu - -#endif // end of CPU_GP_FUNCTIONS_H diff --git a/core/include/cpu/gp_uncertainty.hpp b/core/include/cpu/gp_uncertainty.hpp deleted file mode 100644 index 28089584..00000000 --- a/core/include/cpu/gp_uncertainty.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef CPU_GP_UNCERTAINTY_H -#define CPU_GP_UNCERTAINTY_H - -#include -#include - -namespace cpu -{ - -/** - * @brief Extract diagonal elements of the matrix A. - * - * @param A The matrix - * @param M The rumber of rows in the matrix - * - * @return Diagonal element vector of the matrix A of size M - */ -// std::vector get_matrix_diagonal(const std::vector &A, std::size_t M); -hpx::shared_future> get_matrix_diagonal(hpx::shared_future> f_A, std::size_t M); - -} // end of namespace cpu - -#endif // end of CPU_GP_UNCERTAINTY_H diff --git a/core/include/cpu/tiled_algorithms.hpp b/core/include/cpu/tiled_algorithms.hpp deleted file mode 100644 index 28c25c05..00000000 --- a/core/include/cpu/tiled_algorithms.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#ifndef CPU_TILED_ALGORITHMS_H -#define CPU_TILED_ALGORITHMS_H - -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include - -using Tiled_matrix = std::vector>>; -using Tiled_vector = std::vector>>; - -namespace cpu -{ - -// Tiled Cholesky Algorithm - -/** - * @brief Perform right-looking tiled Cholesky decomposition. - * - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles, containing the - * covariance matrix, afterwards the Cholesky decomposition. - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. - */ -void right_looking_cholesky_tiled(Tiled_matrix &ft_tiles, int N, std::size_t n_tiles); - -// Tiled Triangular Solve Algorithms - -/** - * @brief Perform tiled forward triangular matrix-vector solve. - * - * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. - * @param ft_rhs Tiled right-hand side vector, afterwards containing the tiled solution vector - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. 
- */ -void forward_solve_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_rhs, int N, std::size_t n_tiles); - -/** - * @brief Perform tiled backward triangular matrix-vector solve. - * - * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. - * @param ft_rhs Tiled right-hand side vector, afterwards containing the tiled solution vector - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. - */ -void backward_solve_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_rhs, int N, std::size_t n_tiles); - -/** - * @brief Perform tiled forward triangular matrix-matrix solve. - * - * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. - * @param ft_rhs Tiled right-hand side matrix, afterwards containing the tiled solution matrix. - * @param N Tile size of first dimension. - * @param M Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. - */ -void forward_solve_tiled_matrix( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_rhs, int N, int M, std::size_t n_tiles, std::size_t m_tiles); - -/** - * @brief Perform tiled backward triangular matrix-matrix solve. - * - * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. - * @param ft_rhs Tiled right-hand side matrix, afterwards containing the tiled solution matrix. - * @param N Tile size of first dimension. - * @param M Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. - */ -void backward_solve_tiled_matrix( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_rhs, int N, int M, std::size_t n_tiles, std::size_t m_tiles); - -/** - * @brief Perform tiled matrix-vector multiplication - * - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. - * @param ft_vector Tiled vector represented as a vector of futurized tiles. - * @param ft_rhsTiled solution represented as a vector of futurized tiles. - * @param N_row Tile size of first dimension. - * @param N_col Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. - */ -void matrix_vector_tiled(Tiled_matrix &ft_tiles, - Tiled_vector &ft_vector, - Tiled_vector &ft_rhs, - int N_row, - int N_col, - std::size_t n_tiles, - std::size_t m_tiles); - -/** - * @brief Perform tiled symmetric k-rank update on diagonal tiles - * - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. - * @param ft_vector Tiled vector holding the diagonal tile results - * @param N Tile size of first dimension. - * @param M Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. - */ -void symmetric_matrix_matrix_diagonal_tiled( - Tiled_matrix &ft_tiles, Tiled_vector &ft_vector, int N, int M, std::size_t n_tiles, std::size_t m_tiles); - -/** - * @brief Perform tiled symmetric k-rank update (ft_tiles^T * ft_tiles) - * - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. - * @param ft_result Tiled matrix holding the result of the computationi. - * @param N Tile size of first dimension. - * @param M Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. 
- */ -void symmetric_matrix_matrix_tiled( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_result, int N, int M, std::size_t n_tiles, std::size_t m_tiles); - -/** - * @brief Compute the difference between two tiled vectors - * @param ft_minuend Tiled vector that is being subtracted from. - * @param ft_subtrahend Tiled vector that is being subtracted. - * @param ft_difference Tiled vector that contains the result of the substraction. - * @param M Tile size dimension. - * @param m_tiles Number of tiles. - */ -void vector_difference_tiled(Tiled_vector &ft_minuend, Tiled_vector &ft_substrahend, int M, std::size_t m_tiles); - -/** - * @brief Extract the tiled diagonals of a tiled matrix - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. - * @param ft_vector Tiled vector containing the diagonals of the matrix tiles - * @param M Tile size per dimension. - * @param m_tiles Number of tiles per dimension. - */ -void matrix_diagonal_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_vector, int M, std::size_t m_tiles); - -/** - * @brief Compute the negative log likelihood loss with a tiled covariance matrix K. - * - * Computes l = 0.5 * ( log(det(K)) + y^T * K^-1 * y) + const.) - * - * @param ft_tiles Tiled Cholesky factor matrix represented as a vector of futurized tiles. - * @param ft_alpha Tiled vector containing the solution of K^-1 * y - * @param ft_y Tiled vector containing the the training output y - * @param loss The loss value to be computed - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. - */ -void compute_loss_tiled(Tiled_matrix &ft_tiles, - Tiled_vector &ft_alpha, - Tiled_vector &ft_y, - hpx::shared_future &loss, - int N, - std::size_t n_tiles); - -/** - * @brief Updates a hyperparameter of the SEK kernel using Adam - * - * @param ft_invK Tiled inverse of the covariance matrix K represented as a vector of futurized tiles. - * @param ft_grad_param Tiled covariance matrix gradient w.r.t. a hyperparameter. - * @param ft_alpha Tiled vector containing the precomputed inv(K) * y where y is the training output. - * @param adam_params Hyperparameter of the Adam optimizer - * @param sek_params Hyperparameters of the SEK kernel - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. - * @param iter Current iteration. - * @param param_idx Index of the hyperparameter to optimize. 
- */ -void update_hyperparameter_tiled( - const Tiled_matrix &ft_invK, - const Tiled_matrix &ft_gradK_param, - const Tiled_vector &ft_alpha, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - int N, - std::size_t n_tiles, - std::size_t iter, - std::size_t param_idx); - -} // end of namespace cpu - -#endif // end of CPU_TILED_ALGORITHMS_H diff --git a/core/include/gprat/cpu/adapter_cblas_fp32.hpp b/core/include/gprat/cpu/adapter_cblas_fp32.hpp new file mode 100644 index 00000000..015646b5 --- /dev/null +++ b/core/include/gprat/cpu/adapter_cblas_fp32.hpp @@ -0,0 +1,160 @@ +#ifndef GPRAT_CPU_ADAPTER_CBLAS_FP32_HPP +#define GPRAT_CPU_ADAPTER_CBLAS_FP32_HPP + +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/tile_data.hpp" + +#include + +GPRAT_NS_BEGIN + +// Constants that are compatible with CBLAS +typedef enum BLAS_TRANSPOSE { Blas_no_trans = 111, Blas_trans = 112 } BLAS_TRANSPOSE; + +typedef enum BLAS_SIDE { Blas_left = 141, Blas_right = 142 } BLAS_SIDE; + +typedef enum BLAS_ALPHA { Blas_add = 1, Blas_substract = -1 } BLAS_ALPHA; + +// typedef enum BLAS_UPLO { Blas_upper = 121, +// Blas_lower = 122 } BLAS_UPLO; + +// typedef enum BLAS_ORDERING { Blas_row_major = 101, +// Blas_col_major = 102 } BLAS_ORDERING; + +// BLAS level 3 operations + +/** + * @brief FP32 In-place Cholesky decomposition of A + * @param A matrix to be factorized + * @param N matrix dimension + * @return factorized, lower triangular matrix f_L + */ +mutable_tile_data potrf(const mutable_tile_data &A, int N); + +/** + * @brief FP32 In-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular + * @param L Cholesky factor matrix + * @param A right hand side matrix + * @param N first dimension + * @param M second dimension + * @return solution matrix f_X + */ +mutable_tile_data +trsm(const const_tile_data &L, + const mutable_tile_data &A, + int N, + int M, + BLAS_TRANSPOSE transpose_L, + BLAS_SIDE side_L); + +/** + * @brief FP32 Symmetric rank-k update: A = A - B * B^T + * @param A Base matrix + * @param B Symmetric update matrix + * @param N matrix dimension + * @return updated matrix f_A + */ +mutable_tile_data syrk(const mutable_tile_data &A, const const_tile_data &B, int N); + +/** + * @brief FP32 General matrix-matrix multiplication: C = C - A(^T) * B(^T) + * @param C Base matrix + * @param B Right update matrix + * @param A Left update matrix + * @param N first matrix dimension + * @param M second matrix dimension + * @param K third matrix dimension + * @param transpose_A transpose left matrix + * @param transpose_B transpose right matrix + * @return updated matrix f_X + */ +mutable_tile_data +gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &C, + int N, + int M, + int K, + BLAS_TRANSPOSE transpose_A, + BLAS_TRANSPOSE transpose_B); + +// BLAS level 2 operations + +/** + * @brief FP32 In-place solve L(^T) * x = a where L lower triangular + * @param L Cholesky factor matrix + * @param a right hand side vector + * @param N matrix dimension + * @param transpose_L transpose Cholesky factor + * @return solution vector f_x + */ +mutable_tile_data +trsv(const const_tile_data &L, const mutable_tile_data &a, int N, BLAS_TRANSPOSE transpose_L); + +/** + * @brief FP32 General matrix-vector multiplication: b = b - A(^T) * a + * @param A update matrix + * @param a update vector + * @param b base vector + * @param N matrix dimension + * @param alpha add or subtract update to base vector + * @param transpose_A transpose update matrix + * 
@return updated vector f_b + */ +mutable_tile_data +gemv(const const_tile_data &A, + const const_tile_data &a, + const mutable_tile_data &b, + int N, + int M, + BLAS_ALPHA alpha, + BLAS_TRANSPOSE transpose_A); + +/** + * @brief FP32 Vector update with diagonal SYRK: r = r + diag(A^T * A) + * @param A update matrix + * @param r base vector + * @param N first matrix dimension + * @param M second matrix dimension + * @return updated vector f_r + */ +mutable_tile_data +dot_diag_syrk(const const_tile_data &A, const mutable_tile_data &r, int N, int M); + +/** + * @brief FP32 Vector update with diagonal GEMM: r = r + diag(A * B) + * @param A first update matrix + * @param B second update matrix + * @param r base vector + * @param N first matrix dimension + * @param M second matrix dimension + * @return updated vector f_r + */ +mutable_tile_data dot_diag_gemm( + const const_tile_data &A, const const_tile_data &B, const mutable_tile_data &r, int N, int M); + +// BLAS level 1 operations + +/** + * @brief FP32 AXPY: y - x + * @param y left vector + * @param x right vector + * @param N vector length + * @return y - x + */ +mutable_tile_data axpy(const mutable_tile_data &y, const const_tile_data &x, int N); + +/** + * @brief FP32 Dot product: a * b + * @param a left vector + * @param b right vector + * @param N vector length + * @return f_a * f_b + */ +float dot(std::span a, std::span b, int N); + +GPRAT_NS_END + +#endif diff --git a/core/include/cpu/adapter_cblas_fp64.hpp b/core/include/gprat/cpu/adapter_cblas_fp64.hpp similarity index 51% rename from core/include/cpu/adapter_cblas_fp64.hpp rename to core/include/gprat/cpu/adapter_cblas_fp64.hpp index b3c95420..c2dab5d7 100644 --- a/core/include/cpu/adapter_cblas_fp64.hpp +++ b/core/include/gprat/cpu/adapter_cblas_fp64.hpp @@ -1,13 +1,16 @@ -#ifndef CPU_ADAPTER_CBLAS_FP64_H -#define CPU_ADAPTER_CBLAS_FP64_H +#ifndef GPRAT_CPU_ADAPTER_CBLAS_FP64_HPP +#define GPRAT_CPU_ADAPTER_CBLAS_FP64_HPP -#include -#include +#pragma once -using vector_future = hpx::shared_future>; +#include "gprat/detail/config.hpp" +#include "gprat/tile_data.hpp" -// Constants that are compatible with CBLAS +#include + +GPRAT_NS_BEGIN +// Constants that are compatible with CBLAS typedef enum BLAS_TRANSPOSE { Blas_no_trans = 111, Blas_trans = 112 } BLAS_TRANSPOSE; typedef enum BLAS_SIDE { Blas_left = 141, Blas_right = 142 } BLAS_SIDE; @@ -24,41 +27,42 @@ typedef enum BLAS_ALPHA { Blas_add = 1, Blas_substract = -1 } BLAS_ALPHA; /** * @brief FP64 In-place Cholesky decomposition of A - * @param f_A matrix to be factorized + * @param A matrix to be factorized * @param N matrix dimension * @return factorized, lower triangular matrix f_L */ -vector_future potrf(vector_future f_A, const int N); +mutable_tile_data potrf(const mutable_tile_data &A, int N); /** * @brief FP64 In-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular - * @param f_L Cholesky factor matrix - * @param f_A right hand side matrix + * @param L Cholesky factor matrix + * @param A right hand side matrix * @param N first dimension * @param M second dimension * @return solution matrix f_X */ -vector_future trsm(vector_future f_L, - vector_future f_A, - const int N, - const int M, - const BLAS_TRANSPOSE transpose_L, - const BLAS_SIDE side_L); +mutable_tile_data +trsm(const const_tile_data &L, + const mutable_tile_data &A, + int N, + int M, + BLAS_TRANSPOSE transpose_L, + BLAS_SIDE side_L); /** * @brief FP64 Symmetric rank-k update: A = A - B * B^T - * @param f_A Base matrix - * @param f_B Symmetric update 
matrix + * @param A Base matrix + * @param B Symmetric update matrix * @param N matrix dimension * @return updated matrix f_A */ -vector_future syrk(vector_future f_A, vector_future f_B, const int N); +mutable_tile_data syrk(const mutable_tile_data &A, const const_tile_data &B, int N); /** * @brief FP64 General matrix-matrix multiplication: C = C - A(^T) * B(^T) - * @param f_C Base matrix - * @param f_B Right update matrix - * @param f_A Left update matrix + * @param C Base matrix + * @param B Right update matrix + * @param A Left update matrix * @param N first matrix dimension * @param M second matrix dimension * @param K third matrix dimension @@ -66,66 +70,74 @@ vector_future syrk(vector_future f_A, vector_future f_B, const int N); * @param transpose_B transpose right matrix * @return updated matrix f_X */ -vector_future -gemm(vector_future f_A, - vector_future f_B, - vector_future f_C, - const int N, - const int M, - const int K, - const BLAS_TRANSPOSE transpose_A, - const BLAS_TRANSPOSE transpose_B); +mutable_tile_data +gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &C, + int N, + int M, + int K, + BLAS_TRANSPOSE transpose_A, + BLAS_TRANSPOSE transpose_B); // BLAS level 2 operations /** * @brief FP64 In-place solve L(^T) * x = a where L lower triangular - * @param f_L Cholesky factor matrix - * @param f_a right hand side vector + * @param L Cholesky factor matrix + * @param a right hand side vector * @param N matrix dimension * @param transpose_L transpose Cholesky factor * @return solution vector f_x */ -vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L); +mutable_tile_data +trsv(const const_tile_data &L, const mutable_tile_data &a, int N, BLAS_TRANSPOSE transpose_L); /** * @brief FP64 General matrix-vector multiplication: b = b - A(^T) * a - * @param f_A update matrix - * @param f_a update vector - * @param f_b base vector + * @param A update matrix + * @param a update vector + * @param b base vector * @param N matrix dimension * @param alpha add or substract update to base vector * @param transpose_A transpose update matrix * @return updated vector f_b */ -vector_future gemv(vector_future f_A, - vector_future f_a, - vector_future f_b, - const int N, - const int M, - const BLAS_ALPHA alpha, - const BLAS_TRANSPOSE transpose_A); +mutable_tile_data +gemv(const const_tile_data &A, + const const_tile_data &a, + const mutable_tile_data &b, + int N, + int M, + BLAS_ALPHA alpha, + BLAS_TRANSPOSE transpose_A); /** * @brief FP64 Vector update with diagonal SYRK: r = r + diag(A^T * A) - * @param f_A update matrix - * @param f_r base vector + * @param A update matrix + * @param r base vector * @param N first matrix dimension * @param M second matrix dimension * @return updated vector f_r */ -vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const int N, const int M); +mutable_tile_data +dot_diag_syrk(const const_tile_data &A, const mutable_tile_data &r, int N, int M); /** * @brief FP64 Vector update with diagonal GEMM: r = r + diag(A * B) - * @param f_A first update matrix - * @param f_B second update matrix - * @param f_r base vector + * @param A first update matrix + * @param B second update matrix + * @param r base vector * @param N first matrix dimension * @param M second matrix dimension * @return updated vector f_r */ -vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M); +mutable_tile_data +dot_diag_gemm(const const_tile_data 
&A, + const const_tile_data &B, + const mutable_tile_data &r, + int N, + int M); // BLAS level 1 operations @@ -136,7 +148,7 @@ vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future * @param N vector length * @return y - x */ -vector_future axpy(vector_future f_y, vector_future f_x, const int N); +mutable_tile_data axpy(const mutable_tile_data &y, const const_tile_data &x, int N); /** * @brief FP64 Dot product: a * b @@ -145,6 +157,8 @@ vector_future axpy(vector_future f_y, vector_future f_x, const int N); * @param N vector length * @return a * b */ -double dot(std::vector a, std::vector b, const int N); +double dot(std::span a, std::span b, int N); + +GPRAT_NS_END -#endif // end of CPU_ADAPTER_CBLAS_FP64_H +#endif diff --git a/core/include/cpu/gp_algorithms.hpp b/core/include/gprat/cpu/gp_algorithms.hpp similarity index 71% rename from core/include/cpu/gp_algorithms.hpp rename to core/include/gprat/cpu/gp_algorithms.hpp index b8a6f043..210810fd 100644 --- a/core/include/cpu/gp_algorithms.hpp +++ b/core/include/gprat/cpu/gp_algorithms.hpp @@ -1,30 +1,34 @@ -#ifndef CPU_GP_ALGORITHMS_H -#define CPU_GP_ALGORITHMS_H +#ifndef GPRAT_CPU_GP_ALGORITHMS_HPP +#define GPRAT_CPU_GP_ALGORITHMS_HPP -#include "gp_kernels.hpp" +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/kernels.hpp" +#include "gprat/tile_data.hpp" + +#include #include +GPRAT_NS_BEGIN + namespace cpu { /** * @brief Compute the squared exponential kernel of two feature vectors * - * @param i_global The global index of the first feature vector - * @param j_global The global index of the second feature vector * @param n_regressors The number of regressors - * @param hyperparameters The kernel hyperparameters + * @param sek_params The kernel hyperparameters * @param i_input The first feature vector * @param j_input The second feature vector * - * @return The entry of a covariance function at position i_global,j_global + * @return The entry of a covariance function */ -double compute_covariance_function(std::size_t i_global, - std::size_t j_global, - std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &i_input, - const std::vector &j_input); +double compute_covariance_function(std::size_t n_regressors, + const SEKParams &sek_params, + std::span i_input, + std::span j_input); /** * @brief Generate a tile of the covariance matrix @@ -39,13 +43,13 @@ double compute_covariance_function(std::size_t i_global, * @return A quadratic tile of the covariance matrix of size N x N * @note Does apply noise variance on the diagonal */ -std::vector gen_tile_covariance( +mutable_tile_data gen_tile_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &input); + const SEKParams &sek_params, + std::span input); /** * @brief Generate a tile of the prior covariance matrix @@ -61,13 +65,13 @@ std::vector gen_tile_covariance( * @note Does NOT apply noise variance on the diagonal */ // NAME: gen_tile_priot_covariance -std::vector gen_tile_full_prior_covariance( +mutable_tile_data gen_tile_full_prior_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &input); + const SEKParams &sek_params, + std::span input); /** * @brief Generate the diagonal of a diagonal tile in the prior covariance matrix @@ -83,13 +87,13 @@ std::vector gen_tile_full_prior_covariance( * @note Does NOT apply noise 
variance */ // NAME: gen_tile_diag_prior_covariance -std::vector<double> gen_tile_prior_covariance( +mutable_tile_data<double> gen_tile_prior_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector<double> &input); + const SEKParams &sek_params, + std::span<const double> input); /** * @brief Generate a tile of the cross-covariance matrix @@ -105,15 +109,15 @@ std::vector<double> gen_tile_prior_covariance( * @return A tile of the cross covariance matrix of size N_row x N_col * @note Does NOT apply noise variance */ -std::vector<double> gen_tile_cross_covariance( +mutable_tile_data<double> gen_tile_cross_covariance( std::size_t row, std::size_t col, std::size_t N_row, std::size_t N_col, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector<double> &row_input, - const std::vector<double> &col_input); + const SEKParams &sek_params, + std::span<const double> row_input, + std::span<const double> col_input); /** * @brief Transpose a tile of size N_row x N_col @@ -124,7 +128,7 @@ std::vector<double> gen_tile_cross_covariance( * * @return The transposed tile of size N_col x N_row */ -std::vector<double> gen_tile_transpose(std::size_t N_row, std::size_t N_col, const std::vector<double> &tile); +mutable_tile_data<double> gen_tile_transpose(std::size_t N_row, std::size_t N_col, std::span<const double> tile); /** * @brief Generate a tile of the output data @@ -135,7 +139,7 @@ std::vector<double> gen_tile_transpose(std::size_t N_row, std::size_t N_col, con * * @return A tile of the output data of size N */ -std::vector<double> gen_tile_output(std::size_t row, std::size_t N, const std::vector<double> &output); +mutable_tile_data<double> gen_tile_output(std::size_t row, std::size_t N, std::span<const double> output); /** * @brief Compute the L2-error norm over all tiles and elements @@ -158,7 +162,7 @@ double compute_error_norm(std::size_t n_tiles, * * @return A tile filled with zeros of size N */ -std::vector<double> gen_tile_zeros(std::size_t N); +mutable_tile_data<double> gen_tile_zeros(std::size_t N); /** * @brief Generate an identity tile (i==j?1:0) @@ -166,8 +170,10 @@ std::vector<double> gen_tile_zeros(std::size_t N); * @param N The dimension of the quadratic tile * @return A NxN identity tile */ -std::vector<double> gen_tile_identity(std::size_t N); +mutable_tile_data<double> gen_tile_identity(std::size_t N); } // end of namespace cpu -#endif // end of CPU_GP_ALGORITHMS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gprat/cpu/gp_functions.hpp b/core/include/gprat/cpu/gp_functions.hpp new file mode 100644 index 00000000..55a9e0e3 --- /dev/null +++ b/core/include/gprat/cpu/gp_functions.hpp @@ -0,0 +1,1171 @@ +#ifndef GPRAT_CPU_GP_FUNCTIONS_HPP +#define GPRAT_CPU_GP_FUNCTIONS_HPP + +#pragma once + +#include "gprat/cpu/gp_algorithms.hpp" +#include "gprat/cpu/tiled_algorithms.hpp" +#include "gprat/detail/config.hpp" +#include "gprat/hyperparameters.hpp" +#include "gprat/kernels.hpp" +#include "gprat/scheduler.hpp" +#include "gprat/tile_data.hpp" + +#include + +GPRAT_NS_BEGIN + +namespace cpu +{ + +/** + * @brief Perform Cholesky decomposition (+Assembly) + * + * @param training_input The training input data + * @param sek_params The kernel hyperparameters + * + * @param n_tiles The number of training tiles + * @param n_tile_size The size of each training tile + * @param n_regressors The number of regressors + * + * @return The tiled Cholesky factor + */ +template <typename Scheduler> +std::vector<std::vector<double>> +cholesky(Scheduler &sched, + const std::vector<double> &training_input, + const SEKParams &sek_params, + std::size_t n_tiles, + std::size_t n_tile_size, + std::size_t n_regressors) +{ + // Tiled covariance matrix K_NxN + auto K_tiles = 
make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + for (std::size_t row = 0; row < n_tiles; row++) + { + for (std::size_t col = 0; col <= row; col++) + { + K_tiles[row * n_tiles + col] = detail::named_make_tile( + sched, + schedule::covariance_tile(sched, n_tiles, row, col), + "assemble_tiled_K", + K_tiles[row * n_tiles + col], + row, + col, + n_tile_size, + n_regressors, + sek_params, + training_input); + } + } + + // Launch asynchronous Cholesky decomposition: K = L * L^T + right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Synchronize + std::vector<std::vector<double>> result(n_tiles * n_tiles); + for (std::size_t i = 0; i < n_tiles; i++) + { + for (std::size_t j = 0; j <= i; j++) + { + result[i * n_tiles + j] = K_tiles[i * n_tiles + j].get(); + } + } + return result; +} + +/** + * @brief Compute the predictions without uncertainties. + * + * @param training_input The training input data + * @param training_output The training output data + * @param test_input The test input data + * @param sek_params The kernel hyperparameters + * @param n_tiles The number of training tiles + * @param n_tile_size The size of each training tile + * @param m_tiles The number of test tiles + * @param m_tile_size The size of each test tile + * @param n_regressors The number of regressors + * + * @return A vector containing the predictions + */ +template <typename Scheduler> +std::vector<double> +predict(Scheduler &sched, + const std::vector<double> &training_input, + const std::vector<double> &training_output, + const std::vector<double> &test_input, + const SEKParams &sek_params, + std::size_t n_tiles, + std::size_t n_tile_size, + std::size_t m_tiles, + std::size_t m_tile_size, + std::size_t n_regressors) +{ + /* + * Prediction: hat(y)_M = cross(K)_MxN * K^-1_NxN * y_N + * - Covariance matrix K_NxN + * - Cross-covariance cross(K)_MxN + * - Training output y_N + * - Prediction output hat(y)_M + * + * Algorithm: + * 1: Compute lower triangular part of covariance matrix K + * 2: Compute Cholesky factor L of K + * 3: Compute prediction hat(y): + * - triangular solve L * beta = y + * - triangular solve L^T * alpha = beta + * - compute hat(y) = cross(K) * alpha + */ + + /////////////////////////////////////////////////////////////////////////// + // Cholesky + + // Tiled covariance matrix K_NxN + auto K_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + for (std::size_t row = 0; row < n_tiles; row++) + { + for (std::size_t col = 0; col <= row; col++) + { + K_tiles[row * n_tiles + col] = detail::named_make_tile( + sched, + schedule::covariance_tile(sched, n_tiles, row, col), + "assemble_tiled_K", + K_tiles[row * n_tiles + col], + row, + col, + n_tile_size, + n_regressors, + sek_params, + training_input); + } + } + + // Launch asynchronous Cholesky decomposition: K = L * L^T + right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Prediction + + // Tiled cross_covariance matrix K_NxM + auto cross_covariance_tiles = make_tiled_dataset( + sched, + m_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + // Tiled 
solution + auto prediction_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, m_tiles, tile_index); }); + // Tiled intermediate solution + auto alpha_tiles = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); }); + + for (std::size_t i = 0; i < n_tiles; i++) + { + alpha_tiles[i] = detail::named_make_tile( + sched, + schedule::alpha_tile(sched, n_tiles, i), + "assemble_tiled_alpha", + alpha_tiles[i], + i, + n_tile_size, + training_output); + } + + for (std::size_t i = 0; i < m_tiles; i++) + { + for (std::size_t j = 0; j < n_tiles; j++) + { + cross_covariance_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::cross_covariance_tile(sched, n_tiles, i, j), + "assemble_pred", + cross_covariance_tiles[i * n_tiles + j], + i, + j, + m_tile_size, + n_tile_size, + n_regressors, + sek_params, + test_input, + training_input); + } + } + + for (std::size_t i = 0; i < m_tiles; i++) + { + prediction_tiles[i] = detail::named_make_tile( + sched, schedule::prediction_tile(sched, m_tiles, i), "assemble_tiled", prediction_tiles[i], m_tile_size); + } + + // Launch asynchronous triangular solve L * (L^T * alpha) = y + forward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + backward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + + // Launch asynchronous prediction computation solve: \hat{y} = K_cross_cov * alpha + matrix_vector_tiled( + sched, cross_covariance_tiles, alpha_tiles, prediction_tiles, m_tile_size, n_tile_size, n_tiles, m_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Synchronize prediction + // Preallocate memory + std::vector<double> prediction_result; + prediction_result.reserve(test_input.size()); + for (std::size_t i = 0; i < m_tiles; i++) + { + mutable_tile_data<double> tile = prediction_tiles[i].get(); + std::copy_n(tile.data(), tile.size(), std::back_inserter(prediction_result)); + } + return prediction_result; +} + +/** + * @brief Compute the predictions with uncertainties. 
+ * + * @param training_input The training input data + * @param training_output The training output data + * @param test_input The test input data + * @param sek_params The kernel hyperparameters + * @param n_tiles The number of training tiles + * @param n_tile_size The size of each training tile + * @param m_tiles The number of test tiles + * @param m_tile_size The size of each test tile + * @param n_regressors The number of regressors + * + * @return A vector containing the prediction vector and the uncertainty vector + */ +template <typename Scheduler> +std::vector<std::vector<double>> predict_with_uncertainty( + Scheduler &sched, + const std::vector<double> &training_input, + const std::vector<double> &training_output, + const std::vector<double> &test_input, + const SEKParams &sek_params, + std::size_t n_tiles, + std::size_t n_tile_size, + std::size_t m_tiles, + std::size_t m_tile_size, + std::size_t n_regressors) +{ + /* + * Prediction: hat(y) = cross(K) * K^-1 * y + * Uncertainty: diag(Sigma) = diag(prior(K)) - diag(cross(K)^T * K^-1 * cross(K)) + * - Covariance matrix K_NxN + * - Cross-covariance cross(K)_MxN + * - Prior covariance prior(K)_MxM + * - Training output y_N + * - Prediction output hat(y)_M + * - Posterior covariance matrix Sigma_MxM + * + * Algorithm: + * 1: Compute lower triangular part of covariance matrix K + * 2: Compute Cholesky factor L of K + * 3: Compute prediction hat(y): + * - triangular solve L * beta = y + * - triangular solve L^T * alpha = beta + * - compute hat(y) = cross(K) * alpha + * 4: Compute uncertainty diag(Sigma): + * - triangular solve L * V = cross(K)^T + * - compute diag(W) = diag(V^T * V) + * - compute diag(Sigma) = diag(prior(K)) - diag(W) + */ + + /////////////////////////////////////////////////////////////////////////// + // Cholesky + + // Tiled covariance matrix K_NxN + auto K_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + for (std::size_t row = 0; row < n_tiles; row++) + { + for (std::size_t col = 0; col <= row; col++) + { + K_tiles[row * n_tiles + col] = detail::named_make_tile( + sched, + schedule::covariance_tile(sched, n_tiles, row, col), + "assemble_tiled_K", + K_tiles[row * n_tiles + col], + row, + col, + n_tile_size, + n_regressors, + sek_params, + training_input); + } + } + + // Launch asynchronous Cholesky decomposition: K = L * L^T + right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Prediction + + // Tiled intermediate solution + auto alpha_tiles = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); }); + for (std::size_t i = 0; i < n_tiles; i++) + { + alpha_tiles[i] = detail::named_make_tile( + sched, + schedule::alpha_tile(sched, n_tiles, i), + "assemble_tiled_alpha", + alpha_tiles[i], + i, + n_tile_size, + training_output); + } + + // Tiled cross_covariance matrix K_NxM + auto cross_covariance_tiles = make_tiled_dataset( + sched, + m_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + for (std::size_t j = 0; j < n_tiles; j++) + { + cross_covariance_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::cross_covariance_tile(sched, n_tiles, i, j), + "assemble_pred", + cross_covariance_tiles[i * n_tiles + 
j], + i, + j, + m_tile_size, + n_tile_size, + n_regressors, + sek_params, + test_input, + training_input); + } + } + + // Tiled solution + auto prediction_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, m_tiles, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + prediction_tiles[i] = detail::named_make_tile( + sched, schedule::prediction_tile(sched, m_tiles, i), "assemble_tiled", prediction_tiles[i], m_tile_size); + } + + // Launch asynchronous triangular solve L * (L^T * alpha) = y + forward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + backward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + + // Launch asynchronous prediction computation solve: \hat{y} = K_cross_cov * alpha + matrix_vector_tiled( + sched, cross_covariance_tiles, alpha_tiles, prediction_tiles, m_tile_size, n_tile_size, n_tiles, m_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Uncertainty + + // Tiled transposed cross_covariance matrix K_MxN + auto t_cross_covariance_tiles = make_tiled_dataset( + sched, + n_tiles * m_tiles, + [&](std::size_t tile_index) + { return schedule::t_cross_covariance_tile(sched, m_tiles, tile_index / m_tiles, tile_index % m_tiles); }); + for (std::size_t j = 0; j < n_tiles; j++) + { + for (std::size_t i = 0; i < m_tiles; i++) + { + t_cross_covariance_tiles[j * m_tiles + i] = detail::named_make_tile( + sched, + schedule::t_cross_covariance_tile(sched, m_tiles, j, i), + "assemble_pred", + t_cross_covariance_tiles[j * m_tiles + i], + m_tile_size, + n_tile_size, + cross_covariance_tiles[i * n_tiles + j]); + } + } + + // Tiled prior covariance matrix diagonal diag(K_MxM) + auto prior_K_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::prior_K_tile(sched, n_tiles, 0, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + prior_K_tiles[i] = detail::named_make_tile( + sched, + schedule::prior_K_tile(sched, m_tiles, 0, i), + "assemble_tiled", + prior_K_tiles[i], + i, + i, + m_tile_size, + n_regressors, + sek_params, + test_input); + } + + // Tiled uncertainty solution + auto uncertainty_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::uncertainty_tile(sched, m_tiles, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + uncertainty_tiles[i] = detail::named_make_tile( + sched, + schedule::uncertainty_tile(sched, m_tiles, i), + "assemble_prior_inter", + uncertainty_tiles[i], + m_tile_size); + } + + // Launch asynchronous triangular solve L * V = cross(K)^T + forward_solve_tiled_matrix(sched, K_tiles, t_cross_covariance_tiles, n_tile_size, m_tile_size, n_tiles, m_tiles); + + // Launch asynchronous computation diag(W) = diag(V^T * V) + symmetric_matrix_matrix_diagonal_tiled( + sched, t_cross_covariance_tiles, uncertainty_tiles, n_tile_size, m_tile_size, n_tiles, m_tiles); + + // Launch asynchronous computation diag(Sigma) = diag(prior(K)) - diag(W) + vector_difference_tiled(sched, prior_K_tiles, uncertainty_tiles, m_tile_size, m_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Preallocate memory + std::vector<double> prediction_result; + std::vector<double> uncertainty_result; + prediction_result.reserve(test_input.size()); + uncertainty_result.reserve(test_input.size()); + + // Synchronize prediction + for (std::size_t i = 0; i < m_tiles; i++) + { + mutable_tile_data<double> tile = prediction_tiles[i].get(); + 
std::copy_n(tile.begin(), tile.size(), std::back_inserter(prediction_result));
+    }
+
+    // Synchronize uncertainty
+    for (std::size_t i = 0; i < m_tiles; i++)
+    {
+        mutable_tile_data tile = uncertainty_tiles[i].get();
+        std::copy_n(tile.begin(), tile.size(), std::back_inserter(uncertainty_result));
+    }
+
+    return std::vector<std::vector<double>>{ std::move(prediction_result), std::move(uncertainty_result) };
+}
+
+/**
+ * @brief Compute the predictions with full covariance matrix.
+ *
+ * @param training_input The training input data
+ * @param training_output The training output data
+ * @param test_input The test input data
+ * @param sek_params The kernel hyperparameters
+ * @param n_tiles The number of training tiles
+ * @param n_tile_size The size of each training tile
+ * @param m_tiles The number of test tiles
+ * @param m_tile_size The size of each test tile
+ * @param n_regressors The number of regressors
+ *
+ * @return A vector containing the prediction vector and the full posterior covariance matrix
+ */
+template <typename Scheduler>
+std::vector<std::vector<double>> predict_with_full_cov(
+    Scheduler &sched,
+    const std::vector<double> &training_input,
+    const std::vector<double> &training_output,
+    const std::vector<double> &test_input,
+    const SEKParams &sek_params,
+    std::size_t n_tiles,
+    std::size_t n_tile_size,
+    std::size_t m_tiles,
+    std::size_t m_tile_size,
+    std::size_t n_regressors)
+{
+    /*
+     * Prediction: hat(y)_M = cross(K) * K^-1 * y
+     * Full covariance: Sigma = prior(K) - cross(K) * K^-1 * cross(K)^T
+     * - Covariance matrix K_NxN
+     * - Cross-covariance cross(K)_MxN
+     * - Prior covariance prior(K)_MxM
+     * - Training output y_N
+     * - Prediction output hat(y)_M
+     * - Posterior covariance matrix Sigma_MxM
+     *
+     * Algorithm:
+     * 1: Compute lower triangular part of covariance matrix K
+     * 2: Compute Cholesky factor L of K
+     * 3: Compute prediction hat(y):
+     *    - triangular solve L * beta = y
+     *    - triangular solve L^T * alpha = beta
+     *    - compute hat(y) = cross(K) * alpha
+     * 4: Compute full covariance matrix Sigma:
+     *    - triangular solve L * V = cross(K)^T
+     *    - compute W = V^T * V
+     *    - compute Sigma = prior(K) - W
+     * 5: Compute diag(Sigma)
+     */
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Cholesky
+
+    // Tiled covariance matrix K_NxN
+    auto K_tiles = make_tiled_dataset(
+        sched,
+        n_tiles * n_tiles,
+        [&](std::size_t tile_index)
+        { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); });
+    for (std::size_t row = 0; row < n_tiles; row++)
+    {
+        for (std::size_t col = 0; col <= row; col++)
+        {
+            K_tiles[row * n_tiles + col] = detail::named_make_tile(
+                sched,
+                schedule::covariance_tile(sched, n_tiles, row, col),
+                "assemble_tiled_K",
+                K_tiles[row * n_tiles + col],
+                row,
+                col,
+                n_tile_size,
+                n_regressors,
+                sek_params,
+                training_input);
+        }
+    }
+
+    // Launch asynchronous Cholesky decomposition: K = L * L^T
+    right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Prediction
+
+    // Tiled intermediate solution
+    auto alpha_tiles = make_tiled_dataset(
+        sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); });
+    for (std::size_t i = 0; i < n_tiles; i++)
+    {
+        alpha_tiles[i] = detail::named_make_tile(
+            sched,
+            schedule::alpha_tile(sched, n_tiles, i),
+            "assemble_tiled_alpha",
+            alpha_tiles[i],
+            i,
+            n_tile_size,
+            training_output);
+    }
+
+    // Tiled cross_covariance matrix K_MxN
+    auto cross_covariance_tiles =
make_tiled_dataset( + sched, + m_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + for (std::size_t j = 0; j < n_tiles; j++) + { + cross_covariance_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::cross_covariance_tile(sched, n_tiles, i, j), + "assemble_pred", + cross_covariance_tiles[i * n_tiles + j], + i, + j, + m_tile_size, + n_tile_size, + n_regressors, + sek_params, + test_input, + training_input); + } + } + + // Tiled solution + auto prediction_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, n_tiles, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + prediction_tiles[i] = detail::named_make_tile( + sched, schedule::prediction_tile(sched, m_tiles, i), "assemble_tiled", prediction_tiles[i], m_tile_size); + } + + // Launch asynchronous triangular solve L * (L^T * alpha) = y + forward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + backward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + + // Launch asynchronous prediction computation solve: \hat{y} = K_cross_cov * alpha + matrix_vector_tiled( + sched, cross_covariance_tiles, alpha_tiles, prediction_tiles, m_tile_size, n_tile_size, n_tiles, m_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Uncertainty + + // Tiled transposed cross_covariance matrix K_MxN + auto t_cross_covariance_tiles = make_tiled_dataset( + sched, + n_tiles * m_tiles, + [&](std::size_t tile_index) + { return schedule::t_cross_covariance_tile(sched, m_tiles, tile_index / m_tiles, tile_index % m_tiles); }); + for (std::size_t j = 0; j < n_tiles; j++) + { + for (std::size_t i = 0; i < m_tiles; i++) + { + t_cross_covariance_tiles[j * m_tiles + i] = detail::named_make_tile( + sched, + schedule::t_cross_covariance_tile(sched, m_tiles, j, i), + "assemble_pred", + t_cross_covariance_tiles[j * m_tiles + i], + m_tile_size, + n_tile_size, + cross_covariance_tiles[i * n_tiles + j]); + } + } + + // Tiled prior covariance matrix K_MxM + auto prior_K_tiles = make_tiled_dataset( + sched, + m_tiles * m_tiles, + [&](std::size_t tile_index) + { return schedule::prior_K_tile(sched, n_tiles, tile_index / m_tiles, tile_index % m_tiles); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + for (std::size_t j = 0; j <= i; j++) + { + prior_K_tiles[i * m_tiles + j] = detail::named_make_tile( + sched, + schedule::prior_K_tile(sched, m_tiles, i, j), + "assemble_prior_tiled", + prior_K_tiles[i * m_tiles + j], + i, + j, + m_tile_size, + n_regressors, + sek_params, + test_input); + + if (i != j) + { + prior_K_tiles[j * m_tiles + i] = detail::named_make_tile( + sched, + schedule::prior_K_tile(sched, m_tiles, j, i), + "assemble_prior_tiled", + prior_K_tiles[j * m_tiles + i], + m_tile_size, + m_tile_size, + prior_K_tiles[i * m_tiles + j]); + } + } + } + + // Tiled uncertainty solution + auto uncertainty_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::uncertainty_tile(sched, m_tiles, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + uncertainty_tiles[i] = detail::named_make_tile( + sched, + schedule::uncertainty_tile(sched, m_tiles, i), + "assemble_prior_inter", + uncertainty_tiles[i], + m_tile_size); + } + + // Launch asynchronous triangular solve L * V = cross(K)^T + forward_solve_tiled_matrix(sched, K_tiles, 
t_cross_covariance_tiles, n_tile_size, m_tile_size, n_tiles, m_tiles);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Launch asynchronous computation of full covariance Sigma = prior(K) - V^T * V
+    symmetric_matrix_matrix_tiled(
+        sched, t_cross_covariance_tiles, prior_K_tiles, n_tile_size, m_tile_size, n_tiles, m_tiles);
+    ///////////////////////////////////////////////////////////////////////////
+    // Launch asynchronous computation of uncertainty diag(Sigma)
+    matrix_diagonal_tiled(sched, prior_K_tiles, uncertainty_tiles, m_tile_size, m_tiles);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Preallocate memory
+    std::vector<double> prediction_result;
+    std::vector<double> uncertainty_result;
+    prediction_result.reserve(test_input.size());
+    uncertainty_result.reserve(test_input.size());
+
+    // Synchronize prediction
+    for (std::size_t i = 0; i < m_tiles; i++)
+    {
+        mutable_tile_data tile = prediction_tiles[i].get();
+        std::copy_n(tile.begin(), tile.size(), std::back_inserter(prediction_result));
+    }
+
+    // Synchronize uncertainty
+    for (std::size_t i = 0; i < m_tiles; i++)
+    {
+        mutable_tile_data tile = uncertainty_tiles[i].get();
+        std::copy_n(tile.begin(), tile.size(), std::back_inserter(uncertainty_result));
+    }
+
+    return std::vector<std::vector<double>>{ std::move(prediction_result), std::move(uncertainty_result) };
+}
+
+///////////////////////////////////////////////////////////////////////////
+// OPTIMIZATION
+
+/**
+ * @brief Compute loss for given data and Gaussian process model
+ *
+ * @param training_input The training input data
+ * @param training_output The training output data
+ * @param sek_params The kernel hyperparameters
+ * @param n_tiles The number of training tiles
+ * @param n_tile_size The size of each training tile
+ * @param n_regressors The number of regressors
+ *
+ * @return The loss
+ */
+template <typename Scheduler>
+double calculate_loss(Scheduler &sched,
+                      const std::vector<double> &training_input,
+                      const std::vector<double> &training_output,
+                      const SEKParams &sek_params,
+                      std::size_t n_tiles,
+                      std::size_t n_tile_size,
+                      std::size_t n_regressors)
+{
+    /*
+     * Negative log likelihood loss:
+     * loss(theta) = 0.5 * ( log(det(K)) + y^T * K^-1 * y + N * log(2 * pi) )
+     * - Covariance matrix K(theta)_NxN
+     * - Training output y_N
+     * - Hyperparameters theta = { v, l, v_n }
+     *
+     * Algorithm:
+     * 1: Compute lower triangular part of covariance matrix K
+     * 2: Compute Cholesky factor L of K
+     * 3: Compute alpha = K^-1 * y:
+     *    - triangular solve L * beta = y
+     *    - triangular solve L^T * alpha = beta
+     * 4: Compute negative log likelihood loss
+     *    - Calculate sum_i^N log(L_ii^2)
+     *    - Calculate y^T * alpha
+     *    - Add constant N * log (2 * pi)
+     */
+
+    // Tiled covariance matrix K_NxN
+    auto K_tiles = make_tiled_dataset(
+        sched,
+        n_tiles * n_tiles,
+        [&](std::size_t tile_index)
+        { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); });
+    for (std::size_t row = 0; row < n_tiles; row++)
+    {
+        for (std::size_t col = 0; col <= row; col++)
+        {
+            K_tiles[row * n_tiles + col] = detail::named_make_tile(
+                sched,
+                schedule::covariance_tile(sched, n_tiles, row, col),
+                "assemble_tiled_K",
+                K_tiles[row * n_tiles + col],
+                row,
+                col,
+                n_tile_size,
+                n_regressors,
+                sek_params,
+                training_input);
+        }
+    }
+
+    // Tiled intermediate solution
+    auto alpha_tiles = make_tiled_dataset(
+        sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); });
+    for (std::size_t i = 0; i < n_tiles; i++)
+    {
+        alpha_tiles[i] = detail::named_make_tile(
+            sched,
+            schedule::alpha_tile(sched, n_tiles, i),
+            "assemble_tiled_alpha",
+            alpha_tiles[i],
+            i,
+            n_tile_size,
+            training_output);
+    }
+
+    // Tiled output
+    auto y_tiles = make_tiled_dataset(
+        sched, n_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, n_tiles, tile_index); });
+    for (std::size_t i = 0; i < n_tiles; i++)
+    {
+        y_tiles[i] = detail::named_make_tile(
+            sched,
+            schedule::prediction_tile(sched, n_tiles, i),
+            "assemble_tiled_alpha",
+            y_tiles[i],
+            i,
+            n_tile_size,
+            training_output);
+    }
+
+    // Launch asynchronous Cholesky decomposition: K = L * L^T
+    right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles);
+
+    // Launch asynchronous triangular solve L * (L^T * alpha) = y
+    forward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles);
+    backward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles);
+
+    // Launch asynchronous loss computation
+    return compute_loss_tiled(sched, K_tiles, alpha_tiles, y_tiles, n_tile_size, n_tiles).get();
+}
+
+/**
+ * @brief Perform optimization for a given number of iterations
+ *
+ * @param training_input The training input data
+ * @param training_output The training output data
+ *
+ * @param n_tiles The number of training tiles
+ * @param n_tile_size The size of each training tile
+ * @param n_regressors The number of regressors
+ *
+ * @param adam_params The Adam optimizer hyperparameters
+ * @param sek_params The kernel hyperparameters
+ * @param trainable_params The vector containing a bool whether to train a hyperparameter
+ *
+ * @return A vector containing the loss values of each iteration
+ */
+template <typename Scheduler>
+std::vector<double>
+optimize(Scheduler &sched,
+         const std::vector<double> &training_input,
+         const std::vector<double> &training_output,
+         std::size_t n_tiles,
+         std::size_t n_tile_size,
+         std::size_t n_regressors,
+         const AdamParams &adam_params,
+         SEKParams &sek_params,
+         std::vector<bool> trainable_params,
+         std::size_t start_iter = 0)
+{
+    /*
+     * - Hyperparameters theta = { v, l, v_n }
+     * - Covariance matrix K(theta)
+     * - Training output y
+     *
+     * Algorithm:
+     * for opt_iter:
+     * 1: Compute distance for entries of covariance matrix K
+     * 2: Compute lower triangular part of K with distance
+     * 3: Compute lower triangular gradients for delta(K)/delta(v), and delta(K)/delta(l) with distance
+     *
+     * 4: Compute Cholesky factor L of K
+     * 5: Compute K^-1:
+     *    - triangular solve L * {} = I
+     *    - triangular solve L^T * K^-1 = {}
+     * 6: Compute beta = K^-1 * y
+     *
+     * 7: Compute negative log likelihood loss
+     *    - Calculate 0.5 sum_i^N log(L_ii^2)
+     *    - Calculate 0.5 y^T * beta
+     *    - Add constant N / 2 * log (2 * pi)
+     *
+     * 8: Compute delta(loss)/delta(param_i)
+     *    - Compute trace(K^-1 * delta(K)/delta(theta_i))
+     *    - Compute beta^T * delta(K)/delta(theta_i) * beta
+     * 9: Update hyperparameters theta with Adam optimizer
+     *    - m_T = beta1 * m_T-1 + (1 - beta1) * g_T
+     *    - w_T = beta2 * w_T-1 + (1 - beta2) * g_T^2
+     *    - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T)
+     *    - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon)
+     * endfor
+     */
+
+    // data holder for computed loss values
+    std::vector<double> losses;
+    losses.reserve(static_cast<std::size_t>(adam_params.opt_iter));
+
+    // Tiled output
+    auto y_tiles = make_tiled_dataset(
+        sched, n_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, n_tiles, tile_index); });
+    // Launch asynchronous assembly of output y
+    for (std::size_t i = 0; i < n_tiles; i++)
+ { + y_tiles[i] = detail::named_make_tile( + sched, + schedule::prediction_tile(sched, n_tiles, i), + "assemble_y", + y_tiles[i], + i, + n_tile_size, + training_output); + } + + ////////////////////////////////////////////////////////////////////////////// + // per-loop tiles + + // Tiled covariance matrix K_NxN + auto K_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + // Tiled inverse covariance matrix K^-1_NxN + auto K_inv_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::K_inv_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + // Tiled intermediate solution + auto alpha_tiles = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); }); + + // Tiled future data structures for gradients + + // Tiled covariance with gradient v + auto grad_v_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::K_grad_v_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + // Tiled covariance with gradient l + auto grad_l_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::K_grad_l_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + auto inter_alpha = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::inter_alpha_tile(sched, n_tiles, tile_index); }); + + auto diag_tiles = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::diag_tile(sched, n_tiles, tile_index); }); + + ////////////////////////////////////////////////////////////////////////////// + // Perform optimization + for (std::size_t iter = start_iter; iter < static_cast(adam_params.opt_iter); iter++) + { + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous assembly of tiled covariance matrix, derivative of covariance matrix + // vector w.r.t. to vertical lengthscale and derivative of covariance + // matrix vector w.r.t. 
to lengthscale + for (std::size_t i = 0; i < n_tiles; i++) + { + for (std::size_t j = 0; j <= i; j++) + { + // Compute the distance (z_i - z_j) of K entries to reuse + hpx::shared_future> cov_dists = detail::named_async( + "assemble_cov_dist", i, j, n_tile_size, n_regressors, sek_params, training_input); + + K_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::covariance_tile(sched, n_tiles, i, j), + "assemble_K", + K_tiles[i * n_tiles + j], + i, + j, + n_tile_size, + sek_params, + cov_dists); + if (trainable_params[0]) + { + grad_l_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::K_grad_l_tile(sched, n_tiles, i, j), + "assemble_gradl", + grad_l_tiles[i * n_tiles + j], + n_tile_size, + sek_params, + cov_dists); + if (i != j) + { + grad_l_tiles[j * n_tiles + i] = detail::named_make_tile( + sched, + schedule::K_grad_l_tile(sched, n_tiles, j, i), + "assemble_gradl_t", + grad_l_tiles[j * n_tiles + i], + n_tile_size, + n_tile_size, + grad_l_tiles[i * n_tiles + j]); + } + } + + if (trainable_params[1]) + { + grad_v_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::K_grad_v_tile(sched, n_tiles, i, j), + "assemble_gradv", + grad_v_tiles[i * n_tiles + j], + n_tile_size, + sek_params, + cov_dists); + if (i != j) + { + grad_v_tiles[j * n_tiles + i] = detail::named_make_tile( + sched, + schedule::K_grad_v_tile(sched, n_tiles, j, i), + "assemble_gradv_t", + grad_v_tiles[j * n_tiles + i], + n_tile_size, + n_tile_size, + grad_v_tiles[i * n_tiles + j]); + } + } + } + } + + // Assembly with reallocation -> optimize to only set existing values + for (std::size_t i = 0; i < n_tiles; i++) + { + alpha_tiles[i] = detail::named_make_tile( + sched, schedule::alpha_tile(sched, n_tiles, i), "assemble_tiled_alpha", alpha_tiles[i], n_tile_size); + } + + for (std::size_t i = 0; i < n_tiles; i++) + { + for (std::size_t j = 0; j < n_tiles; j++) + { + if (i == j) + { + K_inv_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::K_inv_tile(sched, n_tiles, i, j), + "assemble_identity_matrix", + K_inv_tiles[i * n_tiles + j], + n_tile_size); + } + else + { + K_inv_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::K_inv_tile(sched, n_tiles, i, j), + "assemble_identity_matrix", + K_inv_tiles[i * n_tiles + j], + n_tile_size * n_tile_size); + } + } + } + + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous Cholesky decomposition: K = L * L^T + right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous compute K^-1 through L* (L^T * X) = I + forward_solve_tiled_matrix(sched, K_tiles, K_inv_tiles, n_tile_size, n_tile_size, n_tiles, n_tiles); + backward_solve_tiled_matrix(sched, K_tiles, K_inv_tiles, n_tile_size, n_tile_size, n_tiles, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous compute beta = inv(K) * y + matrix_vector_tiled(sched, K_inv_tiles, y_tiles, alpha_tiles, n_tile_size, n_tile_size, n_tiles, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous loss computation where + // loss(theta) = 0.5 * ( log(det(K)) - y^T * K^-1 * y - N * log(2 * pi) ) + auto loss_value = compute_loss_tiled(sched, K_tiles, alpha_tiles, y_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Launch 
asynchronous update of the hyperparameters
+        if (trainable_params[0])
+        { // lengthscale
+            update_hyperparameter_tiled_lengthscale(
+                sched,
+                K_inv_tiles,
+                grad_l_tiles,
+                alpha_tiles,
+                adam_params,
+                diag_tiles,
+                inter_alpha,
+                sek_params,
+                n_tile_size,
+                n_tiles,
+                iter,
+                0);
+        }
+        if (trainable_params[1])
+        { // vertical_lengthscale
+            update_hyperparameter_tiled_lengthscale(
+                sched,
+                K_inv_tiles,
+                grad_v_tiles,
+                alpha_tiles,
+                adam_params,
+                diag_tiles,
+                inter_alpha,
+                sek_params,
+                n_tile_size,
+                n_tiles,
+                iter,
+                1);
+        }
+        if (trainable_params[2])
+        { // noise_variance
+            update_hyperparameter_tiled_noise_variance(
+                sched, K_inv_tiles, alpha_tiles, adam_params, sek_params, n_tile_size, n_tiles, iter, 2);
+        }
+        // Synchronize after iteration
+        losses.push_back(loss_value.get());
+    }
+    return losses;
+}
+
+/**
+ * @brief Perform a single optimization step
+ *
+ * @param training_input The training input data
+ * @param training_output The training output data
+ *
+ * @param n_tiles The number of training tiles
+ * @param n_tile_size The size of each training tile
+ * @param n_regressors The number of regressors
+ *
+ * @param adam_params The Adam optimizer hyperparameters
+ * @param sek_params The kernel hyperparameters
+ * @param trainable_params The vector containing a bool whether to train a hyperparameter
+ *
+ * @param iter The current optimization iteration
+ *
+ * @return The loss value
+ */
+template <typename Scheduler>
+double optimize_step(Scheduler &sched,
+                     const std::vector<double> &training_input,
+                     const std::vector<double> &training_output,
+                     std::size_t n_tiles,
+                     std::size_t n_tile_size,
+                     std::size_t n_regressors,
+                     AdamParams &adam_params,
+                     SEKParams &sek_params,
+                     std::vector<bool> trainable_params,
+                     std::size_t iter)
+{
+    // No point in copy&pasting everything for this function
+    const auto old_opt_iter = adam_params.opt_iter;
+    adam_params.opt_iter = iter + 1;
+    const auto r = optimize(
+        sched,
+        training_input,
+        training_output,
+        n_tiles,
+        n_tile_size,
+        n_regressors,
+        adam_params,
+        sek_params,
+        trainable_params,
+        iter);
+    adam_params.opt_iter = old_opt_iter;
+    return r[0];
+}
+
+} // end of namespace cpu
+
+GPRAT_NS_END
+
+#endif
diff --git a/core/include/cpu/gp_optimizer.hpp b/core/include/gprat/cpu/gp_optimizer.hpp
similarity index 80%
rename from core/include/cpu/gp_optimizer.hpp
rename to core/include/gprat/cpu/gp_optimizer.hpp
index c632e87b..1712597d 100644
--- a/core/include/cpu/gp_optimizer.hpp
+++ b/core/include/gprat/cpu/gp_optimizer.hpp
@@ -1,10 +1,17 @@
-#ifndef CPU_GP_OPTIMIZER_H
-#define CPU_GP_OPTIMIZER_H
+#ifndef GPRAT_CPU_GP_OPTIMIZER_H
+#define GPRAT_CPU_GP_OPTIMIZER_H
+
+#pragma once
+
+#include "gprat/detail/config.hpp"
+#include "gprat/hyperparameters.hpp"
+#include "gprat/kernels.hpp"
+#include "gprat/tile_data.hpp"
 
-#include "gp_hyperparameters.hpp"
-#include "gp_kernels.hpp"
 #include <vector>
 
+GPRAT_NS_BEGIN
+
 namespace cpu
 {
 
@@ -54,7 +61,7 @@ double compute_sigmoid(double parameter);
 double compute_covariance_distance(std::size_t i_global,
                                    std::size_t j_global,
                                    std::size_t n_regressors,
-                                   const gprat_hyper::SEKParams &sek_params,
+                                   const SEKParams &sek_params,
                                    const std::vector<double> &i_input,
                                    const std::vector<double> &j_input);
 
@@ -70,12 +77,12 @@ double compute_covariance_distance(std::size_t i_global,
  *
  * @return A quadratic tile containing the distance between the features of size N x N
  */
-std::vector<double> gen_tile_distance(
+mutable_tile_data gen_tile_distance(
     std::size_t row,
     std::size_t col,
     std::size_t N,
     std::size_t n_regressors,
-    const gprat_hyper::SEKParams &sek_params,
+ const SEKParams &sek_params, const std::vector &input); /** @@ -89,12 +96,12 @@ std::vector gen_tile_distance( * * @return A quadratic tile of the covariance matrix of size N x N */ -std::vector gen_tile_covariance_with_distance( +mutable_tile_data gen_tile_covariance_with_distance( std::size_t row, std::size_t col, std::size_t N, - const gprat_hyper::SEKParams &sek_params, - const std::vector &distance); + const SEKParams &sek_params, + const const_tile_data &distance); /** * @brief Generate a derivative tile w.r.t. vertical_lengthscale v @@ -105,8 +112,8 @@ std::vector gen_tile_covariance_with_distance( * * @return A quadratic tile of the derivative of v of size N x N */ -std::vector -gen_tile_grad_v(std::size_t N, const gprat_hyper::SEKParams &sek_params, const std::vector &distance); +mutable_tile_data +gen_tile_grad_v(std::size_t N, const SEKParams &sek_params, const const_tile_data &distance); /** * @brief Generate a derivative tile w.r.t. lengthscale l @@ -117,8 +124,8 @@ gen_tile_grad_v(std::size_t N, const gprat_hyper::SEKParams &sek_params, const s * * @return A quadratic tile of the derivative of l of size N x N */ -std::vector -gen_tile_grad_l(std::size_t N, const gprat_hyper::SEKParams &sek_params, const std::vector &distance); +mutable_tile_data +gen_tile_grad_l(std::size_t N, const SEKParams &sek_params, const const_tile_data &distance); /** * @brief Update biased first raw moment estimate: m_T+1 = beta_1 * m_T + (1 - beta_1) * g_T. @@ -153,11 +160,8 @@ double update_second_moment(double gradient, double v_T, double beta_2); * * @return The updated hyperparameter */ -double adam_step(const double unconstrained_hyperparam, - const gprat_hyper::AdamParams &adam_params, - double m_T, - double v_T, - std::size_t iter); +double +adam_step(double unconstrained_hyperparam, const AdamParams &adam_params, double m_T, double v_T, std::size_t iter); /** * @brief Compute negative-log likelihood on one tile. @@ -168,9 +172,9 @@ double adam_step(const double unconstrained_hyperparam, * * @return Return l = y^T * alpha + \sum_i^N log(L_ii^2) */ -double compute_loss(const std::vector &K_diag_tile, - const std::vector &alpha_tile, - const std::vector &y_tile, +double compute_loss(std::span K_diag_tile, + std::span alpha_tile, + std::span y_tile, std::size_t N); /** @@ -182,7 +186,7 @@ double compute_loss(const std::vector &K_diag_tile, * * @return The added up loss plus the constant factor */ -double add_losses(const std::vector &losses, std::size_t N, std::size_t n); +double add_losses(std::span losses, std::size_t N, std::size_t n); /** * @brief Compute the loss gradient. @@ -204,7 +208,7 @@ double compute_gradient(double trace, double dot, std::size_t N, std::size_t n_t * * @return The updated global trace */ -double compute_trace(const std::vector &diagonal, double trace); +double compute_trace(std::span diagonal, double trace); /** * @brief Add the dot product of a vector to a global result. 
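Aside: the moment-update and step helpers declared above compose into one Adam update per hyperparameter. The following is an illustrative, hedged sketch of those documented formulas only; `adam_update` and its signature are hypothetical and are not part of the GPRat API.

// Illustrative composition of the Adam recurrences documented above.
// Hypothetical helper, not library code.
#include <cmath>
#include <cstddef>

double adam_update(double theta, double gradient, double &m_T, double &w_T,
                   double learning_rate, double beta1, double beta2,
                   double epsilon, std::size_t iter)
{
    m_T = beta1 * m_T + (1.0 - beta1) * gradient;             // first raw moment
    w_T = beta2 * w_T + (1.0 - beta2) * gradient * gradient;  // second raw moment
    const double beta1_T = std::pow(beta1, static_cast<double>(iter + 1));
    const double beta2_T = std::pow(beta2, static_cast<double>(iter + 1));
    const double nu_T = learning_rate * std::sqrt(1.0 - beta2_T) / (1.0 - beta1_T);
    return theta - nu_T * m_T / (std::sqrt(w_T) + epsilon);   // updated parameter
}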
@@ -215,7 +219,7 @@ double compute_trace(const std::vector<double> &diagonal, double trace);
  *
  * @return The updated global result
  */
-double compute_dot(const std::vector<double> &vector_T, const std::vector<double> &vector, double result);
+double compute_dot(std::span<const double> vector_T, std::span<const double> vector, double result);
 
 /**
  * @brief Add the local trace of a matrix tile to the global trace
@@ -226,8 +230,10 @@ double compute_dot(const std::vector<double> &vector_T, const std::vector<double
-double compute_trace_diag(const std::vector<double> &tile, double trace, std::size_t N);
+double compute_trace_diag(std::span<const double> tile, double trace, std::size_t N);
 
 } // end of namespace cpu
 
-#endif // end of CPU_GP_OPTIMIZER_H
+GPRAT_NS_END
+
+#endif
diff --git a/core/include/gprat/cpu/gp_uncertainty.hpp b/core/include/gprat/cpu/gp_uncertainty.hpp
new file mode 100644
index 00000000..cb402119
--- /dev/null
+++ b/core/include/gprat/cpu/gp_uncertainty.hpp
@@ -0,0 +1,28 @@
+#ifndef GPRAT_CPU_GP_UNCERTAINTY_HPP
+#define GPRAT_CPU_GP_UNCERTAINTY_HPP
+
+#pragma once
+
+#include "gprat/detail/config.hpp"
+#include "gprat/tile_data.hpp"
+
+GPRAT_NS_BEGIN
+
+namespace cpu
+{
+
+/**
+ * @brief Extract diagonal elements of the matrix A.
+ *
+ * @param A The matrix
+ * @param M The number of rows in the matrix
+ *
+ * @return Diagonal element vector of the matrix A of size M
+ */
+mutable_tile_data get_matrix_diagonal(const const_tile_data &A, std::size_t M);
+
+} // end of namespace cpu
+
+GPRAT_NS_END
+
+#endif // end of GPRAT_CPU_GP_UNCERTAINTY_HPP
diff --git a/core/include/gprat/cpu/tiled_algorithms.hpp b/core/include/gprat/cpu/tiled_algorithms.hpp
new file mode 100644
index 00000000..718e4d5b
--- /dev/null
+++ b/core/include/gprat/cpu/tiled_algorithms.hpp
@@ -0,0 +1,657 @@
+#ifndef GPRAT_CPU_TILED_ALGORITHMS_H
+#define GPRAT_CPU_TILED_ALGORITHMS_H
+
+#pragma once
+
+#include "gprat/cpu/adapter_cblas_fp64.hpp"
+#include "gprat/cpu/gp_algorithms.hpp"
+#include "gprat/cpu/gp_optimizer.hpp"
+#include "gprat/cpu/gp_uncertainty.hpp"
+#include "gprat/detail/async_helpers.hpp"
+#include "gprat/detail/config.hpp"
+#include "gprat/hyperparameters.hpp"
+#include "gprat/kernels.hpp"
+#include "gprat/scheduler.hpp"
+
+#include
+
+GPRAT_NS_BEGIN
+
+namespace cpu
+{
+
+namespace impl
+{
+void update_parameters(
+    const AdamParams &adam_params,
+    SEKParams &sek_params,
+    std::size_t N,
+    std::size_t n_tiles,
+    std::size_t iter,
+    std::size_t param_idx,
+    double trace,
+    double dot,
+    bool jitter,
+    double factor);
+}
+
+// Tiled Cholesky Algorithm
+
+/**
+ * @brief Perform right-looking tiled Cholesky decomposition.
+ *
+ * @param tiles Tiled matrix represented as a vector of futurized tiles, containing the
+ *              covariance matrix, afterwards the Cholesky decomposition.
+ * @param N Tile size per dimension.
+ * @param n_tiles Number of tiles per dimension.
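+ *
+ * Per iteration k, a POTRF factorizes the diagonal tile, TRSMs solve for the
+ * tiles below it, and SYRK/GEMM update the trailing submatrix; each BLAS
+ * operation is launched as an asynchronous task on the scheduler.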
+ */ +template +void right_looking_cholesky_tiled(Scheduler &sched, Tiles &tiles, std::size_t N, std::size_t n_tiles) +{ + for (std::size_t k = 0; k < n_tiles; k++) + { + // POTRF: Compute Cholesky factor L + tiles[k * n_tiles + k] = detail::named_dataflow( + sched, schedule::cholesky_potrf(sched, n_tiles, k), "cholesky_tiled", tiles[k * n_tiles + k], N); + for (std::size_t m = k + 1; m < n_tiles; m++) + { + // TRSM: Solve X * L^T = A + tiles[m * n_tiles + k] = detail::named_dataflow( + sched, + schedule::cholesky_trsm(sched, n_tiles, k, m), + "cholesky_tiled", + tiles[k * n_tiles + k], + tiles[m * n_tiles + k], + N, + N, + Blas_trans, + Blas_right); + } + for (std::size_t m = k + 1; m < n_tiles; m++) + { + // SYRK: A = A - B * B^T + tiles[m * n_tiles + m] = detail::named_dataflow( + sched, + schedule::cholesky_syrk(sched, n_tiles, m), + "cholesky_tiled", + tiles[m * n_tiles + m], + tiles[m * n_tiles + k], + N); + for (std::size_t n = k + 1; n < m; n++) + { + // GEMM: C = C - A * B^T + tiles[m * n_tiles + n] = detail::named_dataflow( + sched, + schedule::cholesky_gemm(sched, n_tiles, k, m, n), + "cholesky_tiled", + tiles[m * n_tiles + k], + tiles[n * n_tiles + k], + tiles[m * n_tiles + n], + N, + N, + N, + Blas_no_trans, + Blas_trans); + } + } + } +} + +// Tiled Triangular Solve Algorithms + +/** + * @brief Perform tiled forward triangular matrix-vector solve. + * + * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. + * @param ft_rhs Tiled right-hand side vector, afterwards containing the tiled solution vector + * @param N Tile size per dimension. + * @param n_tiles Number of tiles per dimension. + */ +template +void forward_solve_tiled(Scheduler &sched, Tiles &ft_tiles, Tiles &ft_rhs, std::size_t N, std::size_t n_tiles) +{ + for (std::size_t k = 0; k < n_tiles; k++) + { + // TRSM: Solve L * x = a + ft_rhs[k] = detail::named_dataflow( + sched, + schedule::solve_trsv(sched, n_tiles, k), + "triangular_solve_tiled", + ft_tiles[k * n_tiles + k], + ft_rhs[k], + N, + Blas_no_trans); + for (std::size_t m = k + 1; m < n_tiles; m++) + { + // GEMV: b = b - A * a + ft_rhs[m] = detail::named_dataflow( + sched, + schedule::solve_gemv(sched, n_tiles, k, m), + "triangular_solve_tiled", + ft_tiles[m * n_tiles + k], + ft_rhs[k], + ft_rhs[m], + N, + N, + Blas_substract, + Blas_no_trans); + } + } +} + +/** + * @brief Perform tiled backward triangular matrix-vector solve. + * + * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. + * @param ft_rhs Tiled right-hand side vector, afterwards containing the tiled solution vector + * @param N Tile size per dimension. + * @param n_tiles Number of tiles per dimension. 
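+ *
+ * Note: traverses the tiles bottom-up (k = n_tiles - 1 .. 0) and applies the
+ * transposed Cholesky factor, i.e. it solves L^T * x = a.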
+ */ +template +void backward_solve_tiled(Scheduler &sched, Tiles &ft_tiles, Tiles &ft_rhs, std::size_t N, std::size_t n_tiles) +{ + for (int k_ = static_cast(n_tiles) - 1; k_ >= 0; k_--) // int instead of std::size_t for last comparison + { + std::size_t k = static_cast(k_); + // TRSM: Solve L^T * x = a + ft_rhs[k] = detail::named_dataflow( + sched, + schedule::solve_trsm(sched, n_tiles, k), + "triangular_solve_tiled", + ft_tiles[k * n_tiles + k], + ft_rhs[k], + N, + Blas_trans); + for (int m_ = k_ - 1; m_ >= 0; m_--) // int instead of std::size_t for last comparison + { + std::size_t m = static_cast(m_); + // GEMV:b = b - A^T * a + ft_rhs[m] = detail::named_dataflow( + sched, + schedule::solve_gemv(sched, n_tiles, k, m), + "triangular_solve_tiled", + ft_tiles[k * n_tiles + m], + ft_rhs[k], + ft_rhs[m], + N, + N, + Blas_substract, + Blas_trans); + } + } +} + +/** + * @brief Perform tiled forward triangular matrix-matrix solve. + * + * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. + * @param ft_rhs Tiled right-hand side matrix, afterwards containing the tiled solution matrix. + * @param N Tile size of first dimension. + * @param M Tile size of second dimension. + * @param n_tiles Number of tiles in first dimension. + * @param m_tiles Number of tiles in second dimension. + */ +template +void forward_solve_tiled_matrix( + Scheduler &sched, + Tiles &ft_tiles, + Tiles &ft_rhs, + std::size_t N, + std::size_t M, + std::size_t n_tiles, + std::size_t m_tiles) +{ + for (std::size_t c = 0; c < m_tiles; c++) + { + for (std::size_t k = 0; k < n_tiles; k++) + { + // TRSM: solve L * X = A + ft_rhs[k * m_tiles + c] = detail::named_dataflow( + sched, + schedule::solve_matrix_trsm(sched, m_tiles, c, k), + "triangular_solve_tiled_matrix", + ft_tiles[k * n_tiles + k], + ft_rhs[k * m_tiles + c], + N, + M, + Blas_no_trans, + Blas_left); + for (std::size_t m = k + 1; m < n_tiles; m++) + { + // GEMM: C = C - A * B + ft_rhs[m * m_tiles + c] = detail::named_dataflow( + sched, + schedule::solve_matrix_gemm(sched, m_tiles, c, k, m), + "triangular_solve_tiled_matrix", + ft_tiles[m * n_tiles + k], + ft_rhs[k * m_tiles + c], + ft_rhs[m * m_tiles + c], + N, + M, + N, + Blas_no_trans, + Blas_no_trans); + } + } + } +} + +/** + * @brief Perform tiled backward triangular matrix-matrix solve. + * + * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. + * @param ft_rhs Tiled right-hand side matrix, afterwards containing the tiled solution matrix. + * @param N Tile size of first dimension. + * @param M Tile size of second dimension. + * @param n_tiles Number of tiles in first dimension. + * @param m_tiles Number of tiles in second dimension. 
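+ *
+ * Note: column-wise variant of the backward vector solve above; each of the
+ * m_tiles right-hand-side columns is solved independently with L^T.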
+ */
+template <typename Scheduler, typename Tiles>
+void backward_solve_tiled_matrix(
+    Scheduler &sched,
+    Tiles &ft_tiles,
+    Tiles &ft_rhs,
+    std::size_t N,
+    std::size_t M,
+    std::size_t n_tiles,
+    std::size_t m_tiles)
+{
+    for (std::size_t c = 0; c < m_tiles; c++)
+    {
+        for (int k_ = static_cast<int>(n_tiles) - 1; k_ >= 0; k_--) // int instead of std::size_t for last comparison
+        {
+            std::size_t k = static_cast<std::size_t>(k_);
+            // TRSM: solve L^T * X = A
+            ft_rhs[k * m_tiles + c] = detail::named_dataflow(
+                sched,
+                schedule::solve_matrix_trsm(sched, m_tiles, c, k),
+                "triangular_solve_tiled_matrix",
+                ft_tiles[k * n_tiles + k],
+                ft_rhs[k * m_tiles + c],
+                N,
+                M,
+                Blas_trans,
+                Blas_left);
+            for (int m_ = k_ - 1; m_ >= 0; m_--) // int instead of std::size_t for last comparison
+            {
+                std::size_t m = static_cast<std::size_t>(m_);
+                // GEMM: C = C - A^T * B
+                ft_rhs[m * m_tiles + c] = detail::named_dataflow(
+                    sched,
+                    schedule::solve_matrix_gemm(sched, m_tiles, c, k, m),
+                    "triangular_solve_tiled_matrix",
+                    ft_tiles[k * n_tiles + m],
+                    ft_rhs[k * m_tiles + c],
+                    ft_rhs[m * m_tiles + c],
+                    N,
+                    M,
+                    N,
+                    Blas_trans,
+                    Blas_no_trans);
+            }
+        }
+    }
+}
+
+/**
+ * @brief Perform tiled matrix-vector multiplication
+ *
+ * @param ft_tiles Tiled matrix represented as a vector of futurized tiles.
+ * @param ft_vector Tiled vector represented as a vector of futurized tiles.
+ * @param ft_rhs Tiled solution represented as a vector of futurized tiles.
+ * @param N_row Tile size of first dimension.
+ * @param N_col Tile size of second dimension.
+ * @param n_tiles Number of tiles in first dimension.
+ * @param m_tiles Number of tiles in second dimension.
+ */
+template <typename Scheduler, typename Tiles>
+void matrix_vector_tiled(Scheduler &sched,
+                         Tiles &ft_tiles,
+                         Tiles &ft_vector,
+                         Tiles &ft_rhs,
+                         std::size_t N_row,
+                         std::size_t N_col,
+                         std::size_t n_tiles,
+                         std::size_t m_tiles)
+{
+    for (std::size_t k = 0; k < m_tiles; k++)
+    {
+        for (std::size_t m = 0; m < n_tiles; m++)
+        {
+            ft_rhs[k] = detail::named_dataflow(
+                sched,
+                schedule::multiply_gemv(sched, n_tiles, k, m),
+                "prediction_tiled",
+                ft_tiles[k * n_tiles + m],
+                ft_vector[m],
+                ft_rhs[k],
+                N_row,
+                N_col,
+                Blas_add,
+                Blas_no_trans);
+        }
+    }
+}
+
+/**
+ * @brief Perform tiled symmetric k-rank update on diagonal tiles
+ *
+ * @param ft_tiles Tiled matrix represented as a vector of futurized tiles.
+ * @param ft_vector Tiled vector holding the diagonal tile results
+ * @param N Tile size of first dimension.
+ * @param M Tile size of second dimension.
+ * @param n_tiles Number of tiles in first dimension.
+ * @param m_tiles Number of tiles in second dimension.
+ */
+template <typename Scheduler, typename Tiles>
+void symmetric_matrix_matrix_diagonal_tiled(
+    Scheduler &sched,
+    Tiles &ft_tiles,
+    Tiles &ft_vector,
+    std::size_t N,
+    std::size_t M,
+    std::size_t n_tiles,
+    std::size_t m_tiles)
+{
+    for (std::size_t i = 0; i < m_tiles; ++i)
+    {
+        for (std::size_t n = 0; n < n_tiles; ++n)
+        {
+            // Compute inner product to obtain diagonal elements of
+            // V^T * V <=> cross(K) * K^-1 * cross(K)^T
+            ft_vector[i] = detail::named_dataflow(
+                sched,
+                schedule::k_rank_dot_diag_syrk(sched, m_tiles, i),
+                "posterior_tiled",
+                ft_tiles[n * m_tiles + i],
+                ft_vector[i],
+                N,
+                M);
+        }
+    }
+}
+
+/**
+ * @brief Perform tiled symmetric k-rank update (ft_tiles^T * ft_tiles)
+ *
+ * @param ft_tiles Tiled matrix represented as a vector of futurized tiles.
+ * @param ft_result Tiled matrix holding the result of the computation.
+ * @param N Tile size of first dimension.
+ * @param M Tile size of second dimension.
+ * @param n_tiles Number of tiles in first dimension.
+ * @param m_tiles Number of tiles in second dimension. + */ +template +void symmetric_matrix_matrix_tiled( + Scheduler &sched, + Tiles &ft_tiles, + Tiles &ft_result, + std::size_t N, + std::size_t M, + std::size_t n_tiles, + std::size_t m_tiles) +{ + for (std::size_t c = 0; c < m_tiles; c++) + { + for (std::size_t k = 0; k < m_tiles; k++) + { + for (std::size_t m = 0; m < n_tiles; m++) + { + // (SYRK for (c == k) possible) + // GEMM: C = C - A^T * B + ft_result[c * m_tiles + k] = detail::named_dataflow( + sched, + schedule::k_rank_gemm(sched, m_tiles, c, k, m), + "triangular_solve_tiled_matrix", + ft_tiles[m * m_tiles + c], + ft_tiles[m * m_tiles + k], + ft_result[c * m_tiles + k], + N, + M, + M, + Blas_trans, + Blas_no_trans); + } + } + } +} + +/** + * @brief Compute the difference between two tiled vectors + * @param ft_minuend Tiled vector that is being subtracted from. + * @param ft_subtrahend Tiled vector that is being subtracted. + * @param M Tile size dimension. + * @param m_tiles Number of tiles. + */ +template +void vector_difference_tiled( + Scheduler &sched, Tiles &ft_minuend, Tiles &ft_subtrahend, std::size_t M, std::size_t m_tiles) +{ + for (std::size_t i = 0; i < m_tiles; i++) + { + ft_subtrahend[i] = detail::named_dataflow( + sched, schedule::vector_axpy(sched, m_tiles, i), "uncertainty_tiled", ft_minuend[i], ft_subtrahend[i], M); + } +} + +/** + * @brief Extract the tiled diagonals of a tiled matrix + * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. + * @param ft_vector Tiled vector containing the diagonals of the matrix tiles + * @param M Tile size per dimension. + * @param m_tiles Number of tiles per dimension. + */ +template +void matrix_diagonal_tiled(Scheduler &sched, Tiles &ft_tiles, Tiles &ft_vector, std::size_t M, std::size_t m_tiles) +{ + for (std::size_t i = 0; i < m_tiles; i++) + { + ft_vector[i] = detail::named_dataflow( + sched, schedule::get_diagonal(sched, m_tiles, i), "uncertainty_tiled", ft_tiles[i * m_tiles + i], M); + } +} + +/** + * @brief Compute the negative log likelihood loss with a tiled covariance matrix K. + * + * Computes l = 0.5 * ( log(det(K)) + y^T * K^-1 * y) + const.) + * + * @param ft_tiles Tiled Cholesky factor matrix represented as a vector of futurized tiles. + * @param ft_alpha Tiled vector containing the solution of K^-1 * y + * @param ft_y Tiled vector containing the training output y + * @param N Tile size per dimension. + * @param n_tiles Number of tiles per dimension. + * @return The loss value to be computed + */ +template +hpx::future +compute_loss_tiled(Scheduler &sched, Tiles &ft_tiles, Tiles &ft_alpha, Tiles &ft_y, std::size_t N, std::size_t n_tiles) +{ + std::vector> loss_tiled; + loss_tiled.reserve(n_tiles); + for (std::size_t k = 0; k < n_tiles; k++) + { + loss_tiled.push_back(detail::named_dataflow( + sched, + schedule::compute_loss(sched, n_tiles, k), + "loss_tiled", + ft_tiles[k * n_tiles + k], + ft_alpha[k], + ft_y[k], + N)); + } + return detail::named_dataflow("loss_tiled", loss_tiled, N, n_tiles); +} + +/** + * @brief Updates a hyperparameter of the SEK kernel using Adam + * + * @param ft_invK Tiled inverse of the covariance matrix K represented as a vector of futurized tiles. + * @param ft_gradK_param Tiled covariance matrix gradient w.r.t. a hyperparameter. + * @param ft_alpha Tiled vector containing the precomputed inv(K) * y where y is the training output. 
+ * @param adam_params Hyperparameter of the Adam optimizer + * @param sek_params Hyperparameters of the SEK kernel + * @param N Tile size per dimension. + * @param n_tiles Number of tiles per dimension. + * @param iter Current iteration. + * @param param_idx Index of the hyperparameter to optimize. + */ +template +void update_hyperparameter_tiled_lengthscale( + Scheduler &sched, + const Tiles &ft_invK, + const Tiles &ft_gradK_param, + const Tiles &ft_alpha, + const AdamParams &adam_params, + Tiles &diag_tiles, // Diagonal tiles + Tiles &inter_alpha, // Intermediate result + SEKParams &sek_params, + std::size_t N, + std::size_t n_tiles, + std::size_t iter, + std::size_t param_idx) +{ + /* + * PART 1: + * Compute gradient = 0.5 * ( trace(inv(K) * grad(K)_param) + y^T * inv(K) * grad(K)_param * inv(K) * y ) + * + * 1: Compute trace(inv(K) * grad(K)_param) + * 2: Compute y^T * inv(K) * grad(K)_param * inv(K) * y + * + * Update parameter: + * 3: Update moments + * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T + * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 + * 4: Adam step: + * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) + * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) + */ + hpx::shared_future trace = hpx::make_ready_future(0.0); + hpx::shared_future dot = hpx::make_ready_future(0.0); + bool jitter = false; + double factor = 1.0; + + // Reset our helper tiles + for (std::size_t d = 0; d < n_tiles; d++) + { + diag_tiles[d] = detail::named_make_tile( + sched, schedule::diag_tile(sched, n_tiles, d), "assemble", diag_tiles[d], N); + inter_alpha[d] = detail::named_make_tile( + sched, schedule::inter_alpha_tile(sched, n_tiles, d), "assemble", inter_alpha[d], N); + } + + //////////////////////////////////// + // PART 1: Compute gradient + // Step 1: Compute trace(inv(K)*grad_K_param) + // Compute diagonal tiles of inv(K) * grad(K)_param + for (std::size_t i = 0; i < n_tiles; ++i) + { + for (std::size_t j = 0; j < n_tiles; ++j) + { + diag_tiles[i] = detail::named_dataflow( + sched, + schedule::diag_tile(sched, n_tiles, i), + "trace", + ft_invK[i * n_tiles + j], + ft_gradK_param[j * n_tiles + i], + diag_tiles[i], + N, + N); + } + } + // Compute the trace of the diagonal tiles + for (std::size_t j = 0; j < n_tiles; ++j) + { + trace = detail::named_dataflow( + sched, schedule::diag_tile(sched, n_tiles, j), "trace", diag_tiles[j], trace); + } + // Not sure if can be done this way + // Step 2: Compute alpha^T * grad(K)_param * alpha (with alpha = inv(K) * y) + // Compute inter_alpha = grad(K)_param * alpha + for (std::size_t k = 0; k < n_tiles; k++) + { + for (std::size_t m = 0; m < n_tiles; m++) + { + inter_alpha[k] = detail::named_dataflow( + sched, + schedule::inter_alpha_tile(sched, n_tiles, k), + "gemv", + ft_gradK_param[k * n_tiles + m], + ft_alpha[m], + inter_alpha[k], + N, + N, + Blas_add, + Blas_no_trans); + } + } + // Compute alpha^T * inter_alpha + for (std::size_t j = 0; j < n_tiles; ++j) + { + dot = detail::named_dataflow( + sched, schedule::inter_alpha_tile(sched, n_tiles, j), "grad_right_tiled", inter_alpha[j], ft_alpha[j], dot); + } + + impl::update_parameters( + adam_params, sek_params, N, n_tiles, iter, param_idx, trace.get(), dot.get(), jitter, factor); +} + +template +void update_hyperparameter_tiled_noise_variance( + Scheduler &sched, + const Tiles &ft_invK, + const Tiles &ft_alpha, + const AdamParams &adam_params, + SEKParams &sek_params, + std::size_t N, + std::size_t n_tiles, + std::size_t iter, + std::size_t param_idx) +{ + /* + * PART 1: + * Compute gradient = 0.5 * 
( trace(inv(K) * grad(K)_param) + y^T * inv(K) * grad(K)_param * inv(K) * y ) + * + * 1: Compute trace(inv(K) * grad(K)_param) + * 2: Compute y^T * inv(K) * grad(K)_param * inv(K) * y + * + * Update parameter: + * 3: Update moments + * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T + * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 + * 4: Adam step: + * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) + * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) + */ + hpx::shared_future trace = hpx::make_ready_future(0.0); + hpx::shared_future dot = hpx::make_ready_future(0.0); + bool jitter = true; + double factor = 1.0; + + //////////////////////////////////// + // PART 1: Compute gradient + // Step 1: Compute the trace of inv(K) * noise_variance + for (std::size_t j = 0; j < n_tiles; ++j) + { + trace = detail::named_dataflow( + sched, schedule::K_inv_tile(sched, n_tiles, j, j), "grad_left_tiled", ft_invK[j * n_tiles + j], trace, N); + } + //////////////////////////////////// + // Step 2: Compute the alpha^T * alpha * noise_variance + for (std::size_t j = 0; j < n_tiles; ++j) + { + dot = detail::named_dataflow( + sched, schedule::alpha_tile(sched, n_tiles, j), "grad_right_tiled", ft_alpha[j], ft_alpha[j], dot); + } + + factor = compute_sigmoid(to_unconstrained(sek_params.noise_variance, true)); + + impl::update_parameters( + adam_params, sek_params, N, n_tiles, iter, param_idx, trace.get(), dot.get(), jitter, factor); +} + +} // end of namespace cpu + +GPRAT_NS_END + +#endif diff --git a/core/include/gprat/detail/async_helpers.hpp b/core/include/gprat/detail/async_helpers.hpp new file mode 100644 index 00000000..05a24a91 --- /dev/null +++ b/core/include/gprat/detail/async_helpers.hpp @@ -0,0 +1,74 @@ +#ifndef GPRAT_DETAIL_DATAFLOW_HELPERS_HPP +#define GPRAT_DETAIL_DATAFLOW_HELPERS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include +#include +#include +#include + +GPRAT_NS_BEGIN + +/// @brief Empty type representing local scheduling (always on this locality) +struct basic_local_scheduler +{ }; + +namespace detail +{ + +// Functions prefixed with named_* allow the user to specify a custom name for this entry in the +// execution graph. Much like wrapping your function with hpx::annotated_function would. + +// ============================================================= +// non-scheduler aware + +template +decltype(auto) named_dataflow(const char *name, Args &&...args) +{ + return hpx::dataflow(hpx::annotated_function(hpx::unwrapping(F), name), std::forward(args)...); +} + +template +decltype(auto) named_async(const char *name, Args &&...args) +{ + return hpx::async(hpx::annotated_function(F, name), std::forward(args)...); +} + +// ============================================================= +// local shared-memory scheduling +// (no-op, same as above) + +template +decltype(auto) named_make_tile(const basic_local_scheduler & /*sched*/, + std::size_t /*on*/, + const char *name, + TileReference & /*target*/, + Args &&...args) +{ + // This method basically ignores the reference to the target tile as the non-action factories don't need it. 
+ // (They always create the tile_data locally and return that - only the HPX action wrappers need a reference) + return hpx::dataflow(hpx::annotated_function(hpx::unwrapping(F), name), std::forward(args)...); +} + +template +decltype(auto) +named_dataflow(const basic_local_scheduler & /*sched*/, std::size_t /*on*/, const char *name, Args &&...args) +{ + return hpx::dataflow(hpx::annotated_function(hpx::unwrapping(F), name), std::forward(args)...); +} + +template +decltype(auto) +named_async(const basic_local_scheduler & /*sched*/, std::size_t /*on*/, const char *name, Args &&...args) +{ + return hpx::async(hpx::annotated_function(F, name), std::forward(args)...); +} + +} // namespace detail + +GPRAT_NS_END + +#endif diff --git a/core/include/gprat/detail/config.hpp b/core/include/gprat/detail/config.hpp new file mode 100644 index 00000000..e47a2de7 --- /dev/null +++ b/core/include/gprat/detail/config.hpp @@ -0,0 +1,26 @@ +#ifndef GPRAT_DETAIL_CONFIG_HPP +#define GPRAT_DETAIL_CONFIG_HPP + +#pragma once + +// clang-format off +#define GPRAT_NS gprat::v1 +#define GPRAT_NS_BEGIN namespace gprat { inline namespace v1 { +#define GPRAT_NS_END } } +// clang-format on + +#if defined(_MSC_VER) || defined(__BORLANDC__) || defined(__CODEGEARC__) +#if defined(GPRAT_DYN_LINK) +#if defined(GPRAT_SOURCE) +#define GPRAT_DECL __declspec(dllexport) +#else +#define GPRAT_DECL __declspec(dllimport) +#endif +#endif +#endif + +#if !defined(GPRAT_DECL) +#define GPRAT_DECL +#endif + +#endif diff --git a/core/include/gprat_c.hpp b/core/include/gprat/gprat.hpp similarity index 82% rename from core/include/gprat_c.hpp rename to core/include/gprat/gprat.hpp index 6781d286..88c6972f 100644 --- a/core/include/gprat_c.hpp +++ b/core/include/gprat/gprat.hpp @@ -1,16 +1,19 @@ -#ifndef GPRAT_C_H -#define GPRAT_C_H +#ifndef GPRAT_C_HPP +#define GPRAT_C_HPP -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include "target.hpp" +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/hyperparameters.hpp" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" + +#include "tile_data.hpp" #include #include #include -// namespace for GPRat library entities -namespace gprat -{ +GPRAT_NS_BEGIN /** * @brief Data structure for Gaussian Process data @@ -24,10 +27,10 @@ struct GP_data std::string file_path; /** @brief Number of samples in the data */ - int n_samples; + std::size_t n_samples; /** @brief Number of GP regressors */ - int n_regressors; + std::size_t n_regressors; /** @brief Vector containing the data */ std::vector data; @@ -38,10 +41,10 @@ struct GP_data * * The file specified by `f_path` must contain `n` samples. 
 *
- * @param f_path Path to the file
+ * @param file_path Path to the file
  * @param n Number of samples
  */
-    GP_data(const std::string &file_path, int n, int n_reg);
+    GP_data(const std::string &file_path, std::size_t n, std::size_t n_reg);
 };
 
 /**
@@ -61,10 +64,10 @@ class GP
     std::vector<double> training_output_;
 
     /** @brief Number of tiles */
-    int n_tiles_;
+    std::size_t n_tiles_;
 
     /** @brief Size of each tile in each dimension */
-    int n_tile_size_;
+    std::size_t n_tile_size_;
 
     /**
      * @brief List of bools indicating trainable parameters: lengthscale,
@@ -79,12 +82,12 @@ class GP
   public:
     /** @brief Number of regressors */
-    int n_reg;
+    std::size_t n_reg;
 
     /**
      * @brief Hyperparameters of the squared exponential kernel
      */
-    gprat_hyper::SEKParams kernel_params;
+    SEKParams kernel_params;
 
     /**
      * @brief Constructs a Gaussian Process (GP)
@@ -102,10 +105,10 @@ class GP
      */
     GP(std::vector<double> input,
        std::vector<double> output,
-       int n_tiles,
-       int n_tile_size,
-       int n_regressors,
-       std::vector<double> kernel_hyperparams,
+       std::size_t n_tiles,
+       std::size_t n_tile_size,
+       std::size_t n_regressors,
+       const std::vector<double> &kernel_hyperparams,
        std::vector<bool> trainable_bool,
        std::shared_ptr target);
 
@@ -124,10 +127,10 @@ class GP
      */
     GP(std::vector<double> input,
        std::vector<double> output,
-       int n_tiles,
-       int n_tile_size,
-       int n_regressors,
-       std::vector<double> kernel_hyperparams,
+       std::size_t n_tiles,
+       std::size_t n_tile_size,
+       std::size_t n_regressors,
+       const std::vector<double> &kernel_hyperparams,
        std::vector<bool> trainable_bool);
 
     /**
@@ -147,10 +150,10 @@ class GP
      */
     GP(std::vector<double> input,
        std::vector<double> output,
-       int n_tiles,
-       int n_tile_size,
-       int n_regressors,
-       std::vector<double> kernel_hyperparams,
+       std::size_t n_tiles,
+       std::size_t n_tile_size,
+       std::size_t n_regressors,
+       const std::vector<double> &kernel_hyperparams,
        std::vector<bool> trainable_bool,
        int gpu_id,
        int n_streams);
 
@@ -173,14 +176,14 @@ class GP
     /**
      * @brief Predict output for test input
      */
-    std::vector<double> predict(const std::vector<double> &test_data, int m_tiles, int m_tile_size);
+    std::vector<double> predict(const std::vector<double> &test_data, std::size_t m_tiles, std::size_t m_tile_size);
 
     /**
      * @brief Predict output for test input and additionally provide
      * uncertainty for the predictions.
 */
     std::vector<std::vector<double>>
-    predict_with_uncertainty(const std::vector<double> &test_data, int m_tiles, int m_tile_size);
+    predict_with_uncertainty(const std::vector<double> &test_data, std::size_t m_tiles, std::size_t m_tile_size);
 
     /**
      * @brief Predict output for test input and additionally compute full
@@ -193,7 +196,7 @@ class GP
      * @return Full covariance matrix
      */
     std::vector<std::vector<double>>
-    predict_with_full_cov(const std::vector<double> &test_data, int m_tiles, int m_tile_size);
+    predict_with_full_cov(const std::vector<double> &test_data, std::size_t m_tiles, std::size_t m_tile_size);
 
     /**
      * @brief Optimize hyperparameters
@@ -203,7 +206,7 @@ class GP
      *
      * @return losses
      */
-    std::vector<double> optimize(const gprat_hyper::AdamParams &adam_params);
+    std::vector<double> optimize(const AdamParams &adam_params);
 
     /**
      * @brief Perform a single optimization step
@@ -214,7 +217,7 @@ class GP
      *
      * @return loss
      */
-    double optimize_step(gprat_hyper::AdamParams &adam_params, int iter);
+    double optimize_step(AdamParams &adam_params, std::size_t iter);
 
     /**
      * @brief Calculate loss for given data and Gaussian process model
@@ -224,8 +227,9 @@ class GP
     /**
      * @brief Computes & returns cholesky decomposition
      */
-    std::vector> cholesky();
+    std::vector> cholesky();
 };
 
-} // namespace gprat
-#endif // end of GPRAT_C_H
+GPRAT_NS_END
+
+#endif
diff --git a/core/include/gpu/adapter_cublas.cuh b/core/include/gprat/gpu/adapter_cublas.cuh
similarity index 97%
rename from core/include/gpu/adapter_cublas.cuh
rename to core/include/gprat/gpu/adapter_cublas.cuh
index 1a69cb58..05972b36 100644
--- a/core/include/gpu/adapter_cublas.cuh
+++ b/core/include/gprat/gpu/adapter_cublas.cuh
@@ -1,10 +1,18 @@
-#ifndef ADAPTER_CUBLAS_H
-#define ADAPTER_CUBLAS_H
+#ifndef GPRAT_GPU_ADAPTER_CUBLAS_HPP
+#define GPRAT_GPU_ADAPTER_CUBLAS_HPP
+
+#pragma once
+
+#include "gprat/detail/config.hpp"
+
+#include "gprat/target.hpp"
 
-#include
 #include
 #include
-#include
+
+#include
+
+GPRAT_NS_BEGIN
 
 // Constants, compatible with cuBLAS
 
@@ -262,4 +270,6 @@ inline cublasSideMode_t opposite(cublasSideMode_t side)
     return (side == CUBLAS_SIDE_LEFT) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
 }
 
-#endif // end of ADAPTER_CUBLAS_H
+GPRAT_NS_END
+
+#endif
diff --git a/core/include/gpu/cuda_kernels.cuh b/core/include/gprat/gpu/cuda_kernels.cuh
similarity index 71%
rename from core/include/gpu/cuda_kernels.cuh
rename to core/include/gprat/gpu/cuda_kernels.cuh
index 4daef473..69a48d8f 100644
--- a/core/include/gpu/cuda_kernels.cuh
+++ b/core/include/gprat/gpu/cuda_kernels.cuh
@@ -1,5 +1,11 @@
-#ifndef CUDA_KERNELS_H
-#define CUDA_KERNELS_H
+#ifndef GPRAT_CUDA_KERNELS_HPP
+#define GPRAT_CUDA_KERNELS_HPP
+
+#pragma once
+
+#include "gprat/detail/config.hpp"
+
+GPRAT_NS_BEGIN
 
 /**
  * @brief Kernel to transpose a matrix.
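Aside: for intuition, the index mapping implemented by the device kernel declared in the next hunk can be written on the host as follows. This is a hedged reference sketch only, assuming `original` is a row-major width x height... wait, height-rows-by-width-columns matrix; `transpose_reference` and the layout assumption are not part of the library.

// Host-side reference of the transpose index mapping (illustrative only).
// Assumes `original` has `height` rows and `width` columns, row-major;
// `transposed` then has `width` rows and `height` columns.
#include <cstddef>

void transpose_reference(double *transposed, const double *original,
                         std::size_t width, std::size_t height)
{
    for (std::size_t row = 0; row < height; ++row)
        for (std::size_t col = 0; col < width; ++col)
            transposed[col * height + row] = original[row * width + col];
}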
@@ -11,4 +17,6 @@ */ __global__ void transpose(double *transposed, double *original, std::size_t width, std::size_t height); -#endif // CUDA_KERNELS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/cuda_utils.cuh b/core/include/gprat/gpu/cuda_utils.cuh similarity index 89% rename from core/include/gpu/cuda_utils.cuh rename to core/include/gprat/gpu/cuda_utils.cuh index 0c51ea76..128c6e22 100644 --- a/core/include/gpu/cuda_utils.cuh +++ b/core/include/gprat/gpu/cuda_utils.cuh @@ -1,14 +1,20 @@ -#ifndef CUDA_UTILS_H -#define CUDA_UTILS_H +#ifndef GPRAT_CUDA_UTILS_HPP +#define GPRAT_CUDA_UTILS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/target.hpp" #include #include #include #include #include -#include #include +GPRAT_NS_BEGIN + #define BLOCK_SIZE 16 using hpx::cuda::experimental::check_cuda_error; @@ -25,7 +31,7 @@ using hpx::cuda::experimental::check_cuda_error; * * @return A pointer to the copied vector on the device */ -inline double *copy_to_device(const std::vector &h_vector, gprat::CUDA_GPU &gpu) +inline double *copy_to_device(const std::vector &h_vector, CUDA_GPU &gpu) { double *d_vector; check_cuda_error(cudaMalloc(&d_vector, h_vector.size() * sizeof(double))); @@ -66,4 +72,6 @@ inline void free(std::vector> &vector) } } -#endif // end of CUDA_UTILS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/gp_algorithms.cuh b/core/include/gprat/gpu/gp_algorithms.cuh similarity index 89% rename from core/include/gpu/gp_algorithms.cuh rename to core/include/gprat/gpu/gp_algorithms.cuh index 51cbc355..8da8a956 100644 --- a/core/include/gpu/gp_algorithms.cuh +++ b/core/include/gprat/gpu/gp_algorithms.cuh @@ -1,11 +1,18 @@ -#ifndef GPU_GP_ALGORITHMS_H -#define GPU_GP_ALGORITHMS_H +#ifndef GPRAT_GPU_GP_ALGORITHMS_HPP +#define GPRAT_GPU_GP_ALGORITHMS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" +#include "gprat/tile_data.hpp" -#include "gp_kernels.hpp" -#include "target.hpp" #include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -28,8 +35,8 @@ double *gen_tile_covariance(const double *d_input, const std::size_t tile_column, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Generate the diagonal of a diagonal tile in the prior covariance matrix @@ -51,8 +58,8 @@ double *gen_tile_prior_covariance( const std::size_t tile_column, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Generate a tile of the cross-covariance matrix @@ -77,8 +84,8 @@ double *gen_tile_cross_covariance( const std::size_t n_row_tile_size, const std::size_t n_column_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Transpose a tile of size n_row_tile_size x n_column_tile_size @@ -92,7 +99,7 @@ double *gen_tile_cross_covariance( hpx::shared_future gen_tile_transpose(std::size_t n_row_tile_size, std::size_t n_column_tile_size, const hpx::shared_future f_tile, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Generate a tile of the output data @@ -104,7 +111,7 @@ hpx::shared_future gen_tile_transpose(std::size_t n_row_tile_size, * @return A tile of the output data of size n_tile_size */ double * 
-gen_tile_output(const std::size_t row, const std::size_t n_tile_size, const double *d_output, gprat::CUDA_GPU &gpu); +gen_tile_output(const std::size_t row, const std::size_t n_tile_size, const double *d_output, CUDA_GPU &gpu); /** * @brief Compute the L2-error norm over all tiles and elements @@ -126,7 +133,7 @@ double compute_error_norm(const std::size_t n_tiles, * * @return A tile filled with zeros of size N */ -double *gen_tile_zeros(std::size_t n_tile_size, gprat::CUDA_GPU &gpu); +double *gen_tile_zeros(std::size_t n_tile_size, CUDA_GPU &gpu); /** * @brief Allocates the tiled covariance matrix on the device given the training @@ -144,8 +151,8 @@ std::vector> assemble_tiled_covariance_matrix( const std::size_t n_tiles, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Allocates the tiled alpha vector on the device given the training @@ -159,7 +166,7 @@ std::vector> assemble_tiled_covariance_matrix( * @return A tiled alpha vector of size n_tiles x n_tile_size */ std::vector> assemble_alpha_tiles( - const double *d_output, const std::size_t n_tiles, const std::size_t n_tile_size, gprat::CUDA_GPU &gpu); + const double *d_output, const std::size_t n_tiles, const std::size_t n_tile_size, CUDA_GPU &gpu); /** * @brief Allocates the tiled cross covariance matrix on the device given the @@ -185,8 +192,8 @@ std::vector> assemble_cross_covariance_tiles( const std::size_t m_tile_size, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Allocates a tiled vector on the device and initializes it with zeros. @@ -198,7 +205,7 @@ std::vector> assemble_cross_covariance_tiles( * @return A tiled vector of size n_tiles x n_tile_size with zeros */ std::vector> -assemble_tiles_with_zeros(std::size_t n_tile_size, std::size_t n_tiles, gprat::CUDA_GPU &gpu); +assemble_tiles_with_zeros(std::size_t n_tile_size, std::size_t n_tiles, CUDA_GPU &gpu); /** * @brief Allocates the tiled prior covariance matrix on the device given the @@ -218,8 +225,8 @@ std::vector> assemble_prior_K_tiles( const std::size_t m_tiles, const std::size_t m_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Allocates the posterior covariance matrix. 
@@ -238,8 +245,8 @@ std::vector> assemble_prior_K_tiles_full( const std::size_t m_tiles, const std::size_t m_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Allocates the tiled transpose cross covariance matrix on the device @@ -261,7 +268,7 @@ std::vector> assemble_t_cross_covariance_tiles( const std::size_t m_tiles, const std::size_t n_tile_size, const std::size_t m_tile_size, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Allocates the output vector on the device given the training output @@ -272,7 +279,7 @@ std::vector> assemble_t_cross_covariance_tiles( * @param gpu GPU target for computations */ std::vector> assemble_y_tiles( - const double *d_training_output, const std::size_t n_tiles, const std::size_t n_tile_size, gprat::CUDA_GPU &gpu); + const double *d_training_output, const std::size_t n_tiles, const std::size_t n_tile_size, CUDA_GPU &gpu); /** * @brief Allocates the tiled covariance matrix on the device given the training @@ -286,7 +293,7 @@ std::vector> assemble_y_tiles( std::vector copy_tiled_vector_to_host_vector(std::vector> &d_tiles, std::size_t n_tile_size, std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Moves lower triangular tiles of the covariance matrix to the host. @@ -298,11 +305,11 @@ std::vector copy_tiled_vector_to_host_vector(std::vector> move_lower_tiled_matrix_to_host( +std::vector> move_lower_tiled_matrix_to_host( const std::vector> &d_tiles, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Frees the device memory of the lower triangular tiles of the covariance matrix. @@ -314,4 +321,6 @@ void free_lower_tiled_matrix(const std::vector> &d_ } // end of namespace gpu -#endif // end of GPU_GP_ALGORITHMS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/gp_functions.cuh b/core/include/gprat/gpu/gp_functions.cuh similarity index 87% rename from core/include/gpu/gp_functions.cuh rename to core/include/gprat/gpu/gp_functions.cuh index 6ea5bd0a..d8746d33 100644 --- a/core/include/gpu/gp_functions.cuh +++ b/core/include/gprat/gpu/gp_functions.cuh @@ -1,9 +1,15 @@ #ifndef GPU_GP_FUNCTIONS_H #define GPU_GP_FUNCTIONS_H -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include "target.hpp" +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/hyperparameters.hpp" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" +#include "gprat/tile_data.hpp" + +GPRAT_NS_BEGIN namespace gpu { @@ -28,13 +34,13 @@ std::vector predict(const std::vector &training_input, const std::vector &training_output, const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Compute the predictions with uncertainties. @@ -56,13 +62,13 @@ std::vector> predict_with_uncertainty( const std::vector &training_input, const std::vector &training_output, const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Compute the predictions with full covariance matrix. 
@@ -84,13 +90,13 @@ std::vector> predict_with_full_cov( const std::vector &training_input, const std::vector &training_output, const std::vector &test_data, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Compute loss for given data and Gaussian process model @@ -107,11 +113,11 @@ std::vector> predict_with_full_cov( */ double compute_loss(const std::vector &training_input, const std::vector &training_output, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform optimization for a given number of iterations @@ -137,10 +143,10 @@ optimize(const std::vector &training_input, int n_tiles, int n_tile_size, int n_regressors, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, + const AdamParams &adam_params, + SEKParams &sek_params, std::vector trainable_params, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform a single optimization step @@ -166,11 +172,11 @@ double optimize_step(const std::vector &training_input, int n_tiles, int n_tile_size, int n_regressors, - gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, + AdamParams &adam_params, + SEKParams &sek_params, std::vector trainable_params, int iter, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform Cholesky decomposition (+ Assembly) @@ -186,14 +192,16 @@ double optimize_step(const std::vector &training_input, * * @return The tiled Cholesky factor */ -std::vector> +std::vector> cholesky(const std::vector &training_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); } // end of namespace gpu +GPRAT_NS_END + #endif diff --git a/core/include/gpu/gp_optimizer.cuh b/core/include/gprat/gpu/gp_optimizer.cuh similarity index 93% rename from core/include/gpu/gp_optimizer.cuh rename to core/include/gprat/gpu/gp_optimizer.cuh index d0c5dd3a..61495de0 100644 --- a/core/include/gpu/gp_optimizer.cuh +++ b/core/include/gprat/gpu/gp_optimizer.cuh @@ -1,12 +1,19 @@ -#ifndef GPU_GP_OPTIMIZER_H -#define GPU_GP_OPTIMIZER_H +#ifndef GPRAT_GPU_GP_OPTIMIZER_HPP +#define GPRAT_GPU_GP_OPTIMIZER_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include "gprat/hyperparameters.hpp" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include "target.hpp" #include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -56,7 +63,7 @@ double compute_sigmoid(const double parameter); double compute_covariance_distance(std::size_t i_global, std::size_t j_global, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &i_input, const std::vector &j_input); @@ -77,7 +84,7 @@ std::vector gen_tile_distance( std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &input); /** @@ -96,7 +103,7 @@ std::vector gen_tile_covariance_with_distance( std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists); /** @@ -116,7 +123,7 @@ gen_tile_grad_v(std::size_t row, std::size_t col, std::size_t N, std::size_t
n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists); /** @@ -136,7 +143,7 @@ gen_tile_grad_l(std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists); /** @@ -159,7 +166,7 @@ std::vector gen_tile_grad_v_trans(std::size_t N, const std::vector -gen_tile_grad_l_trans(std::size_t N, const hpx::shared_future f_grad_l_tile, gprat::CUDA_GPU &gpu); +gen_tile_grad_l_trans(std::size_t N, const hpx::shared_future f_grad_l_tile, CUDA_GPU &gpu); /** * @brief Compute hyper-parameter beta_1 or beta_2 to power t. @@ -187,7 +194,7 @@ compute_loss(const hpx::shared_future &K_diag_tile, const hpx::shared_future &alpha_tile, const hpx::shared_future &y_tile, std::size_t N, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Add up negative-log likelihood loss for all tiles. @@ -260,8 +267,8 @@ double update_second_moment(const double &gradient, double v_T, const double &be */ hpx::shared_future update_param(const double unconstrained_hyperparam, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, double m_T, double v_T, const std::vector beta1_T, @@ -319,7 +326,7 @@ sum_gradright(const std::vector &inter_alpha, const std::vector */ double sum_noise_gradleft(const std::vector &ft_invK, double grad, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, std::size_t N, std::size_t n_tiles); @@ -334,8 +341,10 @@ double sum_noise_gradleft(const std::vector &ft_invK, * @return The sum of the noise gradient */ double -sum_noise_gradright(const std::vector &alpha, double grad, gprat_hyper::SEKParams sek_params, std::size_t N); +sum_noise_gradright(const std::vector &alpha, double grad, SEKParams sek_params, std::size_t N); } // end of namespace gpu -#endif // end of GPU_GP_OPTIMIZER_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/gp_uncertainty.cuh b/core/include/gprat/gpu/gp_uncertainty.cuh similarity index 71% rename from core/include/gpu/gp_uncertainty.cuh rename to core/include/gprat/gpu/gp_uncertainty.cuh index 8c2dce18..4a93eccb 100644 --- a/core/include/gpu/gp_uncertainty.cuh +++ b/core/include/gprat/gpu/gp_uncertainty.cuh @@ -1,7 +1,13 @@ -#ifndef GPU_GP_UNCERTAINTY_H -#define GPU_GP_UNCERTAINTY_H +#ifndef GPRAT_GPU_GP_UNCERTAINTY_HPP +#define GPRAT_GPU_GP_UNCERTAINTY_HPP -#include "target.hpp" +#pragma once + +#include "gprat/detail/config.hpp" + +#include "gprat/target.hpp" + +GPRAT_NS_BEGIN namespace gpu { @@ -16,7 +22,7 @@ namespace gpu * @return Diagonal elements of posterior covariance matrix */ hpx::shared_future diag_posterior( - const hpx::shared_future A, const hpx::shared_future B, std::size_t M, gprat::CUDA_GPU &gpu); + const hpx::shared_future A, const hpx::shared_future B, std::size_t M, CUDA_GPU &gpu); /** * @brief Retrieve diagonal elements of posterior covariance matrix. 
@@ -26,8 +32,10 @@ hpx::shared_future diag_posterior( * * @return Diagonal elements of posterior covariance matrix */ -hpx::shared_future diag_tile(const hpx::shared_future A, std::size_t M, gprat::CUDA_GPU &gpu); +hpx::shared_future diag_tile(const hpx::shared_future A, std::size_t M, CUDA_GPU &gpu); } // end of namespace gpu -#endif // end of GPU_GP_UNCERTAINTY_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/tiled_algorithms.cuh b/core/include/gprat/gpu/tiled_algorithms.cuh similarity index 92% rename from core/include/gpu/tiled_algorithms.cuh rename to core/include/gprat/gpu/tiled_algorithms.cuh index 78c6f5cb..38875e1e 100644 --- a/core/include/gpu/tiled_algorithms.cuh +++ b/core/include/gprat/gpu/tiled_algorithms.cuh @@ -1,12 +1,19 @@ -#ifndef GPU_TILED_ALGORITHMS_H -#define GPU_TILED_ALGORITHMS_H +#ifndef GPRAT_GPU_TILED_ALGORITHMS_HPP +#define GPRAT_GPU_TILED_ALGORITHMS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include "gprat/hyperparameters.hpp" +#include "gprat/target.hpp" +#include "gprat/kernels.hpp" -#include "gp_hyperparameters.hpp" -#include "target.hpp" #include -#include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -26,7 +33,7 @@ namespace gpu void right_looking_cholesky_tiled(std::vector> &ft_tiles, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu, + CUDA_GPU &gpu, const cusolverDnHandle_t &cusolver); // Tiled Triangular Solve Algorithms @@ -44,7 +51,7 @@ void forward_solve_tiled(std::vector> &ft_tiles, std::vector> &ft_rhs, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled backward triangular matrix-vector solve. @@ -59,7 +66,7 @@ void backward_solve_tiled(std::vector> &ft_tiles, std::vector> &ft_rhs, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled forward triangular matrix-matrix solve. @@ -79,7 +86,7 @@ void forward_solve_tiled_matrix( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled backward triangular matrix-matrix solve. 
@@ -99,7 +106,7 @@ void backward_solve_tiled_matrix( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled matrix-vector multiplication @@ -120,7 +127,7 @@ void matrix_vector_tiled(std::vector> &ft_tiles, const std::size_t N_col, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled symmetric k-rank update on diagonal tiles @@ -140,14 +147,14 @@ void symmetric_matrix_matrix_diagonal_tiled( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); void compute_gemm_of_invK_y(std::vector> &ft_invK, std::vector> &ft_y, std::vector> &ft_alpha, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); // Tiled Loss hpx::shared_future compute_loss_tiled( @@ -156,7 +163,7 @@ hpx::shared_future compute_loss_tiled( std::vector> &ft_y, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); // Tiled Diagonal of Posterior Covariance Matrix void symmetric_matrix_matrix_tiled( @@ -166,7 +173,7 @@ void symmetric_matrix_matrix_tiled( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Compute the difference between two tiled vectors @@ -183,14 +190,14 @@ void vector_difference_tiled(std::vector> &ft_prior std::vector> &ft_vector, const std::size_t m_tile_size, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); // Tiled Prediction Uncertainty void matrix_diagonal_tiled(std::vector> &ft_priorK, std::vector> &ft_vector, const std::size_t m_tile_size, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); // Compute I-y*y^T*inv(K) void update_grad_K_tiled_mkl(std::vector> &ft_tiles, @@ -198,7 +205,7 @@ void update_grad_K_tiled_mkl(std::vector> &ft_tiles const std::vector> &ft_v2, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Updates the lengthscale hyperparameter of the SEK kernel using Adam. 
@@ -223,8 +230,8 @@ double update_lengthscale( const std::vector> &ft_invK, const std::vector> &ft_gradparam, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -232,7 +239,7 @@ double update_lengthscale( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Updates the vertical lengthscale hyperparameter of the SEK kernel @@ -258,8 +265,8 @@ double update_vertical_lengthscale( const std::vector> &ft_invK, const std::vector> &ft_gradparam, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -267,7 +274,7 @@ double update_vertical_lengthscale( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Updates a hyperparameter of the SEK kernel using Adam @@ -290,8 +297,8 @@ double update_vertical_lengthscale( double update_noise_variance( const std::vector> &ft_invK, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -299,8 +306,10 @@ double update_noise_variance( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); } // end of namespace gpu -#endif // end of GPU_TILED_ALGORITHMS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gp_hyperparameters.hpp b/core/include/gprat/hyperparameters.hpp similarity index 56% rename from core/include/gp_hyperparameters.hpp rename to core/include/gprat/hyperparameters.hpp index cd9cf5a8..dae073dc 100644 --- a/core/include/gp_hyperparameters.hpp +++ b/core/include/gprat/hyperparameters.hpp @@ -1,10 +1,14 @@ -#ifndef GP_HYPERPARAMETERS_H -#define GP_HYPERPARAMETERS_H +#ifndef GPRAT_GPHYPERPARAMETERS_HPP +#define GPRAT_GPHYPERPARAMETERS_HPP +#pragma once + +#include "gprat/detail/config.hpp" + +#include #include -namespace gprat_hyper -{ +GPRAT_NS_BEGIN /** * @brief Hyperparameters for the Adam optimizer @@ -34,7 +38,7 @@ struct AdamParams /** * @brief Number of optimization iterations */ - int opt_iter; + std::size_t opt_iter; /** * @brief Initialize hyperparameters @@ -44,10 +48,8 @@ struct AdamParams * @param b1 beta1 * @param b2 beta2 * @param eps epsilon * @param opt_i number of optimization iterations - * @param M_T_init initial values for first moment vector - * @param V_T_init initial values for second moment vector */ - AdamParams(double lr = 0.001, double b1 = 0.9, double b2 = 0.999, double eps = 1e-8, int opt_i = 0); + AdamParams(double lr = 0.001, double b1 = 0.9, double b2 = 0.999, double eps = 1e-8, std::size_t opt_i = 0); /** * @brief Returns a string representation of the hyperparameters @@ -55,6 +57,30 @@ struct AdamParams std::string repr() const; }; -} // namespace gprat_hyper +template +void save_construct_data(Archive &ar, const AdamParams *v, const unsigned int) +{ + ar << v->learning_rate; + ar << v->beta1; + ar << v->beta2; + ar << v->epsilon; + ar << v->opt_iter; +} + +template +void load_construct_data(Archive &ar, AdamParams *v, const unsigned int) +{ + double learning_rate, beta1, beta2, epsilon; + std::size_t
opt_iter; + ar >> learning_rate; + ar >> beta1; + ar >> beta2; + ar >> epsilon; + ar >> opt_iter; + + new (v) AdamParams(learning_rate, beta1, beta2, epsilon, opt_iter); +} + +GPRAT_NS_END -#endif // GP_HYPERPARAMETERS_H +#endif diff --git a/core/include/gp_kernels.hpp b/core/include/gprat/kernels.hpp similarity index 55% rename from core/include/gp_kernels.hpp rename to core/include/gprat/kernels.hpp index c1346f32..daa7798b 100644 --- a/core/include/gp_kernels.hpp +++ b/core/include/gprat/kernels.hpp @@ -1,12 +1,15 @@ -#ifndef GP_KERNELS_H -#define GP_KERNELS_H +#ifndef GPRAT_GPKERNELS_HPP +#define GPRAT_GPKERNELS_HPP -#include +#pragma once -// #include +#include "gprat/detail/config.hpp" -namespace gprat_hyper -{ +#include +#include +#include + +GPRAT_NS_BEGIN /** * @brief Squared Exponential Kernel Parameters @@ -41,12 +44,12 @@ struct SEKParams /** * @brief Construct a new SEKParams object * - * @param lengthscale Lengthscale: variance of training output - * @param vertical_lengthscale Vertical Lengthscale: standard deviation + * @param in_lengthscale Lengthscale: variance of training output + * @param in_vertical_lengthscale Vertical Lengthscale: standard deviation * of training input - * @param noise_variance Noise Variance: small value + * @param in_noise_variance Noise Variance: small value */ - SEKParams(double lengthscale_, double vertical_lengthscale_, double noise_variance_); + SEKParams(double in_lengthscale, double in_vertical_lengthscale, double in_noise_variance); /** * @brief Return the number of parameters @@ -77,6 +80,31 @@ struct SEKParams const double &get_param(std::size_t index) const; }; -} // namespace gprat_hyper +template +void save_construct_data(Archive &ar, const SEKParams *v, const unsigned int) +{ + ar << v->lengthscale; + ar << v->vertical_lengthscale; + ar << v->noise_variance; +} + +template +void load_construct_data(Archive &ar, SEKParams *v, const unsigned int) +{ + double lengthscale, vertical_lengthscale, noise_variance; + ar >> lengthscale; + ar >> vertical_lengthscale; + ar >> noise_variance; + + new (v) SEKParams(lengthscale, vertical_lengthscale, noise_variance); +} + +template +void serialize(Archive &ar, SEKParams &pt, const unsigned int) +{ + ar & pt.m_T & pt.w_T; +} + +GPRAT_NS_END -#endif // end of GP_KERNELS_H +#endif diff --git a/core/include/gprat/performance_counters.hpp b/core/include/gprat/performance_counters.hpp new file mode 100644 index 00000000..13054735 --- /dev/null +++ b/core/include/gprat/performance_counters.hpp @@ -0,0 +1,102 @@ +#ifndef GPRAT_PERFORMANCE_COUNTERS_HPP +#define GPRAT_PERFORMANCE_COUNTERS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include +#include +#include +#include +#include +#include +#include + +GPRAT_NS_BEGIN + +/// The following is a very simple way of defining per-function metrics by using the function itself as a template +/// parameter ensuring that each function receives exactly one instantiation. +template +struct function_performance_metrics +{ + /// Number of times the function was called + static std::atomic num_calls; + + /// Total wall-clock time elapsed inside the function + static std::atomic elapsed_ns; +}; + +template +/*static*/ std::atomic function_performance_metrics::num_calls(0); +template +/*static*/ std::atomic function_performance_metrics::elapsed_ns(0); + +/// @brief This RAII helper allows us to time a function's total wall-clock execution time with minimal code. 
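+/// Construction increments the function's call counter and starts an HPX high-resolution timer; destruction adds
+/// the elapsed nanoseconds to the function's running total. It is normally used through the GPRAT_TIME_FUNCTION
+/// macro defined below; a minimal sketch (`my_blas_op` is a hypothetical caller, not part of this patch):
+///
+///   double my_blas_op() { GPRAT_TIME_FUNCTION(&my_blas_op); /* ... timed work ... */ return 0.0; }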
+struct scoped_function_timer +{ + explicit scoped_function_timer(std::atomic &num_calls, std::atomic &in_total) : + total(in_total) + { + ++num_calls; + } + + ~scoped_function_timer() + { + const auto elapsed = timer.elapsed_nanoseconds(); + HPX_ASSERT(elapsed >= 0); + if (elapsed > 0) + { + total += static_cast(elapsed); + } + } + + std::atomic &total; + hpx::chrono::high_resolution_timer timer; +}; + +/// @brief Time the execution of the enclosing function from the current point to its end. +/// @param local_function The function key that we're collecting performance information for. Usually the enclosing +/// function. +#define GPRAT_TIME_FUNCTION(local_function) \ + scoped_function_timer _gprat_fn_timer(function_performance_metrics::num_calls, \ + function_performance_metrics::elapsed_ns) + +template +std::uint64_t get_and_reset_function_elapsed(bool reset) +{ + return hpx::util::get_and_reset_value(function_performance_metrics::elapsed_ns, reset); +} + +template +std::uint64_t get_and_reset_function_calls(bool reset) +{ + return hpx::util::get_and_reset_value(function_performance_metrics::num_calls, reset); +} + +void track_tile_data_allocation(std::size_t size); +void track_tile_data_deallocation(std::size_t size); + +void register_performance_counters(); + +void force_evict_memory(const void *start, std::size_t size); + +template +void force_evict_memory(std::span data) +{ + force_evict_memory(data.data(), data.size_bytes()); +} + +#ifdef GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS +/// @brief Force-evict a memory span from the cache for benchmarking purposes. +/// @param data The memory region to evict +#define GPRAT_BENCHMARK_FORCE_EVICT(data) force_evict_memory(data) +#else +/// @brief Force-evict a memory span from the cache for benchmarking purposes. +/// @param data The memory region to evict +#define GPRAT_BENCHMARK_FORCE_EVICT(data) (void) data +#endif + +GPRAT_NS_END + +#endif diff --git a/core/include/gprat/scheduler.hpp b/core/include/gprat/scheduler.hpp new file mode 100644 index 00000000..2da7ccd7 --- /dev/null +++ b/core/include/gprat/scheduler.hpp @@ -0,0 +1,183 @@ +#ifndef GPRAT_CPU_SCHEDULER_HPP +#define GPRAT_CPU_SCHEDULER_HPP + +#pragma once + +#include "gprat/detail/async_helpers.hpp" + +// TODO: move to separate header +#include "gprat/tile_data.hpp" + +#include +#include + +GPRAT_NS_BEGIN + +using tiled_scheduler_local = basic_local_scheduler; + +template +using tiled_dataset_local = std::vector>>; + +template +struct tile_dataset_type; + +template +struct tile_dataset_type +{ + using type = tiled_dataset_local; +}; + +template +tiled_dataset_local make_tiled_dataset(const tiled_scheduler_local &, std::size_t num_tiles, Mapper &&) +{ + return std::vector>>{ num_tiles }; +} + +/// @brief This namespace contains the operation placement functions for all schedulers. 
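+/// Each placement function maps one tiled operation, identified by the tile count and the operation's tile indices,
+/// to the index of the partition that should execute it. For the purely local scheduler below, every operation is
+/// placed at index 0; a distributed scheduler could instead derive the placement from the tile coordinates, e.g.
+/// (hypothetical sketch, `dist_scheduler` is illustrative and not part of this patch):
+///
+///   std::size_t covariance_tile(const dist_scheduler &s, std::size_t n_tiles, std::size_t row, std::size_t col)
+///   {
+///       return (row * n_tiles + col) % s.num_partitions();
+///   }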
+namespace schedule +{ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4100) +#endif + +// ============================================================= +// local scheduler + +constexpr std::size_t +covariance_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +cross_covariance_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t alpha_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) { return 0; } + +constexpr std::size_t prediction_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) +{ + return 0; +} + +constexpr std::size_t +t_cross_covariance_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +prior_K_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +K_inv_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +K_grad_v_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +K_grad_l_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t uncertainty_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) +{ + return 0; +} + +constexpr std::size_t inter_alpha_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) +{ + return 0; +} + +constexpr std::size_t diag_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) { return 0; } + +constexpr std::size_t cholesky_potrf(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) +{ + return 0; +} + +constexpr std::size_t cholesky_syrk(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t m) +{ + return 0; +} + +constexpr std::size_t +cholesky_trsm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t +cholesky_gemm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k, std::size_t m, std::size_t n) +{ + return 0; +} + +constexpr std::size_t solve_trsv(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +constexpr std::size_t solve_trsm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +constexpr std::size_t solve_gemv(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t +solve_matrix_trsm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t c, std::size_t k) +{ + return 0; +} + +constexpr std::size_t +solve_matrix_gemm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t c, std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t +multiply_gemv(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t k_rank_dot_diag_syrk(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) +{ + return 0; +} + +constexpr std::size_t +k_rank_gemm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t c, 
std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t vector_axpy(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +constexpr std::size_t get_diagonal(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +constexpr std::size_t compute_loss(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +} // namespace schedule + +GPRAT_NS_END + +#endif diff --git a/core/include/target.hpp b/core/include/gprat/target.hpp similarity index 97% rename from core/include/target.hpp rename to core/include/gprat/target.hpp index 8b66cb0b..13487114 100644 --- a/core/include/target.hpp +++ b/core/include/gprat/target.hpp @@ -1,5 +1,9 @@ -#ifndef TARGET_H -#define TARGET_H +#ifndef GPRAT_TARGET_H +#define GPRAT_TARGET_H + +#pragma once + +#include "gprat/detail/config.hpp" #include @@ -8,8 +12,7 @@ #include #endif -namespace gprat -{ +GPRAT_NS_BEGIN /** * @brief This class represents the target on which to perform the Gaussian @@ -203,6 +206,6 @@ void print_available_gpus(); */ int gpu_count(); -} // namespace gprat +GPRAT_NS_END -#endif // end of TARGET_H +#endif diff --git a/core/include/gprat/tile_data.hpp b/core/include/gprat/tile_data.hpp new file mode 100644 index 00000000..a2615ad8 --- /dev/null +++ b/core/include/gprat/tile_data.hpp @@ -0,0 +1,170 @@ +#ifndef GPRAT_TILE_DATA_HPP +#define GPRAT_TILE_DATA_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include +#include + +GPRAT_NS_BEGIN + +namespace detail +{ +void *allocate_tile_data(std::size_t num_bytes); +void deallocate_tile_data(void *p, std::size_t num_bytes); + +template +struct tile_data_allocator +{ + typedef T value_type; + + tile_data_allocator() = default; + + template + constexpr tile_data_allocator(const tile_data_allocator &) noexcept + { } + + [[nodiscard]] T *allocate(std::size_t n) + { + if (n > (std::numeric_limits::max)() / sizeof(T)) + { + throw std::bad_array_new_length(); + } + + if (auto p = static_cast(allocate_tile_data(n * sizeof(T)))) + { + return p; + } + + throw std::bad_alloc(); + } + + void deallocate(T *p, std::size_t n) noexcept { deallocate_tile_data(p, n * sizeof(T)); } +}; + +template +bool operator==(const tile_data_allocator &, const tile_data_allocator &) +{ + return true; +} + +template +bool operator!=(const tile_data_allocator &, const tile_data_allocator &) +{ + return false; +} +} // namespace detail + +/** + * @brief Non-mutable reference-counted dynamic array of a given type T. + * This class represents a simple reference-counted non-resizeable buffer with elements of type T. + * It can be serialized by HPX and thus be used as a parameter for HPX actions. + * This type is intended to be used for parameters and attributes that do not require mutable data (i.e., only read + * access) + * + * @tparam T Element type of the tile. Usually some numeric type like double or float. This class currently only + * requires T to be serializable by HPX. + */ +template +class const_tile_data +{ + protected: + typedef hpx::serialization::serialize_buffer> cpu_buffer_type; + + struct hold_reference + { + explicit hold_reference(const cpu_buffer_type &data) : + data_(data) + { } + + void operator()(const T *) const { } // no deletion necessary + + cpu_buffer_type data_; + }; + + public: + const_tile_data() = default; + + // Create a new (uninitialized) tile_data of the given size. 
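+    // The buffer is allocated but its contents are left indeterminate; callers are expected to fill the tile
+    // (for example through the mutable_tile_data subclass below) before reading from it.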
+ explicit const_tile_data(std::size_t size) : + cpu_data_(size) + { } + + // Create a tile_data which acts as a proxy to a part of the embedded array. + // The proxy is assumed to refer to either the left or the right boundary + // element. + const_tile_data(const const_tile_data &base, std::size_t offset, std::size_t size) : + cpu_data_(base.cpu_data_.data() + offset, + size, + cpu_buffer_type::reference, + hold_reference(base.cpu_data_)) // keep referenced tile_data alive + { } + + [[nodiscard]] const T *data() const noexcept { return cpu_data_.data(); } + + [[nodiscard]] std::size_t size() const noexcept { return cpu_data_.size(); } + + [[nodiscard]] const T *begin() const noexcept { return cpu_data_.data(); } + + [[nodiscard]] const T *end() const noexcept { return cpu_data_.data() + cpu_data_.size(); } + + [[nodiscard]] const T &operator[](std::size_t idx) const { return cpu_data_[idx]; } + + [[nodiscard]] std::span as_span() const noexcept { return { cpu_data_.data(), cpu_data_.size() }; } + + // ReSharper disable once CppNonExplicitConversionOperator + operator std::span() const noexcept // NOLINT(*-explicit-constructor) + { + return { cpu_data_.data(), cpu_data_.size() }; + } + + friend bool operator==(const const_tile_data &a, const const_tile_data &b) noexcept + { + return a.cpu_data_ == b.cpu_data_; + } + + protected: + friend class hpx::serialization::access; + + template + void serialize(Archive &ar, const unsigned int) + { + // clang-format off + ar & cpu_data_; + // clang-format on + } + + cpu_buffer_type cpu_data_; +}; + +/** + * A mutable version of const_tile_data. + * + * @tparam T Element type of the tile. See @ref const_tile_data + */ +template +class mutable_tile_data : public const_tile_data +{ + public: + using const_tile_data::const_tile_data; + + [[nodiscard]] T *data() const noexcept { return const_cast(this->cpu_data_.data()); } + + [[nodiscard]] T *begin() const noexcept { return const_cast(this->cpu_data_.data()); } + + [[nodiscard]] T *end() const noexcept { return const_cast(this->cpu_data_.data()) + this->cpu_data_.size(); } + + [[nodiscard]] T &operator[](std::size_t idx) const { return this->cpu_data_[idx]; } + + // ReSharper disable once CppNonExplicitConversionOperator + operator std::span() noexcept // NOLINT(*-explicit-constructor) + { + return { this->cpu_data_.data(), this->cpu_data_.size() }; + } +}; + +GPRAT_NS_END + +#endif diff --git a/core/include/utils_c.hpp b/core/include/gprat/utils.hpp similarity index 75% rename from core/include/utils_c.hpp rename to core/include/gprat/utils.hpp index 591bb7ee..86a4ddd2 100644 --- a/core/include/utils_c.hpp +++ b/core/include/gprat/utils.hpp @@ -1,5 +1,9 @@ -#ifndef UTILS_C_H -#define UTILS_C_H +#ifndef GPRAT_UTILS_HPP +#define GPRAT_UTILS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" #include #include @@ -7,8 +11,8 @@ #include #include -namespace utils -{ +GPRAT_NS_BEGIN + /** * @brief Compute the number of tiles for training data, given the number of * samples and the size of each tile. @@ -16,16 +20,16 @@ namespace utils * @param n_samples Number of samples * @param n_tile_size Size of each tile */ -int compute_train_tiles(int n_samples, int n_tile_size); +std::size_t compute_train_tiles(std::size_t n_samples, std::size_t n_tile_size); /** * @brief Compute the number of tiles for training data, given the number of * samples and the size of each tile. 
* * @param n_samples Number of samples - * @param n_tile_size Size of each tile + * @param n_tiles Number of tiles */ -int compute_train_tile_size(int n_samples, int n_tiles); +std::size_t compute_train_tile_size(std::size_t n_samples, std::size_t n_tiles); /** * @brief Compute the number of test tiles and the size of a test tile. @@ -37,7 +41,8 @@ int compute_train_tile_size(int n_samples, int n_tiles); * @param n_tiles Number of tiles * @param n_tile_size Size of each tile */ -std::pair compute_test_tiles(int n_test, int n_tiles, int n_tile_size); +std::pair +compute_test_tiles(std::size_t n_test, std::size_t n_tiles, std::size_t n_tile_size); /** * @brief Load data from file @@ -45,7 +50,7 @@ std::pair compute_test_tiles(int n_test, int n_tiles, int n_tile_size) * @param file_path Path to the file * @param n_samples Number of samples to load */ -std::vector load_data(const std::string &file_path, int n_samples, int offset); +std::vector load_data(const std::string &file_path, std::size_t n_samples, std::size_t offset); /** * @brief Print a vector @@ -85,6 +90,6 @@ void stop_hpx_runtime(); */ bool compiled_with_cuda(); -} // namespace utils +GPRAT_NS_END #endif diff --git a/core/src/cpu/adapter_cblas_fp32.cpp b/core/src/cpu/adapter_cblas_fp32.cpp index d91a3867..4cfbea51 100644 --- a/core/src/cpu/adapter_cblas_fp32.cpp +++ b/core/src/cpu/adapter_cblas_fp32.cpp @@ -1,4 +1,10 @@ -#include "cpu/adapter_cblas_fp32.hpp" +#include "gprat/cpu/adapter_cblas_fp32.hpp" + +#include "gprat/performance_counters.hpp" + +#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS +#include +#endif #ifdef GPRAT_ENABLE_MKL // MKL CBLAS and LAPACKE @@ -9,28 +15,32 @@ #include "lapacke.h" #endif +GPRAT_NS_BEGIN + // BLAS level 3 operations -vector_future potrf(vector_future f_A, const int N) +mutable_tile_data potrf(const mutable_tile_data &A, const int N) { - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_TIME_FUNCTION(&potrf); // POTRF: in-place Cholesky decomposition of A // use spotrf2 recursive version for better stability LAPACKE_spotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N); // return factorized matrix L - return hpx::make_ready_future(A); + return A; } -vector_future trsm(vector_future f_L, - vector_future f_A, - const int N, - const int M, - const BLAS_TRANSPOSE transpose_L, - const BLAS_SIDE side_L) - +mutable_tile_data +trsm(const const_tile_data &L, + const mutable_tile_data &A, + const int N, + const int M, + const BLAS_TRANSPOSE transpose_L, + const BLAS_SIDE side_L) { - auto L = f_L.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(L.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_TIME_FUNCTION(&trsm); // TRSM constants const float alpha = 1.0; // TRSM: in-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular @@ -47,36 +57,37 @@ vector_future trsm(vector_future f_L, N, A.data(), M); - // return vector - return hpx::make_ready_future(A); + return A; } -vector_future syrk(vector_future f_A, vector_future f_B, const int N) +mutable_tile_data syrk(const mutable_tile_data &A, const const_tile_data &B, const int N) { - auto B = f_B.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_TIME_FUNCTION(&syrk); // SYRK constants const float alpha = -1.0; const float beta = 1.0; // SYRK:A = A - B * B^T cblas_ssyrk(CblasRowMajor, CblasLower, CblasNoTrans, N, N, alpha, B.data(), N, beta, A.data(), N); // return updated matrix A - return hpx::make_ready_future(A); + return A; }
-vector_future -gemm(vector_future f_A, - vector_future f_B, - vector_future f_C, +mutable_tile_data +gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &C, const int N, const int M, const int K, const BLAS_TRANSPOSE transpose_A, const BLAS_TRANSPOSE transpose_B) { - auto C = f_C.get(); - auto B = f_B.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(C.as_span()); + GPRAT_TIME_FUNCTION(&gemm); // GEMM constants const float alpha = -1.0; const float beta = 1.0; @@ -97,15 +108,17 @@ gemm(vector_future f_A, C.data(), M); // return updated matrix C - return hpx::make_ready_future(C); + return C; } // BLAS level 2 operations -vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L) +mutable_tile_data +trsv(const const_tile_data &L, const mutable_tile_data &a, const int N, const BLAS_TRANSPOSE transpose_L) { - auto L = f_L.get(); - auto a = f_a.get(); + GPRAT_BENCHMARK_FORCE_EVICT(L.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(a.as_span()); + GPRAT_TIME_FUNCTION(&trsv); // TRSV: In-place solve L(^T) * x = a where L lower triangular cblas_strsv(CblasRowMajor, CblasLower, @@ -117,20 +130,22 @@ vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS a.data(), 1); // return solution vector x - return hpx::make_ready_future(a); + return a; } -vector_future gemv(vector_future f_A, - vector_future f_a, - vector_future f_b, - const int N, - const int M, - const BLAS_ALPHA alpha, - const BLAS_TRANSPOSE transpose_A) +mutable_tile_data +gemv(const const_tile_data &A, + const const_tile_data &a, + const mutable_tile_data &b, + const int N, + const int M, + const BLAS_ALPHA alpha, + const BLAS_TRANSPOSE transpose_A) { - auto A = f_A.get(); - auto a = f_a.get(); - auto b = f_b.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(a.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(b.as_span()); + GPRAT_TIME_FUNCTION(&gemv); // GEMV constants // const float alpha = -1.0; const float beta = 1.0; @@ -149,47 +164,102 @@ vector_future gemv(vector_future f_A, b.data(), 1); // return updated vector b - return hpx::make_ready_future(b); + return b; } -vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const int N, const int M) +mutable_tile_data +dot_diag_syrk(const const_tile_data &A, const mutable_tile_data &r, const int N, const int M) { - auto A = f_A.get(); - auto r = f_r.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(r.as_span()); + GPRAT_TIME_FUNCTION(&dot_diag_syrk); + auto r_p = r.data(); + auto A_p = A.data(); // r = r + diag(A^T * A) for (std::size_t j = 0; j < static_cast(M); ++j) { // Extract the j-th column and compute the dot product with itself - r[j] += cblas_sdot(N, &A[j], M, &A[j], M); + r_p[j] += cblas_sdot(N, &A_p[j], M, &A_p[j], M); } - return hpx::make_ready_future(r); + return r; } -vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M) +mutable_tile_data +dot_diag_gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &r, + const int N, + const int M) { - auto A = f_A.get(); - auto B = f_B.get(); - auto r = f_r.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(r.as_span()); + GPRAT_TIME_FUNCTION(&dot_diag_gemm); + auto r_p = r.data(); + auto A_p = A.data(); + auto B_p = 
B.data(); // r = r + diag(A * B) for (std::size_t i = 0; i < static_cast(N); ++i) { - r[i] += cblas_sdot(M, &A[i * static_cast(M)], 1, &B[i], N); + r_p[i] += cblas_sdot(M, &A_p[i * static_cast(M)], 1, &B_p[i], N); } - return hpx::make_ready_future(r); + return r; } // BLAS level 1 operations -vector_future axpy(vector_future f_y, vector_future f_x, const int N) +mutable_tile_data axpy(const mutable_tile_data &y, const const_tile_data &x, const int N) { - auto y = f_y.get(); - auto x = f_x.get(); + GPRAT_BENCHMARK_FORCE_EVICT(y.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(x.as_span()); + GPRAT_TIME_FUNCTION(&axpy); cblas_saxpy(N, -1.0, x.data(), 1, y.data(), 1); - return hpx::make_ready_future(y); + return y; } -float dot(std::vector a, std::vector b, const int N) +float dot(std::span a, std::span b, const int N) { + GPRAT_BENCHMARK_FORCE_EVICT(a); + GPRAT_BENCHMARK_FORCE_EVICT(b); + GPRAT_TIME_FUNCTION(&dot); // DOT: a * b return cblas_sdot(N, a.data(), 1, b.data(), 1); } + +#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS +namespace detail +{ +void register_fp32_performance_counters() +{ + // XXX: you can do this with templates, but it's quite a bit more complicated +#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name, fn_expr) \ + hpx::performance_counters::install_counter_type( \ + name "/time", \ + get_and_reset_function_elapsed, \ + #fn_expr, \ + "", \ + hpx::performance_counters::counter_type::monotonically_increasing); \ + hpx::performance_counters::install_counter_type( \ + name "/calls", \ + get_and_reset_function_calls, \ + #fn_expr, \ + "", \ + hpx::performance_counters::counter_type::monotonically_increasing) + + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/potrf32", &potrf); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsm32", &trsm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/syrk32", &syrk); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemm32", &gemm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsv32", &trsv); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemv32", &gemv); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_syrk32", &dot_diag_syrk); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_gemm32", &dot_diag_gemm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/axpy32", &axpy); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot32", &dot); + +#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR +} +} // namespace detail +#endif + +GPRAT_NS_END diff --git a/core/src/cpu/adapter_cblas_fp64.cpp b/core/src/cpu/adapter_cblas_fp64.cpp index 0c38b3c2..64c94c78 100644 --- a/core/src/cpu/adapter_cblas_fp64.cpp +++ b/core/src/cpu/adapter_cblas_fp64.cpp @@ -1,4 +1,10 @@ -#include "cpu/adapter_cblas_fp64.hpp" +#include "gprat/cpu/adapter_cblas_fp64.hpp" + +#include "gprat/performance_counters.hpp" + +#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS +#include +#endif #ifdef GPRAT_ENABLE_MKL // MKL CBLAS and LAPACKE @@ -9,28 +15,32 @@ #include "lapacke.h" #endif +GPRAT_NS_BEGIN + // BLAS level 3 operations -vector_future potrf(vector_future f_A, const int N) +mutable_tile_data potrf(const mutable_tile_data &A, const int N) { - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_TIME_FUNCTION(&potrf); // POTRF: in-place Cholesky decomposition of A // use dpotrf2 recursive version for better stability LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N); // return factorized matrix L - return hpx::make_ready_future(A); + return A; } -vector_future trsm(vector_future f_L, - vector_future f_A, - const int N, - const int M, - const BLAS_TRANSPOSE transpose_L, - const BLAS_SIDE side_L) - 
+mutable_tile_data +trsm(const const_tile_data &L, + const mutable_tile_data &A, + const int N, + const int M, + const BLAS_TRANSPOSE transpose_L, + const BLAS_SIDE side_L) { - auto L = f_L.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(L.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_TIME_FUNCTION(&trsm); // TRSM constants const double alpha = 1.0; // TRSM: in-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular @@ -48,35 +58,37 @@ vector_future trsm(vector_future f_L, A.data(), M); // return vector - return hpx::make_ready_future(A); + return A; } -vector_future syrk(vector_future f_A, vector_future f_B, const int N) +mutable_tile_data syrk(const mutable_tile_data &A, const const_tile_data &B, const int N) { - auto B = f_B.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_TIME_FUNCTION(&syrk); // SYRK constants const double alpha = -1.0; const double beta = 1.0; // SYRK:A = A - B * B^T cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, N, N, alpha, B.data(), N, beta, A.data(), N); // return updated matrix A - return hpx::make_ready_future(A); + return A; } -vector_future -gemm(vector_future f_A, - vector_future f_B, - vector_future f_C, +mutable_tile_data +gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &C, const int N, const int M, const int K, const BLAS_TRANSPOSE transpose_A, const BLAS_TRANSPOSE transpose_B) { - auto C = f_C.get(); - auto B = f_B.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(C.as_span()); + GPRAT_TIME_FUNCTION(&gemm); // GEMM constants const double alpha = -1.0; const double beta = 1.0; @@ -97,15 +109,17 @@ gemm(vector_future f_A, C.data(), M); // return updated matrix C - return hpx::make_ready_future(C); + return C; } // BLAS level 2 operations -vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L) +mutable_tile_data trsv( + const const_tile_data &L, const mutable_tile_data &a, const int N, const BLAS_TRANSPOSE transpose_L) { - auto L = f_L.get(); - auto a = f_a.get(); + GPRAT_BENCHMARK_FORCE_EVICT(L.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(a.as_span()); + GPRAT_TIME_FUNCTION(&trsv); // TRSV: In-place solve L(^T) * x = a where L lower triangular cblas_dtrsv(CblasRowMajor, CblasLower, @@ -117,20 +131,22 @@ vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS a.data(), 1); // return solution vector x - return hpx::make_ready_future(a); + return a; } -vector_future gemv(vector_future f_A, - vector_future f_a, - vector_future f_b, - const int N, - const int M, - const BLAS_ALPHA alpha, - const BLAS_TRANSPOSE transpose_A) +mutable_tile_data +gemv(const const_tile_data &A, + const const_tile_data &a, + const mutable_tile_data &b, + const int N, + const int M, + const BLAS_ALPHA alpha, + const BLAS_TRANSPOSE transpose_A) { - auto A = f_A.get(); - auto a = f_a.get(); - auto b = f_b.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(a.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(b.as_span()); + GPRAT_TIME_FUNCTION(&gemv); // GEMV constants // const double alpha = -1.0; const double beta = 1.0; @@ -149,47 +165,102 @@ vector_future gemv(vector_future f_A, b.data(), 1); // return updated vector b - return hpx::make_ready_future(b); + return b; } -vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const 
int N, const int M) +mutable_tile_data +dot_diag_syrk(const const_tile_data &A, const mutable_tile_data &r, const int N, const int M) { - auto A = f_A.get(); - auto r = f_r.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(r.as_span()); + GPRAT_TIME_FUNCTION(&dot_diag_syrk); + auto r_p = r.data(); + auto A_p = A.data(); // r = r + diag(A^T * A) for (std::size_t j = 0; j < static_cast(M); ++j) { // Extract the j-th column and compute the dot product with itself - r[j] += cblas_ddot(N, &A[j], M, &A[j], M); + r_p[j] += cblas_ddot(N, &A_p[j], M, &A_p[j], M); } - return hpx::make_ready_future(r); + return r; } -vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M) +mutable_tile_data +dot_diag_gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &r, + const int N, + const int M) { - auto A = f_A.get(); - auto B = f_B.get(); - auto r = f_r.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(r.as_span()); + GPRAT_TIME_FUNCTION(&dot_diag_gemm); + auto r_p = r.data(); + auto A_p = A.data(); + auto B_p = B.data(); // r = r + diag(A * B) for (std::size_t i = 0; i < static_cast(N); ++i) { - r[i] += cblas_ddot(M, &A[i * static_cast(M)], 1, &B[i], N); + r_p[i] += cblas_ddot(M, &A_p[i * static_cast(M)], 1, &B_p[i], N); } - return hpx::make_ready_future(r); + return r; } // BLAS level 1 operations -vector_future axpy(vector_future f_y, vector_future f_x, const int N) +mutable_tile_data axpy(const mutable_tile_data &y, const const_tile_data &x, const int N) { - auto y = f_y.get(); - auto x = f_x.get(); + GPRAT_BENCHMARK_FORCE_EVICT(y.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(x.as_span()); + GPRAT_TIME_FUNCTION(&axpy); cblas_daxpy(N, -1.0, x.data(), 1, y.data(), 1); - return hpx::make_ready_future(y); + return y; } -double dot(std::vector a, std::vector b, const int N) +double dot(std::span a, std::span b, const int N) { + GPRAT_BENCHMARK_FORCE_EVICT(a); + GPRAT_BENCHMARK_FORCE_EVICT(b); + GPRAT_TIME_FUNCTION(&dot); // DOT: a * b return cblas_ddot(N, a.data(), 1, b.data(), 1); } + +#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS +namespace detail +{ +void register_fp64_performance_counters() +{ + // XXX: you can do this with templates, but it's quite a bit more complicated +#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name, fn_expr) \ + hpx::performance_counters::install_counter_type( \ + name "/time", \ + get_and_reset_function_elapsed, \ + #fn_expr, \ + "", \ + hpx::performance_counters::counter_type::monotonically_increasing); \ + hpx::performance_counters::install_counter_type( \ + name "/calls", \ + get_and_reset_function_calls, \ + #fn_expr, \ + "", \ + hpx::performance_counters::counter_type::monotonically_increasing) + + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/potrf64", &potrf); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsm64", &trsm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/syrk64", &syrk); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemm64", &gemm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsv64", &trsv); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemv64", &gemv); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_syrk64", &dot_diag_syrk); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_gemm64", &dot_diag_gemm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/axpy64", &axpy); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot64", &dot); + +#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR +} +} // 
namespace detail +#endif + +GPRAT_NS_END diff --git a/core/src/cpu/gp_algorithms.cpp b/core/src/cpu/gp_algorithms.cpp index 95eb2e2f..ab3ed77b 100644 --- a/core/src/cpu/gp_algorithms.cpp +++ b/core/src/cpu/gp_algorithms.cpp @@ -1,182 +1,182 @@ -#include "cpu/gp_algorithms.hpp" +#include "gprat/cpu/gp_algorithms.hpp" + +#include "gprat/performance_counters.hpp" +#include "gprat/tile_data.hpp" #include +GPRAT_NS_BEGIN + namespace cpu { // Tile generation -double compute_covariance_function(std::size_t i_global, - std::size_t j_global, - std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &i_input, - const std::vector &j_input) +double compute_covariance_function(std::size_t n_regressors, + const SEKParams &sek_params, + std::span i_input, + std::span j_input) { + GPRAT_TIME_FUNCTION(&compute_covariance_function); // k(z_i,z_j) = vertical_lengthscale * exp(-0.5 / lengthscale^2 * (z_i - z_j)^2) double distance = 0.0; - double z_ik_minus_z_jk; - for (std::size_t k = 0; k < n_regressors; k++) { - z_ik_minus_z_jk = i_input[i_global + k] - j_input[j_global + k]; + const double z_ik_minus_z_jk = i_input[k] - j_input[k]; distance += z_ik_minus_z_jk * z_ik_minus_z_jk; } + return sek_params.vertical_lengthscale * exp(-0.5 / (sek_params.lengthscale * sek_params.lengthscale) * distance); } -std::vector gen_tile_covariance( +mutable_tile_data gen_tile_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &input) + const SEKParams &sek_params, + std::span input) { - std::size_t i_global, j_global; - double covariance_function; - // Preallocate required memory - std::vector tile; - tile.reserve(N * N); - // Compute entries + GPRAT_TIME_FUNCTION(&gen_tile_covariance); + mutable_tile_data tile(N * N); for (std::size_t i = 0; i < N; i++) { - i_global = N * row + i; + const std::size_t i_global = N * row + i; for (std::size_t j = 0; j < N; j++) { - j_global = N * col + j; + const std::size_t j_global = N * col + j; + // compute covariance function - covariance_function = - compute_covariance_function(i_global, j_global, n_regressors, sek_params, input, input); + auto covariance_function = compute_covariance_function( + n_regressors, sek_params, input.subspan(i_global, n_regressors), input.subspan(j_global, n_regressors)); if (i_global == j_global) { // noise variance on diagonal covariance_function += sek_params.noise_variance; } - tile.push_back(covariance_function); + + tile.data()[i * N + j] = covariance_function; } } return tile; } -std::vector gen_tile_full_prior_covariance( +mutable_tile_data gen_tile_full_prior_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &input) + const SEKParams &sek_params, + std::span input) { - std::size_t i_global, j_global; - // Preallocate required memory - std::vector tile; - tile.reserve(N * N); - // Compute entries + GPRAT_TIME_FUNCTION(&gen_tile_full_prior_covariance); + mutable_tile_data tile(N * N); for (std::size_t i = 0; i < N; i++) { - i_global = N * row + i; + const std::size_t i_global = N * row + i; for (std::size_t j = 0; j < N; j++) { - j_global = N * col + j; + const std::size_t j_global = N * col + j; // compute covariance function - tile.push_back(compute_covariance_function(i_global, j_global, n_regressors, sek_params, input, input)); + tile.data()[i * N + j] = compute_covariance_function( + n_regressors, sek_params, 
-std::vector<double> gen_tile_full_prior_covariance(
+mutable_tile_data gen_tile_full_prior_covariance(
     std::size_t row,
     std::size_t col,
     std::size_t N,
     std::size_t n_regressors,
-    const gprat_hyper::SEKParams &sek_params,
-    const std::vector<double> &input)
+    const SEKParams &sek_params,
+    std::span<const double> input)
 {
-    std::size_t i_global, j_global;
-    // Preallocate required memory
-    std::vector<double> tile;
-    tile.reserve(N * N);
-    // Compute entries
+    GPRAT_TIME_FUNCTION(&gen_tile_full_prior_covariance);
+    mutable_tile_data tile(N * N);
     for (std::size_t i = 0; i < N; i++)
     {
-        i_global = N * row + i;
+        const std::size_t i_global = N * row + i;
         for (std::size_t j = 0; j < N; j++)
         {
-            j_global = N * col + j;
+            const std::size_t j_global = N * col + j;
             // compute covariance function
-            tile.push_back(compute_covariance_function(i_global, j_global, n_regressors, sek_params, input, input));
+            tile.data()[i * N + j] = compute_covariance_function(
+                n_regressors, sek_params, input.subspan(i_global, n_regressors), input.subspan(j_global, n_regressors));
        }
     }
     return tile;
 }

-std::vector<double> gen_tile_prior_covariance(
+mutable_tile_data gen_tile_prior_covariance(
     std::size_t row,
     std::size_t col,
     std::size_t N,
     std::size_t n_regressors,
-    const gprat_hyper::SEKParams &sek_params,
-    const std::vector<double> &input)
+    const SEKParams &sek_params,
+    std::span<const double> input)
 {
-    std::size_t i_global, j_global;
-    // Preallocate required memory
-    std::vector<double> tile;
-    tile.reserve(N);
-    // Compute entries
+    GPRAT_TIME_FUNCTION(&gen_tile_prior_covariance);
+    mutable_tile_data tile(N);
     for (std::size_t i = 0; i < N; i++)
     {
-        i_global = N * row + i;
-        j_global = N * col + i;
+        const std::size_t i_global = N * row + i;
+        const std::size_t j_global = N * col + i;
         // compute covariance function
-        tile.push_back(compute_covariance_function(i_global, j_global, n_regressors, sek_params, input, input));
+        tile.data()[i] = compute_covariance_function(
+            n_regressors, sek_params, input.subspan(i_global, n_regressors), input.subspan(j_global, n_regressors));
     }
     return tile;
 }

-std::vector<double> gen_tile_cross_covariance(
+mutable_tile_data gen_tile_cross_covariance(
     std::size_t row,
     std::size_t col,
     std::size_t N_row,
     std::size_t N_col,
     std::size_t n_regressors,
-    const gprat_hyper::SEKParams &sek_params,
-    const std::vector<double> &row_input,
-    const std::vector<double> &col_input)
+    const SEKParams &sek_params,
+    std::span<const double> row_input,
+    std::span<const double> col_input)
 {
-    std::size_t i_global, j_global;
-    // Preallocate required memory
-    std::vector<double> tile;
-    tile.reserve(N_row * N_col);
-    // Compute entries
+    GPRAT_TIME_FUNCTION(&gen_tile_cross_covariance);
+    mutable_tile_data tile(N_row * N_col);
     for (std::size_t i = 0; i < N_row; i++)
     {
-        i_global = N_row * row + i;
+        std::size_t i_global = N_row * row + i;
         for (std::size_t j = 0; j < N_col; j++)
         {
-            j_global = N_col * col + j;
+            std::size_t j_global = N_col * col + j;
             // compute covariance function
-            tile.push_back(
-                compute_covariance_function(i_global, j_global, n_regressors, sek_params, row_input, col_input));
+            tile.data()[i * N_col + j] = compute_covariance_function(
+                n_regressors,
+                sek_params,
+                row_input.subspan(i_global, n_regressors),
+                col_input.subspan(j_global, n_regressors));
        }
     }
     return tile;
 }

-std::vector<double> gen_tile_transpose(std::size_t N_row, std::size_t N_col, const std::vector<double> &tile)
+mutable_tile_data gen_tile_transpose(std::size_t N_row, std::size_t N_col, std::span<const double> tile)
 {
-    // Preallocate required memory
-    std::vector<double> transposed;
-    transposed.reserve(N_row * N_col);
+    GPRAT_TIME_FUNCTION(&gen_tile_transpose);
+    mutable_tile_data transposed(N_row * N_col);
     // Transpose entries
     for (std::size_t j = 0; j < N_col; j++)
     {
         for (std::size_t i = 0; i < N_row; ++i)
         {
             // Mapping (i, j) in the original tile to (j, i) in the transposed tile
-            transposed.push_back(tile[i * N_col + j]);
+            transposed.data()[j * N_row + i] = tile[i * N_col + j];
        }
     }
     return transposed;
 }

-std::vector<double> gen_tile_output(std::size_t row, std::size_t N, const std::vector<double> &output)
+mutable_tile_data gen_tile_output(std::size_t row, std::size_t N, std::span<const double> output)
 {
-    // Preallocate required memory
-    std::vector<double> tile;
-    tile.reserve(N);
-    // Copy entries
-    std::copy(output.begin() + static_cast<std::ptrdiff_t>(N * row),
-              output.begin() + static_cast<std::ptrdiff_t>(N * (row + 1)),
-              std::back_inserter(tile));
+    GPRAT_TIME_FUNCTION(&gen_tile_output);
+    mutable_tile_data tile(N);
+    std::copy(output.data() + (N * row), output.data() + (N * (row + 1)), tile.data());
     return tile;
 }

-std::vector<double> gen_tile_zeros(std::size_t N) { return std::vector<double>(N, 0.0); }
+mutable_tile_data gen_tile_zeros(std::size_t N)
+{
+    GPRAT_TIME_FUNCTION(&gen_tile_zeros);
+    mutable_tile_data tile(N);
+    std::fill_n(tile.data(), N, 0.0);
+    return tile;
+}

-std::vector<double> gen_tile_identity(std::size_t N)
+mutable_tile_data gen_tile_identity(std::size_t N)
 {
+    GPRAT_TIME_FUNCTION(&gen_tile_identity);
+    mutable_tile_data tile(N * N);
     // Initialize zero tile
-    std::vector<double> tile(N * N, 0.0);
+    std::fill_n(tile.data(), N * N, 0.0);
     // Fill diagonal with ones
     for (std::size_t i = 0; i < N; i++)
     {
-        tile[i * N + i] = 1.0;
+        tile.data()[i * N + i] = 1.0;
     }
     return tile;
 }
@@ -188,6 +188,7 @@ double compute_error_norm(std::size_t n_tiles,
                           const std::vector<double> &b,
                           const std::vector<std::vector<double>> &tiles)
 {
+    GPRAT_TIME_FUNCTION(&compute_error_norm);
     double error = 0.0;
     for (std::size_t k = 0; k < n_tiles; k++)
     {
@@ -203,3 +204,5 @@ double compute_error_norm(std::size_t n_tiles,
 }

 } // end of namespace cpu
+
+GPRAT_NS_END
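The `mutable_tile_data` / `const_tile_data` types returned by the tile generators above are defined in `gprat/tile_data.hpp`, which is not included in this diff. The sketch below is an assumption inferred purely from the call sites (construction from an element count, `data()`, `as_span()`, `size()`, and passing mutable tiles where `const const_tile_data &` is expected); the real definitions may well differ.

```cpp
#include <cstddef>
#include <memory>
#include <span>

// Sketch: a tile is a shared, fixed-size buffer of doubles. Shared ownership
// lets HPX tasks return tiles by value without copying the underlying storage.
class mutable_tile_data
{
  public:
    explicit mutable_tile_data(std::size_t n) : buf_(new double[n]), size_(n) { }

    double *data() const { return buf_.get(); }  // writable view of the shared buffer
    std::span<double> as_span() const { return { buf_.get(), size_ }; }
    std::size_t size() const { return size_; }

    std::shared_ptr<double[]> storage() const { return buf_; }  // hypothetical hook for const views

  private:
    std::shared_ptr<double[]> buf_;
    std::size_t size_;
};

// Sketch: read-only view that shares ownership, so functions can take
// `const const_tile_data &` for inputs while outputs stay mutable.
class const_tile_data
{
  public:
    const_tile_data(const mutable_tile_data &t) : buf_(t.storage()), size_(t.size()) { }

    const double *data() const { return buf_.get(); }
    std::span<const double> as_span() const { return { buf_.get(), size_ }; }
    std::size_t size() const { return size_; }

  private:
    std::shared_ptr<const double[]> buf_;  // shares ownership, read-only access
    std::size_t size_;
};
```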
diff --git a/core/src/cpu/gp_functions.cpp b/core/src/cpu/gp_functions.cpp
index 92caa275..097f4867 100644
--- a/core/src/cpu/gp_functions.cpp
+++ b/core/src/cpu/gp_functions.cpp
@@ -1,1161 +1,20 @@
-#include "cpu/gp_functions.hpp"
+#include "gprat/cpu/gp_functions.hpp"
+
+#include "gprat/cpu/gp_algorithms.hpp"
+#include "gprat/cpu/gp_optimizer.hpp"
+#include "gprat/cpu/tiled_algorithms.hpp"
+#include "gprat/detail/async_helpers.hpp"

-#include "cpu/gp_algorithms.hpp"
-#include "cpu/gp_optimizer.hpp"
-#include "cpu/tiled_algorithms.hpp"
 #include

-using Tiled_matrix = std::vector<hpx::shared_future<std::vector<double>>>;
-using Tiled_vector = std::vector<hpx::shared_future<std::vector<double>>>;
+GPRAT_NS_BEGIN

 namespace cpu
 {

 ///////////////////////////////////////////////////////////////////////////
 // PREDICT
-std::vector<std::vector<double>>
-cholesky(const std::vector<double> &training_input,
-         const gprat_hyper::SEKParams &sek_params,
-         int n_tiles,
-         int n_tile_size,
-         int n_regressors)
-{
-    std::vector<std::vector<double>> result;
-    // Tiled future data structures
-    Tiled_matrix K_tiles;  // Tiled covariance matrix
-
-    // Preallocate memory
-    result.resize(static_cast<std::size_t>(n_tiles * n_tiles));
-    K_tiles.resize(static_cast<std::size_t>(n_tiles * n_tiles));  // No reserve because of triangular structure
-
-    ///////////////////////////////////////////////////////////////////////////
-    // Launch asynchronous assembly
-    for (std::size_t i = 0; i < static_cast<std::size_t>(n_tiles); i++)
-    {
-        for (std::size_t j = 0; j <= i; j++)
-        {
-            K_tiles[i * static_cast<std::size_t>(n_tiles) + j] = hpx::async(
-                hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"),
-                i,
-                j,
-                n_tile_size,
-                n_regressors,
-                sek_params,
-                training_input);
-        }
-    }
-
-    ///////////////////////////////////////////////////////////////////////////
-    // Launch asynchronous Cholesky decomposition: K = L * L^T
-    right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast<std::size_t>(n_tiles));
-
-    ///////////////////////////////////////////////////////////////////////////
-    // Synchronize
-    for (std::size_t i = 0; i < static_cast<std::size_t>(n_tiles); i++)
-    {
-        for (std::size_t j = 0; j <= i; j++)
-        {
-            result[i * static_cast<std::size_t>(n_tiles) + j] =
-                K_tiles[i * static_cast<std::size_t>(n_tiles) + j].get();
-        }
-    }
-    return result;
-}
-
-std::vector<double>
-predict(const std::vector<double> &training_input,
-        const std::vector<double> &training_output,
-        const std::vector<double> &test_input,
-        const gprat_hyper::SEKParams &sek_params,
-        int n_tiles,
-        int n_tile_size,
-        int m_tiles,
-        int m_tile_size,
-        int n_regressors)
-{
-    /*
-     * Prediction: hat(y)_M = cross(K)_MxN * K^-1_NxN * y_N
-     * - Covariance matrix K_NxN
-     * - Cross-covariance cross(K)_MxN
-     * - Training output y_N
-     * - Prediction output hat(y)_M
-     *
-     * Algorithm:
- * 1: Compute lower triangular part of covariance matrix K - * 2: Compute Cholesky factor L of K - * 3: Compute prediction hat(y): - * - triangular solve L * beta = y - * - triangular solve L^T * alpha = beta - * - compute hat(y) = cross(K) * alpha - */ - - std::vector prediction_result; - // Tiled future data structures - Tiled_matrix K_tiles; // Tiled covariance matrix - Tiled_matrix cross_covariance_tiles; // Tiled cross_covariance matrix - Tiled_vector prediction_tiles; // Tiled solution - Tiled_vector alpha_tiles; // Tiled intermediate solution - - // Preallocate memory - prediction_result.reserve(test_input.size()); - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - alpha_tiles.reserve(static_cast(n_tiles)); - cross_covariance_tiles.reserve(static_cast(m_tiles) * static_cast(n_tiles)); - prediction_tiles.reserve(static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - K_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - } - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_output, "assemble_tiled_alpha"), i, n_tile_size, training_output)); - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - cross_covariance_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_cross_covariance, "assemble_pred"), - i, - j, - m_tile_size, - n_tile_size, - n_regressors, - sek_params, - test_input, - training_input)); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - prediction_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), m_tile_size)); - } - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * (L^T * alpha) = y - forward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - backward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous prediction computation solve: \hat{y} = K_cross_cov * alpha - matrix_vector_tiled( - cross_covariance_tiles, - alpha_tiles, - prediction_tiles, - m_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Synchronize prediction - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = prediction_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(prediction_result)); - } - return prediction_result; -} - -std::vector> predict_with_uncertainty( - const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors) -{ - /* - * Prediction: hat(y) = 
cross(K) * K^-1 * y - * Uncertainty: diag(Sigma) = diag(prior(K)) * diag(cross(K)^T * K^-1 * cross(K)) - * - Covariance matrix K_NxN - * - Cross-covariance cross(K)_MxN - * - Prior covariance prior(K)_MxM - * - Training ouput y_N - * - Prediction output hat(y)_M - * - Posterior covariance matrix Sigma_MxM - * - * Algorithm: - * 1: Compute lower triangular part of covariance matrix K - * 2: Compute Cholesky factor L of K - * 3: Compute prediction hat(y): - * - triangular solve L * beta = y - * - triangular solve L^T * alpha = beta - * - compute hat(y) = cross(K) * alpha - * 4: Compute uncertainty diag(Sigma): - * - triangular solve L * V = cross(K)^T - * - compute diag(W) = diag(V^T * V) - * - compute diag(Sigma) = diag(prior(K)) - diag(W) - */ - - std::vector prediction_result; - std::vector uncertainty_result; - // Tiled future data structures for prediction - Tiled_matrix K_tiles; // Tiled covariance matrix K_NxN - Tiled_matrix cross_covariance_tiles; // Tiled cross_covariance matrix K_NxM - Tiled_vector prediction_tiles; // Tiled solution - Tiled_vector alpha_tiles; // Tiled intermediate solution - // Tiled future data structures for uncertainty - Tiled_matrix t_cross_covariance_tiles; // Tiled transposed cross_covariance matrix K_MxN - Tiled_vector prior_K_tiles; // Tiled prior covariance matrix diagonal diag(K_MxM) - Tiled_vector uncertainty_tiles; // Tiled uncertainty solution - - // Preallocate memory - prediction_result.reserve(test_input.size()); - uncertainty_result.reserve(test_input.size()); - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - cross_covariance_tiles.reserve(static_cast(m_tiles) * static_cast(n_tiles)); - prediction_tiles.reserve(static_cast(m_tiles)); - alpha_tiles.reserve(static_cast(n_tiles)); - - t_cross_covariance_tiles.reserve(static_cast(n_tiles) * static_cast(m_tiles)); - prior_K_tiles.reserve(static_cast(m_tiles)); - uncertainty_tiles.reserve(static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - K_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - } - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_output, "assemble_tiled_alpha"), i, n_tile_size, training_output)); - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - cross_covariance_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_cross_covariance, "assemble_pred"), - i, - j, - m_tile_size, - n_tile_size, - n_regressors, - sek_params, - test_input, - training_input)); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - prediction_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), m_tile_size)); - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - prior_K_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_prior_covariance, "assemble_tiled"), - i, - i, - m_tile_size, - n_regressors, - sek_params, - test_input)); - } - - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - 
t_cross_covariance_tiles.push_back(hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_pred"), - m_tile_size, - n_tile_size, - cross_covariance_tiles[i * static_cast(n_tiles) + j])); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - uncertainty_tiles.push_back( - hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_prior_inter"), m_tile_size)); - } - - // Prediction - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * (L^T * alpha) = y - forward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - backward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous prediction computation solve: hat(y) = cross(K) * alpha - matrix_vector_tiled( - cross_covariance_tiles, - alpha_tiles, - prediction_tiles, - m_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * V = cross(K)^T - forward_solve_tiled_matrix( - K_tiles, - t_cross_covariance_tiles, - n_tile_size, - m_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous computation diag(W) = diag(V^T * V) - symmetric_matrix_matrix_diagonal_tiled( - t_cross_covariance_tiles, - uncertainty_tiles, - n_tile_size, - m_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous computation diag(Sigma) = diag(prior(K)) - diag(W) - vector_difference_tiled(prior_K_tiles, uncertainty_tiles, m_tile_size, static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Synchronize prediction - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = prediction_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(prediction_result)); - } - - // Synchronize uncertainty - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = uncertainty_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(uncertainty_result)); - } - - return std::vector>{ std::move(prediction_result), std::move(uncertainty_result) }; -} - -std::vector> predict_with_full_cov( - const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors) -{ - /* - * Prediction: hat(y)_M = cross(K) * K^-1 * y - * Full covariance: Sigma = prior(K) - cross(K)^T * K^-1 * cross(K) - * - Covariance matrix K_NxN - * - Cross-covariance cross(K)_MxN - * - Prior covariance prior(K)_MxM - * - Training ouput y_N - * - Prediction output hat(y)_M - * - Posterior covariance matrix Sigma_MxM - * - * Algorithm: - * 1: Compute lower triangular part of covariance matrix K - * 2: Compute Cholesky factor L of K - * 3: Compute prediction hat(y): - * - triangular solve L * beta = y - * - triangular solve L^T 
* alpha = beta - * - compute hat(y) = cross(K) * alpha - * 4: Compute full covariance matrix Sigma: - * - triangular solve L * V = cross(K)^T - * - compute W = V^T * V - * - compute Sigma = prior(K) - W - * 5: Compute diag(Sigma) - */ - - std::vector prediction_result; - std::vector uncertainty_result; - // Tiled future data structures for prediction - Tiled_matrix K_tiles; // Tiled covariance matrix K_NxN - Tiled_matrix cross_covariance_tiles; // Tiled cross_covariance matrix K_NxM - Tiled_vector prediction_tiles; // Tiled solution - Tiled_vector alpha_tiles; // Tiled intermediate solution - // Tiled future data structures for uncertainty - Tiled_matrix t_cross_covariance_tiles; // Tiled transposed cross_covariance matrix K_MxN - Tiled_matrix prior_K_tiles; // Tiled prior covariance matrix K_MxM - Tiled_vector uncertainty_tiles; // Tiled uncertainty solution - - // Preallocate memory - prediction_result.reserve(test_input.size()); - uncertainty_result.reserve(test_input.size()); - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - cross_covariance_tiles.reserve(static_cast(m_tiles) * static_cast(n_tiles)); - prediction_tiles.reserve(static_cast(m_tiles)); - alpha_tiles.reserve(static_cast(n_tiles)); - - t_cross_covariance_tiles.reserve(static_cast(n_tiles) * static_cast(m_tiles)); - prior_K_tiles.resize(static_cast(m_tiles * m_tiles)); - uncertainty_tiles.reserve(static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - K_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - } - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_output, "assemble_tiled_alpha"), i, n_tile_size, training_output)); - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - cross_covariance_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_cross_covariance, "assemble_pred"), - i, - j, - m_tile_size, - n_tile_size, - n_regressors, - sek_params, - test_input, - training_input)); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - prediction_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), m_tile_size)); - } - - // Assemble prior covariance matrix vector - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - prior_K_tiles[i * static_cast(m_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_full_prior_covariance, "assemble_prior_tiled"), - i, - j, - m_tile_size, - n_regressors, - sek_params, - test_input); - - if (i != j) - { - prior_K_tiles[j * static_cast(m_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_prior_tiled"), - m_tile_size, - m_tile_size, - prior_K_tiles[i * static_cast(m_tiles) + j]); - } - } - } - - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - t_cross_covariance_tiles.push_back(hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_pred"), - m_tile_size, - n_tile_size, - 
cross_covariance_tiles[i * static_cast(n_tiles) + j])); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - uncertainty_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), m_tile_size)); - } - - // Prediction - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * (L^T * alpha) = y - forward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - backward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous prediction computation solve: hat(y) = K_cross_cov * alpha - matrix_vector_tiled( - cross_covariance_tiles, - alpha_tiles, - prediction_tiles, - m_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * V = cross(K)^T - forward_solve_tiled_matrix( - K_tiles, - t_cross_covariance_tiles, - n_tile_size, - m_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous computation of full covariance Sigma = prior(K) - V^T * V - symmetric_matrix_matrix_tiled( - t_cross_covariance_tiles, - prior_K_tiles, - n_tile_size, - m_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous computation of uncertainty diag(Sigma) - matrix_diagonal_tiled(prior_K_tiles, uncertainty_tiles, m_tile_size, static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Synchronize prediction - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = prediction_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(prediction_result)); - } - - // Synchronize uncertainty - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = uncertainty_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(uncertainty_result)); - } - - return std::vector>{ std::move(prediction_result), std::move(uncertainty_result) }; -} - -/////////////////////////////////////////////////////////////////////////// -// OPTIMIZATION -double compute_loss(const std::vector &training_input, - const std::vector &training_output, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int n_regressors) -{ - /* - * Negative log likelihood loss: - * loss(theta) = 0.5 * ( log(det(K)) - y^T * K^-1 * y - N * log(2 * pi) ) - * - Covariance matrix K(theta)_NxN - * - Training ouput y_N - * - Hyperparameters theta ={ v, l, v_n } - * - * Algorithm: - * 1: Compute lower triangular part of covariance matrix K - * 2: Compute Cholesky factor L of K - * 3: Compute prediction alpha = K^-1 * y: - * - triangular solve L * beta = y - * - triangular solve L^T * alpha = beta - * 5: Compute beta = K^-1 * y - * 6: Compute negative log likelihood loss - * - Calculate sum_i^N log(L_ii^2) - * - Calculate y^T * beta - * - Add constant N * log (2 * pi) - */ - - hpx::shared_future loss_value; - // Tiled future data structures - 
Tiled_matrix K_tiles; // Tiled covariance matrix K_NxN - Tiled_vector y_tiles; // Tiled output - Tiled_vector alpha_tiles; // Tiled intermediate solution - - // Preallocate memory - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - y_tiles.reserve(static_cast(n_tiles)); - alpha_tiles.reserve(static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - K_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - } - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - y_tiles.push_back( - hpx::async(hpx::annotated_function(gen_tile_output, "assemble_tiled_y"), i, n_tile_size, training_output)); - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_output, "assemble_tiled_alpha"), i, n_tile_size, training_output)); - } - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * (L^T * alpha) = y - forward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - backward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous loss computation - compute_loss_tiled(K_tiles, alpha_tiles, y_tiles, loss_value, n_tile_size, static_cast(n_tiles)); - - return loss_value.get(); -} - -std::vector -optimize(const std::vector &training_input, - const std::vector &training_output, - int n_tiles, - int n_tile_size, - int n_regressors, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - std::vector trainable_params) -{ - /* - * - Hyperparameters theta={v, l, v_n} - * - Covariance matrix K(theta) - * - Training ouput y - * - * Algorithm: - * for opt_iter: - * 1: Compute distance for entries of covariance matrix K - * 2: Compute lower triangular part of K with distance - * 3: Compute lower triangular gradients for delta(K)/delta(v), and delta(K)/delta(l) with distance - * - * 4: Compute Cholesky factor L of K - * 5: Compute K^-1: - * - triangular solve L * {} = I - * - triangular solve L^T * K^-1 = {} - * 6: Compute beta = K^-1 * y - * - * 7: Compute negative log likelihood loss - * - Calculate 0.5 sum_i^N log(L_ii^2) - * - Calculate 0.5 y^T * beta - * - Add constant N / 2 * log (2 * pi) - * - * 8: Compute delta(loss)/delta(param_i) - * - Compute trace(K^-1 * delta(K)/delta(theta_i)) - * - Compute beta^T * delta(K)/delta(theta_i) * beta - * 9: Update hyperparameters theta with Adam optimizer - * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T - * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 - * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) - * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) - * endfor - */ - - // data holder for loss - hpx::shared_future loss_value; - // data holder for computed loss values - std::vector losses; - - // Tiled future data structures - Tiled_matrix K_tiles; // Tiled covariance 
matrix K_NxN - Tiled_vector y_tiles; // Tiled output - Tiled_vector alpha_tiles; // Tiled intermediate solution - Tiled_matrix K_inv_tiles; // Tiled inversed covariance matrix K^-1_NxN - // Tiled future data structures for gradients - Tiled_matrix grad_v_tiles; // Tiled covariance with gradient v - Tiled_matrix grad_l_tiles; // Tiled covariance with gradient l - - // Preallocate memory - losses.reserve(static_cast(adam_params.opt_iter)); - y_tiles.reserve(static_cast(n_tiles)); - - alpha_tiles.resize(static_cast(n_tiles)); // for now resize since reset in loop - K_inv_tiles.resize(static_cast(n_tiles * n_tiles)); // for now resize since reset in loop - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - grad_v_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - grad_l_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly of output y - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - y_tiles.push_back( - hpx::async(hpx::annotated_function(gen_tile_output, "assemble_y"), i, n_tile_size, training_output)); - } - - ////////////////////////////////////////////////////////////////////////////// - // Perform optimization - for (std::size_t iter = 0; iter < static_cast(adam_params.opt_iter); iter++) - { - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly of tiled covariance matrix, derivative of covariance matrix - // vector w.r.t. to vertical lengthscale and derivative of covariance - // matrix vector w.r.t. to lengthscale - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - // Compute the distance (z_i - z_j) of K entries to reuse - hpx::shared_future> cov_dists = hpx::async( - hpx::annotated_function(gen_tile_distance, "assemble_cov_dist"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - - K_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_covariance_with_distance), "assemble_K"), - i, - j, - n_tile_size, - sek_params, - cov_dists); - if (trainable_params[0]) - { - grad_l_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_grad_l), "assemble_gradl"), - n_tile_size, - sek_params, - cov_dists); - if (i != j) - { - grad_l_tiles[j * static_cast(n_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_gradl_t"), - n_tile_size, - n_tile_size, - grad_l_tiles[i * static_cast(n_tiles) + j]); - } - } - - if (trainable_params[1]) - { - grad_v_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_grad_v), "assemble_gradv"), - n_tile_size, - sek_params, - cov_dists); - if (i != j) - { - grad_v_tiles[j * static_cast(n_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_gradv_t"), - n_tile_size, - n_tile_size, - grad_v_tiles[i * static_cast(n_tiles) + j]); - } - } - } - } - - // Assembly with reallocation -> optimize to only set existing values - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles[i] = hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), n_tile_size); - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - 
{ - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - if (i == j) - { - K_inv_tiles[i * static_cast(n_tiles) + j] = - hpx::async(hpx::annotated_function(gen_tile_identity, "assemble_identity_matrix"), n_tile_size); - } - else - { - K_inv_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_zeros, "assemble_identity_matrix"), n_tile_size * n_tile_size); - } - } - } - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous compute K^-1 through L* (L^T * X) = I - forward_solve_tiled_matrix( - K_tiles, - K_inv_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - backward_solve_tiled_matrix( - K_tiles, - K_inv_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous compute beta = inv(K) * y - matrix_vector_tiled( - K_inv_tiles, - y_tiles, - alpha_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous loss computation where - // loss(theta) = 0.5 * ( log(det(K)) - y^T * K^-1 * y - N * log(2 * pi) ) - compute_loss_tiled(K_tiles, alpha_tiles, y_tiles, loss_value, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous update of the hyperparameters - if (trainable_params[0]) - { // lengthscale - update_hyperparameter_tiled( - K_inv_tiles, - grad_l_tiles, - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - iter, - 0); - } - if (trainable_params[1]) - { // vertical_lengthscale - update_hyperparameter_tiled( - K_inv_tiles, - grad_v_tiles, - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - iter, - 1); - } - if (trainable_params[2]) - { // noise_variance - update_hyperparameter_tiled( - K_inv_tiles, - Tiled_matrix{}, // no tiled gradient matrix required - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - iter, - 2); - } - // Synchronize after iteration - losses.push_back(loss_value.get()); - } - // Return losses - return losses; -} - -double optimize_step(const std::vector &training_input, - const std::vector &training_output, - int n_tiles, - int n_tile_size, - int n_regressors, - gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - std::vector trainable_params, - int iter) -{ - /* - * - Hyperparameters theta={v, l, v_n} - * - Covariance matrix K(theta) - * - Training ouput y - * - * Algorithm: - * 1: Compute distance for entries of covariance matrix K - * 2: Compute lower triangular part of K with distance - * 3: Compute lower triangular gradients for delta(K)/delta(v), and delta(K)/delta(l) with distance - * - * 4: Compute Cholesky factor L of K - * 5: Compute K^-1: - * - triangular solve L * {} = I - * - triangular solve L^T * K^-1 = {} - * 6: Compute beta = K^-1 * y - * - * 7: Compute negative log likelihood loss - * - Calculate 0.5 sum_i^N log(L_ii^2) - * - Calculate 0.5 y^T * beta - * - Add constant N / 2 * log (2 * pi) - * - * 8: Compute delta(loss)/delta(param_i) - * - Compute trace(K^-1 
* delta(K)/delta(theta_i)) - * - Compute beta^T * delta(K)/delta(theta_i) * beta - * 9: Update hyperparameters theta with Adam optimizer - * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T - * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 - * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) - * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) - */ - - // data holder for loss - hpx::shared_future loss_value; - - // Tiled future data structures - Tiled_matrix K_tiles; // Tiled covariance matrix K_NxN - Tiled_vector y_tiles; // Tiled output - Tiled_vector alpha_tiles; // Tiled intermediate solution - Tiled_matrix K_inv_tiles; // Tiled inversed covariance matrix K^-1_NxN - // Tiled future data structures for gradients - Tiled_matrix grad_v_tiles; // Tiled covariance with gradient v - Tiled_matrix grad_l_tiles; // Tiled covariance with gradient l - - // Preallocate memory - y_tiles.reserve(static_cast(n_tiles)); - - alpha_tiles.resize(static_cast(n_tiles)); // for now resize since reset in loop - K_inv_tiles.resize(static_cast(n_tiles * n_tiles)); // for now resize since reset in loop - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - grad_v_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - grad_l_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly of output y - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - y_tiles.push_back( - hpx::async(hpx::annotated_function(gen_tile_output, "assemble_y"), i, n_tile_size, training_output)); - } - - ////////////////////////////////////////////////////////////////////////////// - // Perform one optimization step - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly of tiled covariance matrix, derivative of covariance matrix - // vector w.r.t. to vertical lengthscale and derivative of covariance - // matrix vector w.r.t. 
to lengthscale - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - // Compute the distance (z_i - z_j) of K entries to reuse - hpx::shared_future> cov_dists = hpx::async( - hpx::annotated_function(gen_tile_distance, "assemble_cov_dist"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - - K_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_covariance_with_distance), "assemble_K"), - i, - j, - n_tile_size, - sek_params, - cov_dists); - - if (trainable_params[0]) - { - grad_l_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_grad_l), "assemble_gradl"), - n_tile_size, - sek_params, - cov_dists); - if (i != j) - { - grad_l_tiles[j * static_cast(n_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_gradl_t"), - n_tile_size, - n_tile_size, - grad_l_tiles[i * static_cast(n_tiles) + j]); - } - } - - if (trainable_params[1]) - { - grad_v_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_grad_v), "assemble_gradv"), - n_tile_size, - sek_params, - cov_dists); - if (i != j) - { - grad_v_tiles[j * static_cast(n_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_gradv_t"), - n_tile_size, - n_tile_size, - grad_v_tiles[i * static_cast(n_tiles) + j]); - } - } - } - } - - // Assembly with reallocation -> optimize to only set existing values - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles[i] = hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), n_tile_size); - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - if (i == j) - { - K_inv_tiles[i * static_cast(n_tiles) + j] = - hpx::async(hpx::annotated_function(gen_tile_identity, "assemble_identity_matrix"), n_tile_size); - } - else - { - K_inv_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_zeros, "assemble_identity_matrix"), n_tile_size * n_tile_size); - } - } - } - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous compute K^-1 through L* (L^T * X) = I - forward_solve_tiled_matrix( - K_tiles, - K_inv_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - backward_solve_tiled_matrix( - K_tiles, - K_inv_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous compute beta = inv(K) * y - matrix_vector_tiled( - K_inv_tiles, - y_tiles, - alpha_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous loss computation where - // loss(theta) = 0.5 * ( log(det(K)) - y^T * K^-1 * y - N * log(2 * pi) ) - compute_loss_tiled(K_tiles, alpha_tiles, y_tiles, loss_value, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous update of 
the hyperparameters - if (trainable_params[0]) - { // lengthscale - update_hyperparameter_tiled( - K_inv_tiles, - grad_l_tiles, - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - static_cast(iter), - 0); - } - if (trainable_params[1]) - { // vertical_lengthscale - update_hyperparameter_tiled( - K_inv_tiles, - grad_v_tiles, - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - static_cast(iter), - 1); - } - if (trainable_params[2]) - { // noise_variance - update_hyperparameter_tiled( - K_inv_tiles, - Tiled_matrix{}, // no tiled gradient matrix required - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - static_cast(iter), - 2); - } - return loss_value.get(); -} } // end of namespace cpu + +GPRAT_NS_END diff --git a/core/src/cpu/gp_optimizer.cpp b/core/src/cpu/gp_optimizer.cpp index f9c5d500..7c1c76f7 100644 --- a/core/src/cpu/gp_optimizer.cpp +++ b/core/src/cpu/gp_optimizer.cpp @@ -1,8 +1,13 @@ -#include "cpu/gp_optimizer.hpp" +#include "gprat/cpu/gp_optimizer.hpp" -#include "cpu/adapter_cblas_fp64.hpp" +#include "gprat/cpu/adapter_cblas_fp64.hpp" + +#include +#include #include +GPRAT_NS_BEGIN + namespace cpu { @@ -39,110 +44,103 @@ double compute_sigmoid(double parameter) { return 1.0 / (1.0 + exp(-parameter)); double compute_covariance_distance(std::size_t i_global, std::size_t j_global, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, const std::vector &i_input, const std::vector &j_input) { // -0.5*lengthscale^2*(z_i-z_j)^2 double distance = 0.0; - double z_ik_minus_z_jk; - for (std::size_t k = 0; k < n_regressors; k++) { - z_ik_minus_z_jk = i_input[i_global + k] - j_input[j_global + k]; + const double z_ik_minus_z_jk = i_input[i_global + k] - j_input[j_global + k]; distance += z_ik_minus_z_jk * z_ik_minus_z_jk; } return -0.5 / (sek_params.lengthscale * sek_params.lengthscale) * distance; } -std::vector gen_tile_distance( +mutable_tile_data gen_tile_distance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, const std::vector &input) { - std::size_t i_global, j_global; // Preallocate memory - std::vector tile; - tile.reserve(N * N); + mutable_tile_data tile(N * N); for (std::size_t i = 0; i < N; i++) { - i_global = N * row + i; + const std::size_t i_global = N * row + i; for (std::size_t j = 0; j < N; j++) { - j_global = N * col + j; + const std::size_t j_global = N * col + j; // compute covariance function - tile.push_back(compute_covariance_distance(i_global, j_global, n_regressors, sek_params, input, input)); + tile.data()[i * N + j] = + compute_covariance_distance(i_global, j_global, n_regressors, sek_params, input, input); } } return tile; } -std::vector gen_tile_covariance_with_distance( +mutable_tile_data gen_tile_covariance_with_distance( std::size_t row, std::size_t col, std::size_t N, - const gprat_hyper::SEKParams &sek_params, - const std::vector &distance) + const SEKParams &sek_params, + const const_tile_data &distance) { - std::size_t i_global, j_global; - double covariance; // Preallocate required memory - std::vector tile; - tile.reserve(N * N); + mutable_tile_data tile(N * N); for (std::size_t i = 0; i < N; i++) { - i_global = N * row + i; + const std::size_t i_global = N * row + i; for (std::size_t j = 0; j < N; j++) { - j_global = N * col + j; + const std::size_t j_global = N * col + j; // compute covariance function - 
covariance = sek_params.vertical_lengthscale * exp(distance[i * N + j]); + double covariance = sek_params.vertical_lengthscale * exp(distance.data()[i * N + j]); if (i_global == j_global) { // noise variance on diagonal covariance += sek_params.noise_variance; } - tile.push_back(covariance); + tile.data()[i * N + j] = covariance; } } return tile; } -std::vector -gen_tile_grad_v(std::size_t N, const gprat_hyper::SEKParams &sek_params, const std::vector &distance) +mutable_tile_data +gen_tile_grad_v(std::size_t N, const SEKParams &sek_params, const const_tile_data &distance) { // Preallocate required memory - std::vector tile; - tile.reserve(N * N); + mutable_tile_data tile(N * N); double hyperparam_der = compute_sigmoid(to_unconstrained(sek_params.vertical_lengthscale, false)); for (std::size_t i = 0; i < N; i++) { for (std::size_t j = 0; j < N; j++) { // compute derivative - tile.push_back(exp(distance[i * N + j]) * hyperparam_der); + tile.data()[i * N + j] = exp(distance.data()[i * N + j]) * hyperparam_der; } } return tile; } -std::vector -gen_tile_grad_l(std::size_t N, const gprat_hyper::SEKParams &sek_params, const std::vector &distance) +mutable_tile_data +gen_tile_grad_l(std::size_t N, const SEKParams &sek_params, const const_tile_data &distance) { // Preallocate required memory - std::vector tile; - tile.reserve(N * N); - double hyperparam_der = compute_sigmoid(to_unconstrained(sek_params.lengthscale, false)); - double factor = -2.0 * sek_params.vertical_lengthscale / sek_params.lengthscale; + mutable_tile_data tile(N * N); + const double hyperparam_der = compute_sigmoid(to_unconstrained(sek_params.lengthscale, false)); + const double factor = -2.0 * sek_params.vertical_lengthscale / sek_params.lengthscale; for (std::size_t i = 0; i < N; i++) { for (std::size_t j = 0; j < N; j++) { // compute derivative - tile.push_back(factor * distance[i * N + j] * exp(distance[i * N + j]) * hyperparam_der); + tile.data()[i * N + j] = + factor * distance.data()[i * N + j] * exp(distance.data()[i * N + j]) * hyperparam_der; } } return tile; @@ -160,11 +158,8 @@ double update_second_moment(double gradient, double v_T, double beta_2) return beta_2 * v_T + (1.0 - beta_2) * gradient * gradient; } -double adam_step(const double unconstrained_hyperparam, - const gprat_hyper::AdamParams &adam_params, - double m_T, - double v_T, - std::size_t iter) +double adam_step( + const double unconstrained_hyperparam, const AdamParams &adam_params, double m_T, double v_T, std::size_t iter) { // Compute decay rate double beta1_T = pow(adam_params.beta1, static_cast(iter + 1)); @@ -183,9 +178,9 @@ double adam_step(const double unconstrained_hyperparam, ///////////////////////////////////////////////////////////////////////// // Loss -double compute_loss(const std::vector &K_diag_tile, - const std::vector &alpha_tile, - const std::vector &y_tile, +double compute_loss(std::span K_diag_tile, + std::span alpha_tile, + std::span y_tile, std::size_t N) { // l = y^T * alpha + \sum_i^N log(L_ii^2) @@ -201,7 +196,7 @@ double compute_loss(const std::vector &K_diag_tile, return l; } -double add_losses(const std::vector &losses, std::size_t N, std::size_t n_tiles) +double add_losses(std::span losses, std::size_t N, std::size_t n_tiles) { // 0.5 * \sum losses + const double l = 0.0; @@ -212,7 +207,7 @@ double add_losses(const std::vector &losses, std::size_t N, std::size_t l += losses[i]; } - l += Nn * log(2.0 * M_PI); + l += Nn * log(2.0 * std::numbers::pi); return 0.5 * l / Nn; // why /Nn? 
 }
@@ -223,17 +218,17 @@ double compute_gradient(double trace, double dot, std::size_t N, std::size_t n_tiles)
     return 0.5 / static_cast<double>(N * n_tiles) * (trace - dot);
 }

-double compute_trace(const std::vector<double> &diagonal, double trace)
+double compute_trace(std::span<const double> diagonal, double trace)
 {
     return trace + std::reduce(diagonal.begin(), diagonal.end());
 }

-double compute_dot(const std::vector<double> &vector_T, const std::vector<double> &vector, double result)
+double compute_dot(std::span<const double> vector_T, std::span<const double> vector, double result)
 {
     return result + dot(vector_T, vector, static_cast<int>(vector.size()));
 }

-double compute_trace_diag(const std::vector<double> &tile, double trace, std::size_t N)
+double compute_trace_diag(std::span<const double> tile, double trace, std::size_t N)
 {
     double local_trace = 0.0;
     for (std::size_t i = 0; i < N; ++i)
@@ -244,3 +239,5 @@ double compute_trace_diag(std::span<const double> tile, double trace, std::size_t N)
 }

 } // end of namespace cpu
+
+GPRAT_NS_END
diff --git a/core/src/cpu/gp_uncertainty.cpp b/core/src/cpu/gp_uncertainty.cpp
index 3ea6a7a9..5f03366f 100644
--- a/core/src/cpu/gp_uncertainty.cpp
+++ b/core/src/cpu/gp_uncertainty.cpp
@@ -1,21 +1,22 @@
-#include "cpu/gp_uncertainty.hpp"
+#include "gprat/cpu/gp_uncertainty.hpp"
+
+#include "gprat/tile_data.hpp"
+
+GPRAT_NS_BEGIN

 namespace cpu
 {

-hpx::shared_future<std::vector<double>> get_matrix_diagonal(hpx::shared_future<std::vector<double>> f_A, std::size_t M)
+mutable_tile_data get_matrix_diagonal(const const_tile_data &A, std::size_t M)
 {
-    auto A = f_A.get();
-    // Preallocate memory
-    std::vector<double> tile;
-    tile.reserve(M);
-    // Add elements
+    mutable_tile_data tile(M);
     for (std::size_t i = 0; i < M; ++i)
     {
-        tile.push_back(A[i * M + i]);
+        tile.data()[i] = A.data()[i * M + i];
     }
-
-    return hpx::make_ready_future(std::move(tile));
+    return tile;
 }

 } // end of namespace cpu
+
+GPRAT_NS_END
diff --git a/core/src/cpu/tiled_algorithms.cpp b/core/src/cpu/tiled_algorithms.cpp
index 5c5b2573..d035b89d 100644
--- a/core/src/cpu/tiled_algorithms.cpp
+++ b/core/src/cpu/tiled_algorithms.cpp
@@ -1,429 +1,31 @@
-#include "cpu/tiled_algorithms.hpp"
+#include "gprat/cpu/tiled_algorithms.hpp"

-#include "cpu/adapter_cblas_fp64.hpp"
-#include "cpu/gp_algorithms.hpp"
-#include "cpu/gp_optimizer.hpp"
-#include "cpu/gp_uncertainty.hpp"
-#include
+#include "gprat/cpu/adapter_cblas_fp64.hpp"
+#include "gprat/cpu/gp_algorithms.hpp"
+#include "gprat/cpu/gp_optimizer.hpp"

-namespace cpu
-{
-
-// Tiled Cholesky Algorithm
-
-void right_looking_cholesky_tiled(Tiled_matrix &ft_tiles, int N, std::size_t n_tiles)
-{
-    for (std::size_t k = 0; k < n_tiles; k++)
-    {
-        // POTRF: Compute Cholesky factor L
-        ft_tiles[k * n_tiles + k] =
-            hpx::dataflow(hpx::annotated_function(potrf, "cholesky_tiled"), ft_tiles[k * n_tiles + k], N);
-        for (std::size_t m = k + 1; m < n_tiles; m++)
-        {
-            // TRSM: Solve X * L^T = A
-            ft_tiles[m * n_tiles + k] = hpx::dataflow(
-                hpx::annotated_function(trsm, "cholesky_tiled"),
-                ft_tiles[k * n_tiles + k],
-                ft_tiles[m * n_tiles + k],
-                N,
-                N,
-                Blas_trans,
-                Blas_right);
-        }
-        for (std::size_t m = k + 1; m < n_tiles; m++)
-        {
-            // SYRK: A = A - B * B^T
-            ft_tiles[m * n_tiles + m] = hpx::dataflow(
-                hpx::annotated_function(syrk, "cholesky_tiled"),
-                ft_tiles[m * n_tiles + m],
-                ft_tiles[m * n_tiles + k],
-                N);
-            for (std::size_t n = k + 1; n < m; n++)
-            {
-                // GEMM: C = C - A * B^T
-                ft_tiles[m * n_tiles + n] = hpx::dataflow(
-                    hpx::annotated_function(gemm, "cholesky_tiled"),
-                    ft_tiles[m * n_tiles + k],
-                    ft_tiles[n * n_tiles + k],
-                    ft_tiles[m * n_tiles + n],
-                    N,
-                    N,
-                    N,
-                    Blas_no_trans,
-                    Blas_trans);
-            }
-
} - } -} - -// Tiled Triangular Solve Algorithms - -void forward_solve_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_rhs, int N, std::size_t n_tiles) -{ - for (std::size_t k = 0; k < n_tiles; k++) - { - // TRSM: Solve L * x = a - ft_rhs[k] = hpx::dataflow( - hpx::annotated_function(trsv, "triangular_solve_tiled"), - ft_tiles[k * n_tiles + k], - ft_rhs[k], - N, - Blas_no_trans); - for (std::size_t m = k + 1; m < n_tiles; m++) - { - // GEMV: b = b - A * a - ft_rhs[m] = hpx::dataflow( - hpx::annotated_function(gemv, "triangular_solve_tiled"), - ft_tiles[m * n_tiles + k], - ft_rhs[k], - ft_rhs[m], - N, - N, - Blas_substract, - Blas_no_trans); - } - } -} - -void backward_solve_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_rhs, int N, std::size_t n_tiles) -{ - for (int k_ = static_cast(n_tiles) - 1; k_ >= 0; k_--) // int instead of std::size_t for last comparison - { - std::size_t k = static_cast(k_); - // TRSM: Solve L^T * x = a - ft_rhs[k] = hpx::dataflow( - hpx::annotated_function(trsv, "triangular_solve_tiled"), - ft_tiles[k * n_tiles + k], - ft_rhs[k], - N, - Blas_trans); - for (int m_ = k_ - 1; m_ >= 0; m_--) // int instead of std::size_t for last comparison - { - std::size_t m = static_cast(m_); - // GEMV:b = b - A^T * a - ft_rhs[m] = hpx::dataflow( - hpx::annotated_function(gemv, "triangular_solve_tiled"), - ft_tiles[k * n_tiles + m], - ft_rhs[k], - ft_rhs[m], - N, - N, - Blas_substract, - Blas_trans); - } - } -} - -void forward_solve_tiled_matrix( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_rhs, int N, int M, std::size_t n_tiles, std::size_t m_tiles) -{ - for (std::size_t c = 0; c < m_tiles; c++) - { - for (std::size_t k = 0; k < n_tiles; k++) - { - // TRSM: solve L * X = A - ft_rhs[k * m_tiles + c] = hpx::dataflow( - hpx::annotated_function(trsm, "triangular_solve_tiled_matrix"), - ft_tiles[k * n_tiles + k], - ft_rhs[k * m_tiles + c], - N, - M, - Blas_no_trans, - Blas_left); - for (std::size_t m = k + 1; m < n_tiles; m++) - { - // GEMM: C = C - A * B - ft_rhs[m * m_tiles + c] = hpx::dataflow( - hpx::annotated_function(gemm, "triangular_solve_tiled_matrix"), - ft_tiles[m * n_tiles + k], - ft_rhs[k * m_tiles + c], - ft_rhs[m * m_tiles + c], - N, - M, - N, - Blas_no_trans, - Blas_no_trans); - } - } - } -} - -void backward_solve_tiled_matrix( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_rhs, int N, int M, std::size_t n_tiles, std::size_t m_tiles) -{ - for (std::size_t c = 0; c < m_tiles; c++) - { - for (int k_ = static_cast(n_tiles) - 1; k_ >= 0; k_--) // int instead of std::size_t for last comparison - { - std::size_t k = static_cast(k_); - // TRSM: solve L^T * X = A - ft_rhs[k * m_tiles + c] = hpx::dataflow( - hpx::annotated_function(trsm, "triangular_solve_tiled_matrix"), - ft_tiles[k * n_tiles + k], - ft_rhs[k * m_tiles + c], - N, - M, - Blas_trans, - Blas_left); - for (int m_ = k_ - 1; m_ >= 0; m_--) // int instead of std::size_t for last comparison - { - std::size_t m = static_cast(m_); - // GEMM: C = C - A^T * B - ft_rhs[m * m_tiles + c] = hpx::dataflow( - hpx::annotated_function(gemm, "triangular_solve_tiled_matrix"), - ft_tiles[k * n_tiles + m], - ft_rhs[k * m_tiles + c], - ft_rhs[m * m_tiles + c], - N, - M, - N, - Blas_trans, - Blas_no_trans); - } - } - } -} - -void matrix_vector_tiled(Tiled_matrix &ft_tiles, - Tiled_vector &ft_vector, - Tiled_vector &ft_rhs, - int N_row, - int N_col, - std::size_t n_tiles, - std::size_t m_tiles) -{ - for (std::size_t k = 0; k < m_tiles; k++) - { - for (std::size_t m = 0; m < n_tiles; m++) - { - ft_rhs[k] = hpx::dataflow( - 
hpx::annotated_function(gemv, "prediction_tiled"), - ft_tiles[k * n_tiles + m], - ft_vector[m], - ft_rhs[k], - N_row, - N_col, - Blas_add, - Blas_no_trans); - } - } -} - -void symmetric_matrix_matrix_diagonal_tiled( - Tiled_matrix &ft_tiles, Tiled_vector &ft_vector, int N, int M, std::size_t n_tiles, std::size_t m_tiles) -{ - for (std::size_t i = 0; i < m_tiles; ++i) - { - for (std::size_t n = 0; n < n_tiles; ++n) - { // Compute inner product to obtain diagonal elements of - // V^T * V <=> cross(K) * K^-1 * cross(K)^T - ft_vector[i] = hpx::dataflow( - hpx::annotated_function(dot_diag_syrk, "posterior_tiled"), - ft_tiles[n * m_tiles + i], - ft_vector[i], - N, - M); - } - } -} - -void symmetric_matrix_matrix_tiled( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_result, int N, int M, std::size_t n_tiles, std::size_t m_tiles) -{ - for (std::size_t c = 0; c < m_tiles; c++) - { - for (std::size_t k = 0; k < m_tiles; k++) - { - for (std::size_t m = 0; m < n_tiles; m++) - { - // (SYRK for (c == k) possible) - // GEMM: C = C - A^T * B - ft_result[c * m_tiles + k] = hpx::dataflow( - hpx::annotated_function(&gemm, "triangular_solve_tiled_matrix"), - ft_tiles[m * m_tiles + c], - ft_tiles[m * m_tiles + k], - ft_result[c * m_tiles + k], - N, - M, - M, - Blas_trans, - Blas_no_trans); - } - } - } -} - -void vector_difference_tiled(Tiled_vector &ft_minuend, Tiled_vector &ft_subtrahend, int M, std::size_t m_tiles) -{ - for (std::size_t i = 0; i < m_tiles; i++) - { - ft_subtrahend[i] = - hpx::dataflow(hpx::annotated_function(&axpy, "uncertainty_tiled"), ft_minuend[i], ft_subtrahend[i], M); - } -} +GPRAT_NS_BEGIN -void matrix_diagonal_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_vector, int M, std::size_t m_tiles) +namespace cpu { - for (std::size_t i = 0; i < m_tiles; i++) - { - ft_vector[i] = hpx::dataflow( - hpx::annotated_function(get_matrix_diagonal, "uncertainty_tiled"), ft_tiles[i * m_tiles + i], M); - } -} -void compute_loss_tiled(Tiled_matrix &ft_tiles, - Tiled_vector &ft_alpha, - Tiled_vector &ft_y, - hpx::shared_future &loss, - int N, - std::size_t n_tiles) +namespace impl { - std::vector> loss_tiled; - loss_tiled.reserve(n_tiles); - for (std::size_t k = 0; k < n_tiles; k++) - { - loss_tiled.push_back(hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&compute_loss), "loss_tiled"), - ft_tiles[k * n_tiles + k], - ft_alpha[k], - ft_y[k], - N)); - } - - loss = hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&add_losses), "loss_tiled"), loss_tiled, N, n_tiles); -} -void update_hyperparameter_tiled( - const Tiled_matrix &ft_invK, - const Tiled_matrix &ft_gradK_param, - const Tiled_vector &ft_alpha, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - int N, +void update_parameters( + const AdamParams &adam_params, + SEKParams &sek_params, + std::size_t N, std::size_t n_tiles, std::size_t iter, - std::size_t param_idx) + std::size_t param_idx, + double trace, + double dot, + bool jitter, + double factor) { - /* - * PART 1: - * Compute gradient = 0.5 * ( trace(inv(K) * grad(K)_param) + y^T * inv(K) * grad(K)_param * inv(K) * y ) - * - * 1: Compute trace(inv(K) * grad(K)_param) - * 2: Compute y^T * inv(K) * grad(K)_param * inv(K) * y - * - * Update parameter: - * 3: Update moments - * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T - * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 - * 4: Adam step: - * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) - * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) - */ - hpx::shared_future trace = 
hpx::make_ready_future(0.0); - hpx::shared_future dot = hpx::make_ready_future(0.0); - bool jitter = false; - double factor = 1.0; - if (param_idx == 0 || param_idx == 1) // 0: lengthscale; 1: vertical_lengthscale - { - Tiled_vector diag_tiles; // Diagonal tiles - Tiled_vector inter_alpha; // Intermediate result - // Preallocate memory - inter_alpha.reserve(n_tiles); - diag_tiles.reserve(n_tiles); - // Asynchrnonous initialization - for (std::size_t d = 0; d < n_tiles; d++) - { - diag_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble"), N)); - inter_alpha.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble"), N)); - } - - //////////////////////////////////// - // PART 1: Compute gradient - // Step 1: Compute trace(inv(K)*grad_K_param) - // Compute diagonal tiles of inv(K) * grad(K)_param - for (std::size_t i = 0; i < n_tiles; ++i) - { - for (std::size_t j = 0; j < n_tiles; ++j) - { - diag_tiles[i] = hpx::dataflow( - hpx::annotated_function(dot_diag_gemm, "trace"), - ft_invK[i * n_tiles + j], - ft_gradK_param[j * n_tiles + i], - diag_tiles[i], - N, - N); - } - } - // Compute the trace of the diagonal tiles - for (std::size_t j = 0; j < n_tiles; ++j) - { - trace = - hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&compute_trace), "trace"), diag_tiles[j], trace); - } - // Not sure if can be done this way - // Step 2: Compute alpha^T * grad(K)_param * alpha (with alpha = inv(K) * y) - // Compute inter_alpha = grad(K)_param * alpha - for (std::size_t k = 0; k < n_tiles; k++) - { - for (std::size_t m = 0; m < n_tiles; m++) - { - inter_alpha[k] = hpx::dataflow( - hpx::annotated_function(gemv, "gemv"), - ft_gradK_param[k * n_tiles + m], - ft_alpha[m], - inter_alpha[k], - N, - N, - Blas_add, - Blas_no_trans); - } - } - // Compute alpha^T * inter_alpha - for (std::size_t j = 0; j < n_tiles; ++j) - { - dot = hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&compute_dot), "grad_right_tiled"), - inter_alpha[j], - ft_alpha[j], - dot); - } - } - else if (param_idx == 2) // @2: noise_variance - { - jitter = true; - //////////////////////////////////// - // PART 1: Compute gradient - // Step 1: Compute the trace of inv(K) * noise_variance - for (std::size_t j = 0; j < n_tiles; ++j) - { - trace = hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&compute_trace_diag), "grad_left_tiled"), - ft_invK[j * n_tiles + j], - trace, - N); - } - //////////////////////////////////// - // Step 2: Compute the alpha^T * alpha * noise_variance - for (std::size_t j = 0; j < n_tiles; ++j) - { - dot = hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&compute_dot), "grad_right_tiled"), - ft_alpha[j], - ft_alpha[j], - dot); - } - - factor = compute_sigmoid(to_unconstrained(sek_params.noise_variance, true)); - } - else - { - // Throw an exception for invalid param_idx - throw std::invalid_argument("Invalid param_idx"); - } - // Compute gradient = trace + dot - double gradient = - factor - * hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&compute_gradient), "update_hyperparam"), trace, dot, N, n_tiles) - .get(); + double gradient = factor * compute_gradient(trace, dot, N, n_tiles); //////////////////////////////////// // PART 2: Update parameter @@ -437,14 +39,14 @@ void update_hyperparameter_tiled( double unconstrained_param = to_unconstrained(sek_params.get_param(param_idx), jitter); // Adam step update with unconstrained parameter // compute beta_t inside - double updated_param = adam_step( - unconstrained_param, - adam_params, - 
sek_params.m_T[param_idx],
-        sek_params.w_T[param_idx],
-        static_cast(iter));
+    double updated_param =
+        adam_step(unconstrained_param, adam_params, sek_params.m_T[param_idx], sek_params.w_T[param_idx], iter);
     // Transform hyperparameter back to constrained form
     sek_params.set_param(param_idx, to_constrained(updated_param, jitter));
 }
+} // namespace impl
+
 } // end of namespace cpu
+
+GPRAT_NS_END
diff --git a/core/src/gprat.cpp b/core/src/gprat.cpp
new file mode 100644
index 00000000..969fdb9e
--- /dev/null
+++ b/core/src/gprat.cpp
@@ -0,0 +1,285 @@
+#include "gprat/gprat.hpp"
+
+#include "gprat/cpu/gp_functions.hpp"
+#include "gprat/utils.hpp"
+
+#if GPRAT_WITH_CUDA
+#include "gprat/gpu/gp_functions.cuh"
+#endif
+
+GPRAT_NS_BEGIN
+
+GP_data::GP_data(const std::string &f_path, std::size_t n, std::size_t n_reg) :
+    file_path(f_path),
+    n_samples(n),
+    n_regressors(n_reg)
+{
+    data = load_data(f_path, n, n_reg - 1);
+}
+
+GP::GP(std::vector<double> input,
+    std::vector<double> output,
+    std::size_t n_tiles,
+    std::size_t n_tile_size,
+    std::size_t n_regressors,
+    const std::vector<double> &kernel_hyperparams,
+    std::vector<bool> trainable_bool,
+    std::shared_ptr target) :
+    training_input_(std::move(input)),
+    training_output_(std::move(output)),
+    n_tiles_(n_tiles),
+    n_tile_size_(n_tile_size),
+    trainable_params_(std::move(trainable_bool)),
+    target_(std::move(target)),
+    n_reg(n_regressors),
+    kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2])
+{ }
+
+GP::GP(std::vector<double> input,
+    std::vector<double> output,
+    std::size_t n_tiles,
+    std::size_t n_tile_size,
+    std::size_t n_regressors,
+    const std::vector<double> &kernel_hyperparams,
+    std::vector<bool> trainable_bool) :
+    training_input_(std::move(input)),
+    training_output_(std::move(output)),
+    n_tiles_(n_tiles),
+    n_tile_size_(n_tile_size),
+    trainable_params_(std::move(trainable_bool)),
+    target_(std::make_shared<CPU>()),
+    n_reg(n_regressors),
+    kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2])
+{ }
+
+GP::GP(std::vector<double> input,
+    std::vector<double> output,
+    std::size_t n_tiles,
+    std::size_t n_tile_size,
+    std::size_t n_regressors,
+    const std::vector<double> &kernel_hyperparams,
+    std::vector<bool> trainable_bool,
+    int gpu_id,
+    int n_streams) :
+    training_input_(std::move(input)),
+    training_output_(std::move(output)),
+    n_tiles_(n_tiles),
+    n_tile_size_(n_tile_size),
+    trainable_params_(std::move(trainable_bool)),
+#if GPRAT_WITH_CUDA
+    target_(std::make_shared<CUDA_GPU>(CUDA_GPU(gpu_id, n_streams))),
+#else
+    target_(std::make_shared<CPU>()),
+#endif
+    n_reg(n_regressors),
+    kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2])
+{
+#if !GPRAT_WITH_CUDA
+    throw std::runtime_error(
+        "Cannot create GP object using CUDA for computation. "
+        "CUDA is not available because GPRat has been compiled without CUDA. "
+        "Remove arguments gpu_id (" + std::to_string(gpu_id) + ") and n_streams (" + std::to_string(n_streams)
+        + ") to perform computations on the CPU.");
+#endif
+}
+
+std::string GP::repr() const
+{
+    std::ostringstream oss;
+    oss << std::fixed << std::setprecision(12);
+    oss << "Kernel_Params: [lengthscale=" << kernel_params.lengthscale << ", vertical_lengthscale="
+        << kernel_params.vertical_lengthscale << ", noise_variance=" << kernel_params.noise_variance
+        << ", n_regressors=" << n_reg << "], Trainable_Params: [trainable_params l=" << trainable_params_[0]
+        << ", trainable_params v=" << trainable_params_[1] << ", trainable_params n=" << trainable_params_[2]
+        << "], Target: [" << target_->repr() << "], n_tiles=" << n_tiles_ << ", n_tile_size=" << n_tile_size_;
+    return oss.str();
+}
+
+std::vector<double> GP::get_training_input() const { return training_input_; }
+
+std::vector<double> GP::get_training_output() const { return training_output_; }
+
+std::vector<double> GP::predict(const std::vector<double> &test_input, std::size_t m_tiles, std::size_t m_tile_size)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        return gpu::predict(
+            training_input_,
+            training_output_,
+            test_input,
+            kernel_params,
+            n_tiles_,
+            n_tile_size_,
+            m_tiles,
+            m_tile_size,
+            n_reg,
+            *std::dynamic_pointer_cast<CUDA_GPU>(target_));
+    }
+#endif
+
+    tiled_scheduler_local scheduler;
+    return cpu::predict(
+        scheduler,
+        training_input_,
+        training_output_,
+        test_input,
+        kernel_params,
+        n_tiles_,
+        n_tile_size_,
+        m_tiles,
+        m_tile_size,
+        n_reg);
+}
+
+std::vector<std::vector<double>>
+GP::predict_with_uncertainty(const std::vector<double> &test_input, std::size_t m_tiles, std::size_t m_tile_size)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        return gpu::predict_with_uncertainty(
+            training_input_,
+            training_output_,
+            test_input,
+            kernel_params,
+            n_tiles_,
+            n_tile_size_,
+            m_tiles,
+            m_tile_size,
+            n_reg,
+            *std::dynamic_pointer_cast<CUDA_GPU>(target_));
+    }
+#endif
+    tiled_scheduler_local scheduler;
+    return cpu::predict_with_uncertainty(
+        scheduler,
+        training_input_,
+        training_output_,
+        test_input,
+        kernel_params,
+        n_tiles_,
+        n_tile_size_,
+        m_tiles,
+        m_tile_size,
+        n_reg);
+}
+
+std::vector<std::vector<double>>
+GP::predict_with_full_cov(const std::vector<double> &test_input, std::size_t m_tiles, std::size_t m_tile_size)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        return gpu::predict_with_full_cov(
+            training_input_,
+            training_output_,
+            test_input,
+            kernel_params,
+            n_tiles_,
+            n_tile_size_,
+            m_tiles,
+            m_tile_size,
+            n_reg,
+            *std::dynamic_pointer_cast<CUDA_GPU>(target_));
+    }
+#endif
+    tiled_scheduler_local scheduler;
+    return cpu::predict_with_full_cov(
+        scheduler,
+        training_input_,
+        training_output_,
+        test_input,
+        kernel_params,
+        n_tiles_,
+        n_tile_size_,
+        m_tiles,
+        m_tile_size,
+        n_reg);
+}
+
+std::vector<double> GP::optimize(const AdamParams &adam_params)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        std::cerr << "GP::optimize has not been implemented for the GPU.\n"
+                  << "Instead, this operation executes the CPU implementation." << std::endl;
+    }
+#endif
+    tiled_scheduler_local scheduler;
+    return cpu::optimize(
+        scheduler,
+        training_input_,
+        training_output_,
+        n_tiles_,
+        n_tile_size_,
+        n_reg,
+        adam_params,
+        kernel_params,
+        trainable_params_);
+}
+
+double GP::optimize_step(AdamParams &adam_params, std::size_t iter)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        std::cerr << "GP::optimize_step has not been implemented for the GPU.\n"
+                  << "Instead, this operation executes the CPU implementation."
<< std::endl; + } +#endif + tiled_scheduler_local scheduler; + return cpu::optimize_step( + scheduler, + training_input_, + training_output_, + n_tiles_, + n_tile_size_, + n_reg, + adam_params, + kernel_params, + trainable_params_, + iter); +} + +double GP::calculate_loss() +{ +#if GPRAT_WITH_CUDA + if (target_->is_gpu()) + { + return gpu::compute_loss( + training_input_, + training_output_, + kernel_params, + n_tiles_, + n_tile_size_, + n_reg, + *std::dynamic_pointer_cast(target_)); + } +#endif + tiled_scheduler_local scheduler; + return cpu::calculate_loss( + scheduler, training_input_, training_output_, kernel_params, n_tiles_, n_tile_size_, n_reg); +} + +std::vector> GP::cholesky() +{ +#if GPRAT_WITH_CUDA + if (target_->is_gpu()) + { + return gpu::cholesky( + training_input_, + kernel_params, + n_tiles_, + n_tile_size_, + n_reg, + *std::dynamic_pointer_cast(target_)); + } +#endif + tiled_scheduler_local sched; + return cpu::cholesky(sched, training_input_, kernel_params, n_tiles_, n_tile_size_, n_reg); +} + +GPRAT_NS_END diff --git a/core/src/gprat_c.cpp b/core/src/gprat_c.cpp deleted file mode 100644 index c93e792c..00000000 --- a/core/src/gprat_c.cpp +++ /dev/null @@ -1,363 +0,0 @@ -#include "gprat_c.hpp" - -#include "cpu/gp_functions.hpp" -#include "utils_c.hpp" -#include - -#if GPRAT_WITH_CUDA -#include "gpu/gp_functions.cuh" -#endif - -// namespace for GPRat library entities -namespace gprat -{ - -GP_data::GP_data(const std::string &f_path, int n, int n_reg) : - file_path(f_path), - n_samples(n), - n_regressors(n_reg) -{ - data = utils::load_data(f_path, n, n_reg - 1); -} - -GP::GP(std::vector input, - std::vector output, - int n_tiles, - int n_tile_size, - int n_regressors, - std::vector kernel_hyperparams, - std::vector trainable_bool, - std::shared_ptr target) : - training_input_(input), - training_output_(output), - n_tiles_(n_tiles), - n_tile_size_(n_tile_size), - trainable_params_(trainable_bool), - target_(target), - n_reg(n_regressors), - kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2]) -{ } - -GP::GP(std::vector input, - std::vector output, - int n_tiles, - int n_tile_size, - int n_regressors, - std::vector kernel_hyperparams, - std::vector trainable_bool) : - training_input_(input), - training_output_(output), - n_tiles_(n_tiles), - n_tile_size_(n_tile_size), - trainable_params_(trainable_bool), - target_(std::make_shared()), - n_reg(n_regressors), - kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2]) -{ } - -GP::GP(std::vector input, - std::vector output, - int n_tiles, - int n_tile_size, - int n_regressors, - std::vector kernel_hyperparams, - std::vector trainable_bool, - int gpu_id, - int n_streams) : - training_input_(input), - training_output_(output), - n_tiles_(n_tiles), - n_tile_size_(n_tile_size), - trainable_params_(trainable_bool), -#if GPRAT_WITH_CUDA - target_(std::make_shared(CUDA_GPU(gpu_id, n_streams))), -#else - target_(std::make_shared()), -#endif - n_reg(n_regressors), - kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2]) -{ -#if !GPRAT_WITH_CUDA - throw std::runtime_error( - "Cannot create GP object using CUDA for computation. " - "CUDA is not available because GPRat has been compiled without CUDA. 
" - "Remove arguments gpu_id (" - + std::to_string(gpu_id) + ") and n_streams (" + std::to_string(n_streams) - + ") to perform computations on the CPU."); -#endif -} - -std::string GP::repr() const -{ - std::ostringstream oss; - oss << std::fixed << std::setprecision(12); - oss << "Kernel_Params: [lengthscale=" << kernel_params.lengthscale << ", vertical_lengthscale=" - << kernel_params.vertical_lengthscale << ", noise_variance=" << kernel_params.noise_variance - << ", n_regressors=" << n_reg << "], Trainable_Params: [trainable_params l=" << trainable_params_[0] - << ", trainable_params v=" << trainable_params_[1] << ", trainable_params n=" << trainable_params_[2] - << "], Target: [" << target_->repr() << "], n_tiles=" << n_tiles_ << ", n_tile_size=" << n_tile_size_; - return oss.str(); -} - -std::vector GP::get_training_input() const { return training_input_; } - -std::vector GP::get_training_output() const { return training_output_; } - -std::vector GP::predict(const std::vector &test_input, int m_tiles, int m_tile_size) -{ - return hpx::async( - [this, &test_input, m_tiles, m_tile_size]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::predict( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::predict( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); - } -#else - return cpu::predict( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); -#endif - }) - .get(); -} - -std::vector> -GP::predict_with_uncertainty(const std::vector &test_input, int m_tiles, int m_tile_size) -{ - return hpx::async( - [this, &test_input, m_tiles, m_tile_size]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::predict_with_uncertainty( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::predict_with_uncertainty( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); - } -#else - return cpu::predict_with_uncertainty( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); -#endif - }) - .get(); -} - -std::vector> -GP::predict_with_full_cov(const std::vector &test_input, int m_tiles, int m_tile_size) -{ - return hpx::async( - [this, &test_input, m_tiles, m_tile_size]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::predict_with_full_cov( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::predict_with_full_cov( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); - } -#else - return cpu::predict_with_full_cov( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); -#endif - }) - .get(); -} - -std::vector GP::optimize(const gprat_hyper::AdamParams &adam_params) -{ - return hpx::async( - [this, &adam_params]() - { -#if GPRAT_WITH_CUDA - if 
(target_->is_gpu()) - { - std::cerr << "GP::optimze_step has not been implemented for the GPU.\n" - << "Instead, this operation executes the CPU implementation." << std::endl; - } -#endif - return cpu::optimize( - training_input_, - training_output_, - n_tiles_, - n_tile_size_, - n_reg, - adam_params, - kernel_params, - trainable_params_); - }) - .get(); -} - -double GP::optimize_step(gprat_hyper::AdamParams &adam_params, int iter) -{ - return hpx::async( - [this, &adam_params, iter]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - std::cerr << "GP::optimze_step has not been implemented for the GPU.\n" - << "Instead, this operation executes the CPU implementation." << std::endl; - } -#endif - return cpu::optimize_step( - training_input_, - training_output_, - n_tiles_, - n_tile_size_, - n_reg, - adam_params, - kernel_params, - trainable_params_, - iter); - }) - .get(); -} - -double GP::calculate_loss() -{ - return hpx::async( - [this]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::compute_loss( - training_input_, - training_output_, - kernel_params, - n_tiles_, - n_tile_size_, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::compute_loss( - training_input_, training_output_, kernel_params, n_tiles_, n_tile_size_, n_reg); - } -#else - return cpu::compute_loss( - training_input_, training_output_, kernel_params, n_tiles_, n_tile_size_, n_reg); -#endif - }) - .get(); -} - -std::vector> GP::cholesky() -{ - return hpx::async( - [this]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::cholesky( - training_input_, - kernel_params, - n_tiles_, - n_tile_size_, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::cholesky(training_input_, kernel_params, n_tiles_, n_tile_size_, n_reg); - } -#else - return cpu::cholesky(training_input_, kernel_params, n_tiles_, n_tile_size_, n_reg); -#endif - }) - .get(); -} - -} // namespace gprat diff --git a/core/src/gpu/adapter_cublas.cu b/core/src/gpu/adapter_cublas.cu index 61227e8d..c3833aac 100644 --- a/core/src/gpu/adapter_cublas.cu +++ b/core/src/gpu/adapter_cublas.cu @@ -1,4 +1,6 @@ -#include "gpu/adapter_cublas.cuh" +#include "gprat/gpu/adapter_cublas.cuh" + +GPRAT_NS_BEGIN // frequently used names using hpx::cuda::experimental::check_cuda_error; @@ -411,3 +413,5 @@ dot(cublasHandle_t cublas, return hpx::make_ready_future(result); } + +GPRAT_NS_END diff --git a/core/src/gpu/cuda_kernels.cu b/core/src/gpu/cuda_kernels.cu index 37378f37..5e77ec6a 100644 --- a/core/src/gpu/cuda_kernels.cu +++ b/core/src/gpu/cuda_kernels.cu @@ -1,6 +1,8 @@ -#include "gpu/cuda_kernels.cuh" +#include "gprat/gpu/cuda_kernels.cuh" -#include "gpu/cuda_utils.cuh" +#include "gprat/gpu/cuda_utils.cuh" + +GPRAT_NS_BEGIN __global__ void transpose(double *transposed, double *original, std::size_t width, std::size_t height) { @@ -25,3 +27,5 @@ __global__ void transpose(double *transposed, double *original, std::size_t widt transposed[index_out] = block[threadIdx.x][threadIdx.y]; } } + +GPRAT_NS_END diff --git a/core/src/gpu/gp_algorithms.cu b/core/src/gpu/gp_algorithms.cu index 39407ed6..b9125e57 100644 --- a/core/src/gpu/gp_algorithms.cu +++ b/core/src/gpu/gp_algorithms.cu @@ -1,14 +1,18 @@ -#include "gpu/gp_algorithms.cuh" +#include "gprat/gpu/gp_algorithms.cuh" + +#include "gprat/gpu/cuda_kernels.cuh" +#include "gprat/gpu/cuda_utils.cuh" +#include "gprat/gpu/gp_optimizer.cuh" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" +#include "gprat/tile_data.hpp" -#include 
"gp_kernels.hpp" -#include "gpu/cuda_kernels.cuh" -#include "gpu/cuda_utils.cuh" -#include "gpu/gp_optimizer.cuh" -#include "target.hpp" #include #include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -20,7 +24,7 @@ __global__ void gen_tile_covariance_kernel( const std::size_t n_regressors, const std::size_t tile_row, const std::size_t tile_column, - const gprat_hyper::SEKParams sek_params) + const SEKParams sek_params) { // Compute the global indices of the thread unsigned int i = blockIdx.y * blockDim.y + threadIdx.y; @@ -59,8 +63,8 @@ double *gen_tile_covariance(const double *d_input, const std::size_t tile_column, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { double *d_tile; @@ -85,7 +89,7 @@ __global__ void gen_tile_full_prior_covariance_kernel( const std::size_t n_regressors, const std::size_t tile_row, const std::size_t tile_column, - const gprat_hyper::SEKParams sek_params) + const SEKParams sek_params) { unsigned int i = blockIdx.y * blockDim.y + threadIdx.y; unsigned int j = blockIdx.x * blockDim.x + threadIdx.x; @@ -117,8 +121,8 @@ double *gen_tile_full_prior_covariance( const std::size_t tile_colums, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { double *d_tile; @@ -143,7 +147,7 @@ __global__ void gen_tile_prior_covariance_kernel( const std::size_t n_regressors, const std::size_t tile_row, const std::size_t tile_column, - const gprat_hyper::SEKParams sek_params) + const SEKParams sek_params) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -174,8 +178,8 @@ double *gen_tile_prior_covariance( const std::size_t tile_column, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { double *d_tile; @@ -202,7 +206,7 @@ __global__ void gen_tile_cross_covariance_kernel( const std::size_t tile_row, const std::size_t tile_column, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params) + const SEKParams sek_params) { unsigned int i = blockIdx.y * blockDim.y + threadIdx.y; unsigned int j = blockIdx.x * blockDim.x + threadIdx.x; @@ -235,8 +239,8 @@ double *gen_tile_cross_covariance( const std::size_t n_row_tile_size, const std::size_t n_column_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { double *d_tile; @@ -265,7 +269,7 @@ double *gen_tile_cross_covariance( hpx::shared_future gen_tile_cross_cov_T(std::size_t n_row_tile_size, std::size_t n_column_tile_size, const hpx::shared_future f_cross_covariance_tile, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { double *transposed; check_cuda_error(cudaMalloc(&transposed, n_row_tile_size * n_column_tile_size * sizeof(double))); @@ -293,8 +297,7 @@ __global__ void gen_tile_output_kernel(double *tile, const double *output, std:: } } -double * -gen_tile_output(const std::size_t row, const std::size_t n_tile_size, const double *d_output, gprat::CUDA_GPU &gpu) +double *gen_tile_output(const std::size_t row, const std::size_t n_tile_size, const double *d_output, CUDA_GPU &gpu) { dim3 threads_per_block(256); dim3 n_blocks((n_tile_size + 255) / 256); @@ -311,7 +314,7 @@ gen_tile_output(const std::size_t row, const std::size_t n_tile_size, 
const doub return d_tile; } -double *gen_tile_zeros(std::size_t n_tile_size, gprat::CUDA_GPU &gpu) +double *gen_tile_zeros(std::size_t n_tile_size, CUDA_GPU &gpu) { double *d_tile; cudaStream_t stream = gpu.next_stream(); @@ -345,8 +348,8 @@ std::vector> assemble_tiled_covariance_matrix( const std::size_t n_tiles, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { std::vector> d_tiles(n_tiles * n_tiles); @@ -369,8 +372,8 @@ std::vector> assemble_tiled_covariance_matrix( return d_tiles; } -std::vector> assemble_alpha_tiles( - const double *d_output, const std::size_t n_tiles, const std::size_t n_tile_size, gprat::CUDA_GPU &gpu) +std::vector> +assemble_alpha_tiles(const double *d_output, const std::size_t n_tiles, const std::size_t n_tile_size, CUDA_GPU &gpu) { std::vector> alpha_tiles(n_tiles); for (std::size_t i = 0; i < n_tiles; i++) @@ -390,8 +393,8 @@ std::vector> assemble_cross_covariance_tiles( const std::size_t m_tile_size, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { std::vector> cross_covariance_tiles; cross_covariance_tiles.resize(m_tiles * n_tiles); @@ -416,7 +419,7 @@ std::vector> assemble_cross_covariance_tiles( } std::vector> -assemble_tiles_with_zeros(std::size_t n_tile_size, std::size_t n_tiles, gprat::CUDA_GPU &gpu) +assemble_tiles_with_zeros(std::size_t n_tile_size, std::size_t n_tiles, CUDA_GPU &gpu) { std::vector> tiles(n_tiles); for (std::size_t i = 0; i < n_tiles; i++) @@ -431,8 +434,8 @@ std::vector> assemble_prior_K_tiles( const std::size_t m_tiles, const std::size_t m_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { std::vector> d_prior_K_tiles; d_prior_K_tiles.resize(m_tiles); @@ -449,8 +452,8 @@ std::vector> assemble_prior_K_tiles_full( const std::size_t m_tiles, const std::size_t m_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { std::vector> d_prior_K_tiles(m_tiles * m_tiles); for (std::size_t i = 0; i < m_tiles; i++) @@ -483,7 +486,7 @@ std::vector> assemble_t_cross_covariance_tiles( const std::size_t m_tiles, const std::size_t n_tile_size, const std::size_t m_tile_size, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { std::vector> d_t_cross_covariance_tiles(m_tiles * n_tiles); for (std::size_t i = 0; i < m_tiles; i++) @@ -502,7 +505,7 @@ std::vector> assemble_t_cross_covariance_tiles( } std::vector> assemble_y_tiles( - const double *d_training_output, const std::size_t n_tiles, const std::size_t n_tile_size, gprat::CUDA_GPU &gpu) + const double *d_training_output, const std::size_t n_tiles, const std::size_t n_tile_size, CUDA_GPU &gpu) { std::vector> d_y_tiles(n_tiles); for (std::size_t i = 0; i < n_tiles; i++) @@ -512,10 +515,8 @@ std::vector> assemble_y_tiles( return d_y_tiles; } -std::vector copy_tiled_vector_to_host_vector(std::vector> &d_tiles, - std::size_t n_tile_size, - std::size_t n_tiles, - gprat::CUDA_GPU &gpu) +std::vector copy_tiled_vector_to_host_vector( + std::vector> &d_tiles, std::size_t n_tile_size, std::size_t n_tiles, CUDA_GPU &gpu) { std::vector h_vector(n_tiles * n_tile_size); std::vector streams(n_tiles); @@ -533,13 +534,13 @@ std::vector 
copy_tiled_vector_to_host_vector(std::vector> move_lower_tiled_matrix_to_host( +std::vector> move_lower_tiled_matrix_to_host( const std::vector> &d_tiles, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { - std::vector> h_tiles(n_tiles * n_tiles); + std::vector> h_tiles(n_tiles * n_tiles); std::vector streams(n_tiles * (n_tiles + 1) / 2); for (std::size_t i = 0; i < n_tiles; ++i) @@ -547,7 +548,7 @@ std::vector> move_lower_tiled_matrix_to_host( for (std::size_t j = 0; j <= i; ++j) { streams[i] = gpu.next_stream(); - h_tiles[i * n_tiles + j].resize(n_tile_size * n_tile_size); + h_tiles[i * n_tiles + j] = mutable_tile_data(n_tile_size * n_tile_size); check_cuda_error(cudaMemcpyAsync( h_tiles[i * n_tiles + j].data(), d_tiles[i * n_tiles + j].get(), @@ -574,3 +575,5 @@ void free_lower_tiled_matrix(const std::vector> &d_ } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gpu/gp_functions.cu b/core/src/gpu/gp_functions.cu index 8f5e341f..a4485992 100644 --- a/core/src/gpu/gp_functions.cu +++ b/core/src/gpu/gp_functions.cu @@ -1,14 +1,18 @@ -#include "gpu/gp_functions.cuh" +#include "gprat/gpu/gp_functions.cuh" + +#include "gprat/gpu/cuda_utils.cuh" +#include "gprat/gpu/gp_algorithms.cuh" +#include "gprat/gpu/tiled_algorithms.cuh" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" +#include "gprat/tile_data.hpp" -#include "gp_kernels.hpp" -#include "gpu/cuda_utils.cuh" -#include "gpu/gp_algorithms.cuh" -#include "gpu/tiled_algorithms.cuh" -#include "target.hpp" #include #include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -16,13 +20,13 @@ std::vector predict(const std::vector &h_training_input, const std::vector &h_training_output, const std::vector &h_test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -65,13 +69,13 @@ std::vector> predict_with_uncertainty( const std::vector &h_training_input, const std::vector &h_training_output, const std::vector &h_test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -150,13 +154,13 @@ std::vector> predict_with_full_cov( const std::vector &h_training_input, const std::vector &h_training_output, const std::vector &h_test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -229,11 +233,11 @@ std::vector> predict_with_full_cov( double compute_loss(const std::vector &h_training_input, const std::vector &h_training_output, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -279,10 +283,10 @@ optimize(const std::vector &training_input, int n_tiles, int n_tile_size, int n_regressors, - const gprat_hyper::AdamParams &adam_params, - const gprat_hyper::SEKParams &sek_params, + const AdamParams &adam_params, + const SEKParams &sek_params, std::vector trainable_params, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { throw std::logic_error("Function not implemented for GPU"); // return std::vector>(); @@ -293,23 +297,23 @@ double optimize_step(const std::vector 
&training_input, int n_tiles, int n_tile_size, int n_regressors, - gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, + AdamParams &adam_params, + SEKParams &sek_params, std::vector trainable_params, int iter, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { throw std::logic_error("Function not implemented for GPU"); // return 0.0; } -std::vector> +std::vector> cholesky(const std::vector &h_training_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -323,7 +327,7 @@ cholesky(const std::vector &h_training_input, right_looking_cholesky_tiled(d_tiles, n_tile_size, n_tiles, gpu, cusolver); // Copy tiled matrix to host - std::vector> h_tiles = move_lower_tiled_matrix_to_host(d_tiles, n_tile_size, n_tiles, gpu); + auto h_tiles = move_lower_tiled_matrix_to_host(d_tiles, n_tile_size, n_tiles, gpu); cudaFree(d_training_input); destroy(cusolver); @@ -333,3 +337,5 @@ cholesky(const std::vector &h_training_input, } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gpu/gp_optimizer.cu b/core/src/gpu/gp_optimizer.cu index 53cca8bb..ea465261 100644 --- a/core/src/gpu/gp_optimizer.cu +++ b/core/src/gpu/gp_optimizer.cu @@ -1,8 +1,12 @@ -#include "gpu/gp_optimizer.cuh" +#include "gprat/gpu/gp_optimizer.cuh" -#include "gpu/adapter_cublas.cuh" -#include "gpu/cuda_kernels.cuh" -#include "gpu/cuda_utils.cuh" +#include "gprat/gpu/adapter_cublas.cuh" +#include "gprat/gpu/cuda_kernels.cuh" +#include "gprat/gpu/cuda_utils.cuh" + +#include + +GPRAT_NS_BEGIN namespace gpu { @@ -36,7 +40,7 @@ double compute_sigmoid(const double parameter) { return 1.0 / (1.0 + exp(-parame double compute_covariance_distance(std::size_t i_global, std::size_t j_global, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &i_input, const std::vector &j_input) { @@ -58,7 +62,7 @@ std::vector gen_tile_distance( std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &input) { std::size_t i_global, j_global; @@ -85,7 +89,7 @@ std::vector gen_tile_covariance_with_distance( std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists) { std::size_t i_global, j_global; @@ -117,7 +121,7 @@ gen_tile_grad_v(std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists) { // Initialize tile @@ -140,7 +144,7 @@ gen_tile_grad_l(std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists) { // Initialize tile @@ -176,7 +180,7 @@ std::vector gen_tile_grad_v_trans(std::size_t N, const std::vector -gen_tile_grad_l_trans(std::size_t N, const hpx::shared_future f_grad_l_tile, gprat::CUDA_GPU &gpu) +gen_tile_grad_l_trans(std::size_t N, const hpx::shared_future f_grad_l_tile, CUDA_GPU &gpu) { double *transposed; check_cuda_error(cudaMalloc(&transposed, N * N * sizeof(double))); @@ -209,7 +213,7 @@ compute_loss(const hpx::shared_future &K_diag_tile, const hpx::shared_future &alpha_tile, const hpx::shared_future &y_tile, std::size_t N, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { auto [cublas, stream] = gpu.next_cublas_handle(); @@ -233,7 +237,7 @@ add_losses(const 
std::vector> &losses, std::size_t n_ { l += losses[i].get(); } - l += n_tile_size * n_tiles * log(2.0 * M_PI); + l += n_tile_size * n_tiles * log(2.0 * std::numbers::pi); return hpx::make_ready_future(0.5 * l / (n_tile_size * n_tiles)); } @@ -276,8 +280,8 @@ double update_second_moment(const double &gradient, double v_T, const double &be hpx::shared_future update_param(const double unconstrained_hyperparam, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, double m_T, double v_T, const std::vector beta1_T, @@ -339,11 +343,8 @@ sum_gradright(const std::vector &inter_alpha, const std::vector return 0.0; } -double sum_noise_gradleft(const std::vector &ft_invK, - double grad, - gprat_hyper::SEKParams sek_params, - std::size_t N, - std::size_t n_tiles) +double sum_noise_gradleft( + const std::vector &ft_invK, double grad, SEKParams sek_params, std::size_t N, std::size_t n_tiles) { double noise_der = compute_sigmoid(to_unconstrained(sek_params.noise_variance, true)); for (std::size_t i = 0; i < N; ++i) @@ -353,8 +354,7 @@ double sum_noise_gradleft(const std::vector &ft_invK, return std::move(grad); } -double -sum_noise_gradright(const std::vector &alpha, double grad, gprat_hyper::SEKParams sek_params, std::size_t N) +double sum_noise_gradright(const std::vector &alpha, double grad, SEKParams sek_params, std::size_t N) { // double noise_der = // compute_sigmoid(to_unconstrained(sek_params.noise_variance, true)); @@ -364,3 +364,5 @@ sum_noise_gradright(const std::vector &alpha, double grad, gprat_hyper:: } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gpu/gp_uncertainty.cu b/core/src/gpu/gp_uncertainty.cu index a7919457..6cc7f50b 100644 --- a/core/src/gpu/gp_uncertainty.cu +++ b/core/src/gpu/gp_uncertainty.cu @@ -1,16 +1,19 @@ -#include "gpu/gp_uncertainty.cuh" +#include "gprat/gpu/gp_uncertainty.cuh" + +#include "gprat/gpu/cuda_utils.cuh" +#include "gprat/target.hpp" -#include "gpu/cuda_utils.cuh" -#include "target.hpp" #include +GPRAT_NS_BEGIN + using hpx::cuda::experimental::check_cuda_error; namespace gpu { -hpx::shared_future diag_posterior( - const hpx::shared_future A, const hpx::shared_future B, std::size_t M, gprat::CUDA_GPU &gpu) +hpx::shared_future +diag_posterior(const hpx::shared_future A, const hpx::shared_future B, std::size_t M, CUDA_GPU &gpu) { auto [cublas, stream] = gpu.next_cublas_handle(); @@ -27,7 +30,7 @@ hpx::shared_future diag_posterior( return hpx::make_ready_future(tile); } -hpx::shared_future diag_tile(const hpx::shared_future A, std::size_t M, gprat::CUDA_GPU &gpu) +hpx::shared_future diag_tile(const hpx::shared_future A, std::size_t M, CUDA_GPU &gpu) { double *diag_tile; check_cuda_error(cudaMalloc(&diag_tile, M * sizeof(double))); @@ -41,3 +44,5 @@ hpx::shared_future diag_tile(const hpx::shared_future A, std } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gpu/tiled_algorithms.cu b/core/src/gpu/tiled_algorithms.cu index 1ffdd866..3c479ffd 100644 --- a/core/src/gpu/tiled_algorithms.cu +++ b/core/src/gpu/tiled_algorithms.cu @@ -1,10 +1,13 @@ -#include "gpu/tiled_algorithms.cuh" +#include "gprat/gpu/tiled_algorithms.cuh" + +#include "gprat/gpu/adapter_cublas.cuh" +#include "gprat/gpu/gp_optimizer.cuh" +#include "gprat/gpu/gp_uncertainty.cuh" -#include "gpu/adapter_cublas.cuh" -#include "gpu/gp_optimizer.cuh" -#include "gpu/gp_uncertainty.cuh" #include +GPRAT_NS_BEGIN + namespace gpu { @@ -13,7 +16,7 @@ namespace gpu void 
right_looking_cholesky_tiled(std::vector> &ft_tiles, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu, + CUDA_GPU &gpu, const cusolverDnHandle_t &cusolver) { for (std::size_t k = 0; k < n_tiles; ++k) @@ -86,7 +89,7 @@ void forward_solve_tiled(std::vector> &ft_tiles, std::vector> &ft_rhs, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t k = 0; k < n_tiles; ++k) { @@ -120,7 +123,7 @@ void backward_solve_tiled(std::vector> &ft_tiles, std::vector> &ft_rhs, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { // NOTE: The loops traverse backwards. Its last comparisons require the // usage negative numbers. Therefore they use signed int instead of the @@ -160,7 +163,7 @@ void forward_solve_tiled_matrix( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t c = 0; c < m_tiles; ++c) { @@ -209,7 +212,7 @@ void backward_solve_tiled_matrix( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t c = 0; c < m_tiles; ++c) { @@ -258,7 +261,7 @@ void matrix_vector_tiled(std::vector> &ft_tiles, const std::size_t N_col, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t k = 0; k < m_tiles; ++k) { @@ -288,7 +291,7 @@ void symmetric_matrix_matrix_diagonal_tiled( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < m_tiles; ++i) { @@ -315,7 +318,7 @@ void compute_gemm_of_invK_y(std::vector> &ft_invK, std::vector> &ft_alpha, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < n_tiles; ++i) { @@ -344,7 +347,7 @@ hpx::shared_future compute_loss_tiled( std::vector> &ft_y, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { std::vector> loss_tiled(n_tiles); @@ -364,7 +367,7 @@ void symmetric_matrix_matrix_tiled( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t c = 0; c < m_tiles; ++c) { @@ -397,7 +400,7 @@ void vector_difference_tiled(std::vector> &ft_prior std::vector> &ft_vector, const std::size_t m_tile_size, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < m_tiles; i++) { @@ -409,7 +412,7 @@ void matrix_diagonal_tiled(std::vector> &ft_priorK, std::vector> &ft_vector, const std::size_t m_tile_size, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < m_tiles; i++) { @@ -422,7 +425,7 @@ void update_grad_K_tiled_mkl(std::vector> &ft_tiles const std::vector> &ft_v2, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < n_tiles; ++i) { @@ -441,8 +444,8 @@ static double update_hyperparameter( const std::vector> &ft_gradparam, const std::vector> &ft_alpha, double &hyperparameter, // lengthscale or vertical-lengthscale - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -451,7 +454,7 @@ static 
double update_hyperparameter( const std::vector> &beta2_T, int iter, int param_idx, // 0 for lengthscale, 1 for vertical-lengthscale - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { throw std::logic_error("Function not implemented for GPU"); // return 0; @@ -461,8 +464,8 @@ double update_lengthscale( const std::vector> &ft_invK, const std::vector> &ft_gradparam, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -470,7 +473,7 @@ double update_lengthscale( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { return update_hyperparameter( ft_invK, @@ -494,8 +497,8 @@ double update_vertical_lengthscale( const std::vector> &ft_invK, const std::vector> &ft_gradparam, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -503,7 +506,7 @@ double update_vertical_lengthscale( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { return update_hyperparameter( ft_invK, @@ -526,8 +529,8 @@ double update_vertical_lengthscale( double update_noise_variance( const std::vector> &ft_invK, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -535,10 +538,12 @@ double update_noise_variance( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { throw std::logic_error("Function not implemented for GPU"); // return 0; } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gp_hyperparameters.cpp b/core/src/hyperparameters.cpp similarity index 85% rename from core/src/gp_hyperparameters.cpp rename to core/src/hyperparameters.cpp index c7c0d9c0..2a4800ce 100644 --- a/core/src/gp_hyperparameters.cpp +++ b/core/src/hyperparameters.cpp @@ -1,11 +1,11 @@ -#include "gp_hyperparameters.hpp" +#include "gprat/hyperparameters.hpp" #include +#include -namespace gprat_hyper -{ +GPRAT_NS_BEGIN -AdamParams::AdamParams(double lr, double b1, double b2, double eps, int opt_i) : +AdamParams::AdamParams(double lr, double b1, double b2, double eps, std::size_t opt_i) : learning_rate(lr), beta1(b1), beta2(b2), @@ -29,4 +29,4 @@ std::string AdamParams::repr() const return oss.str(); } -} // namespace gprat_hyper +GPRAT_NS_END diff --git a/core/src/gp_kernels.cpp b/core/src/kernels.cpp similarity index 73% rename from core/src/gp_kernels.cpp rename to core/src/kernels.cpp index 42952e7e..9fd0218e 100644 --- a/core/src/gp_kernels.cpp +++ b/core/src/kernels.cpp @@ -1,13 +1,13 @@ -#include "gp_kernels.hpp" +#include "gprat/kernels.hpp" #include -namespace gprat_hyper -{ -SEKParams::SEKParams(double lengthscale_, double vertical_lengthscale_, double noise_variance_) : - lengthscale(lengthscale_), - vertical_lengthscale(vertical_lengthscale_), - noise_variance(noise_variance_) +GPRAT_NS_BEGIN + +SEKParams::SEKParams(double in_lengthscale, double in_vertical_lengthscale, double in_noise_variance) : + lengthscale(in_lengthscale), + vertical_lengthscale(in_vertical_lengthscale), + noise_variance(in_noise_variance) { 
m_T.resize(this->size());
     w_T.resize(this->size());
@@ -51,4 +51,5 @@ const double &SEKParams::get_param(std::size_t index) const
     }
     throw std::invalid_argument("Get Invalid param_idx");
 }
-} // namespace gprat_hyper
+
+GPRAT_NS_END
diff --git a/core/src/performance_counters.cpp b/core/src/performance_counters.cpp
new file mode 100644
index 00000000..0434e2bb
--- /dev/null
+++ b/core/src/performance_counters.cpp
@@ -0,0 +1,86 @@
+#include "gprat/performance_counters.hpp"
+
+#include
+#include
+#include
+#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS
+#include
+#endif
+
+GPRAT_NS_BEGIN
+
+#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name)                                                \
+    static std::atomic<std::uint64_t> name(0);                                                  \
+    std::uint64_t get_##name(bool reset) { return hpx::util::get_and_reset_value(name, reset); }
+
+GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(tile_data_allocations)
+GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(tile_data_deallocations)
+
+#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR
+
+void track_tile_data_allocation(std::size_t /*size*/) { tile_data_allocations += 1; }
+
+void track_tile_data_deallocation(std::size_t /*size*/) { tile_data_deallocations += 1; }
+
+#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS
+// These are non-public functions of their respective CUs.
+namespace detail
+{
+void register_fp32_performance_counters();
+void register_fp64_performance_counters();
+} // namespace detail
+
+void register_performance_counters()
+{
+    // XXX: you can do this with templates, but it's quite a bit more complicated
+#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name, stats_expr)                                    \
+    hpx::performance_counters::install_counter_type(                                           \
+        name,                                                                                   \
+        [](bool reset) { return hpx::util::get_and_reset_value(stats_expr, reset); },           \
+        #stats_expr,                                                                            \
+        "",                                                                                     \
+        hpx::performance_counters::counter_type::monotonically_increasing)
+
+    GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/tile_data/num_allocations", tile_data_allocations);
+    GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/tile_data/num_deallocations", tile_data_deallocations);
+
+#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR
+
+    detail::register_fp32_performance_counters();
+    detail::register_fp64_performance_counters();
+}
+
+#else
+void register_performance_counters()
+{
+    // no-op for binary compatibility
+}
+#endif
+
+void force_evict_memory(const void *start, std::size_t size)
+{
+    // A cache line size of 64 seems to be a safe estimate.
+ // see: https://lemire.me/blog/2023/12/12/measuring-the-size-of-the-cache-line-empirically/ + constexpr std::size_t cache_line_size = 64; + + const char *p = static_cast(start); + const char *end = p + size; + + _mm_mfence(); + do { + // Intel recommends clflushopt over normal clflush due to higher performance, see: + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf + _mm_clflush(p); + p += cache_line_size; + } while (p < end); + + // Make sure we don't miss a cache line at the end + if ((reinterpret_cast(p) & (cache_line_size - 1)) + != (reinterpret_cast(end - 1) & (cache_line_size - 1))) + { + _mm_clflush(end - 1); + } + _mm_mfence(); +} + +GPRAT_NS_END diff --git a/core/src/target.cpp b/core/src/target.cpp index 1b500702..3cd90504 100644 --- a/core/src/target.cpp +++ b/core/src/target.cpp @@ -1,16 +1,14 @@ -#include "target.hpp" +#include "gprat/target.hpp" #include #if GPRAT_WITH_CUDA -#include "gpu/cuda_utils.cuh" -using hpx::cuda::experimental::check_cuda_error; +#include "gprat/gpu/cuda_utils.cuh" #endif -namespace gprat -{ +GPRAT_NS_BEGIN -CPU::CPU() { } +CPU::CPU() = default; bool CPU::is_cpu() { return true; } @@ -154,4 +152,4 @@ int gpu_count() #endif } -} // namespace gprat +GPRAT_NS_END diff --git a/core/src/tile_data.cpp b/core/src/tile_data.cpp new file mode 100644 index 00000000..24ef9eb3 --- /dev/null +++ b/core/src/tile_data.cpp @@ -0,0 +1,34 @@ +#include "gprat/tile_data.hpp" + +#include "gprat/performance_counters.hpp" + +#include + +GPRAT_NS_BEGIN + +namespace detail +{ + +void *allocate_tile_data(std::size_t num_bytes) +{ + auto &topology = hpx::get_runtime().get_topology(); + const auto bitmap = topology.cpuset_to_nodeset(topology.get_machine_affinity_mask()); + + track_tile_data_allocation(num_bytes); + return topology.allocate_membind(num_bytes, bitmap, hpx::threads::hpx_hwloc_membind_policy::membind_firsttouch, 0); +} + +void deallocate_tile_data(void *p, std::size_t num_bytes) +{ + track_tile_data_deallocation(num_bytes); + + if (hpx::is_running()) + { + auto &topology = hpx::get_runtime().get_topology(); + topology.deallocate(p, num_bytes); + } +} + +} // namespace detail + +GPRAT_NS_END diff --git a/core/src/utils_c.cpp b/core/src/utils.cpp similarity index 76% rename from core/src/utils_c.cpp rename to core/src/utils.cpp index 896b7ad0..47935bfd 100644 --- a/core/src/utils_c.cpp +++ b/core/src/utils.cpp @@ -1,11 +1,10 @@ -#include "utils_c.hpp" +#include "gprat/utils.hpp" #include -namespace utils -{ +GPRAT_NS_BEGIN -int compute_train_tiles(int n_samples, int n_tile_size) +std::size_t compute_train_tiles(std::size_t n_samples, std::size_t n_tile_size) { if (n_tile_size > 0) { @@ -18,7 +17,7 @@ int compute_train_tiles(int n_samples, int n_tile_size) } } -int compute_train_tile_size(int n_samples, int n_tiles) +std::size_t compute_train_tile_size(std::size_t n_samples, std::size_t n_tiles) { if (n_tiles > 0) { @@ -31,10 +30,10 @@ int compute_train_tile_size(int n_samples, int n_tiles) } } -std::pair compute_test_tiles(int n_test, int n_tiles, int n_tile_size) +std::pair compute_test_tiles(std::size_t n_test, std::size_t n_tiles, std::size_t n_tile_size) { - int m_tiles; - int m_tile_size; + std::size_t m_tiles; + std::size_t m_tile_size; // if n_test is not divisible by (incl. 
smaller than) n_tile_size, use the same number of tiles if ((n_test % n_tile_size) > 0) @@ -51,10 +50,10 @@ std::pair compute_test_tiles(int n_test, int n_tiles, int n_tile_size) return { m_tiles, m_tile_size }; } -std::vector load_data(const std::string &file_path, int n_samples, int offset) +std::vector load_data(const std::string &file_path, std::size_t n_samples, std::size_t offset) { std::vector _data; - _data.resize(static_cast(n_samples + offset), 0.0); + _data.resize(n_samples + offset, 0.0); FILE *input_file = fopen(file_path.c_str(), "r"); if (input_file == NULL) @@ -63,11 +62,14 @@ std::vector load_data(const std::string &file_path, int n_samples, int o } // load data - int scanned_elements = 0; - for (int i = 0; i < n_samples; i++) + std::size_t scanned_elements = 0; + for (std::size_t i = 0; i < n_samples; i++) { - scanned_elements += - fscanf(input_file, "%lf", &_data[static_cast(i + offset)]); // scanned_elements++; + const auto r = fscanf(input_file, "%lf", &_data[(i + offset)]); + if (r > 0) + { + scanned_elements += static_cast(r); + } } fclose(input_file); @@ -141,4 +143,4 @@ bool compiled_with_cuda() #endif } -} // namespace utils +GPRAT_NS_END diff --git a/examples/gprat_cpp/src/execute.cpp b/examples/gprat_cpp/src/execute.cpp index 8c415727..7089155e 100644 --- a/examples/gprat_cpp/src/execute.cpp +++ b/examples/gprat_cpp/src/execute.cpp @@ -1,32 +1,63 @@ -#include "gprat_c.hpp" -#include "utils_c.hpp" +#include "gprat/gprat.hpp" +#include "gprat/utils.hpp" + #include #include #include int main(int argc, char *argv[]) { + namespace po = hpx::program_options; + po::options_description desc("Allowed options"); + // clang-format off + desc.add_options() + ("help", "produce help message") + ("train_x_path", po::value()->default_value("../../../data/data_1024/training_input.txt"), "training data (x)") + ("train_y_path", po::value()->default_value("../../../data/data_1024/training_output.txt"), "training data (y)") + ("test_path", po::value()->default_value("../../../data/data_1024/test_input.txt"), "test data") + ("tiles", po::value()->default_value(16), "tiles per dimension") + ("regressors", po::value()->default_value(8), "num regressors") + ("start-cores", po::value()->default_value(2), "num CPUs to start with") + ("end-cores", po::value()->default_value(4), "num CPUs to end with") + ("start", po::value()->default_value(512), "Starting number of training samples") + ("end", po::value()->default_value(1024), "End number of training samples") + ("step", po::value()->default_value(2), "Increment of training samples") + ("loop", po::value()->default_value(2), "Number of iterations to be performed for each number of training samples") + ("opt_iter", po::value()->default_value(1), "Number of optimization iterations*/") + ; + // clang-format on + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.contains("help")) + { + std::cout << desc << "\n"; + return 1; + } + ///////////////////// /////// configuration - std::size_t START = 512; - std::size_t END = 1024; - std::size_t STEP = 2; - std::size_t LOOP = 2; - const std::size_t OPT_ITER = 1; + std::size_t START = vm["start"].as(); + std::size_t END = vm["end"].as(); + std::size_t STEP = vm["step"].as(); + std::size_t LOOP = vm["loop"].as(); + const std::size_t OPT_ITER = vm["opt_iter"].as(); - int n_test = 1024; - const std::size_t N_CORES = 4; - const std::size_t n_tiles = 16; - const std::size_t n_reg = 8; + const std::size_t n_test = START; + const std::size_t 
N_CORES = vm["end-cores"].as(); + const std::size_t n_tiles = vm["tiles"].as(); + const std::size_t n_reg = vm["regressors"].as(); - std::string train_path = "../../../data/data_1024/training_input.txt"; - std::string out_path = "../../../data/data_1024/training_output.txt"; - std::string test_path = "../../../data/data_1024/test_input.txt"; + std::string train_path = vm["train_x_path"].as(); + std::string out_path = vm["train_y_path"].as(); + std::string test_path = vm["test_path"].as(); bool use_gpu = - utils::compiled_with_cuda() && gprat::gpu_count() > 0 && argc > 1 && std::strcmp(argv[1], "--use_gpu") == 0; + gprat::compiled_with_cuda() && gprat::gpu_count() > 0 && argc > 1 && std::strcmp(argv[1], "--use_gpu") == 0; - for (std::size_t core = 2; core <= N_CORES; core = core * 2) + for (std::size_t core = vm["start-cores"].as(); core <= N_CORES; core = core * 2) { // Create new argc and argv to include the --hpx:threads argument std::vector args(argv, argv + argc); @@ -48,15 +79,15 @@ int main(int argc, char *argv[]) for (std::size_t start = START; start <= END; start = start * STEP) { - int n_train = static_cast(start); + const auto n_train = start; for (std::size_t l = 0; l < LOOP; l++) { // Compute tile sizes and number of predict tiles - int tile_size = utils::compute_train_tile_size(n_train, n_tiles); - auto result = utils::compute_test_tiles(n_test, n_tiles, tile_size); + const auto tile_size = gprat::compute_train_tile_size(n_train, n_tiles); + const auto result = gprat::compute_test_tiles(n_test, n_tiles, tile_size); ///////////////////// ///// hyperparams - gprat_hyper::AdamParams hpar = { 0.1, 0.9, 0.999, 1e-8, OPT_ITER }; + gprat::AdamParams hpar = { 0.1, 0.9, 0.999, 1e-8, OPT_ITER }; ///////////////////// ////// data loading @@ -93,34 +124,32 @@ int main(int argc, char *argv[]) init_time = end_init - start_init; // Initialize HPX with the new arguments, don't run hpx_main - utils::start_hpx_runtime(new_argc, new_argv); + gprat::start_hpx_runtime(new_argc, new_argv); // Measure the time taken to execute gp.cholesky(); auto start_cholesky = std::chrono::high_resolution_clock::now(); - std::vector> choleksy_cpu = gp_cpu.cholesky(); + const auto choleksy_cpu = gp_cpu.cholesky(); auto end_cholesky = std::chrono::high_resolution_clock::now(); cholesky_time = end_cholesky - start_cholesky; // Measure the time taken to execute gp.optimize(hpar); auto start_opt = std::chrono::high_resolution_clock::now(); - std::vector losses = gp_cpu.optimize(hpar); + const auto losses = gp_cpu.optimize(hpar); auto end_opt = std::chrono::high_resolution_clock::now(); opt_time = end_opt - start_opt; auto start_pred_uncer = std::chrono::high_resolution_clock::now(); - std::vector> sum_cpu = - gp_cpu.predict_with_uncertainty(test_input.data, result.first, result.second); + const auto sum_cpu = gp_cpu.predict_with_uncertainty(test_input.data, result.first, result.second); auto end_pred_uncer = std::chrono::high_resolution_clock::now(); pred_uncer_time = end_pred_uncer - start_pred_uncer; auto start_pred_full_cov = std::chrono::high_resolution_clock::now(); - std::vector> full_cpu = - gp_cpu.predict_with_full_cov(test_input.data, result.first, result.second); + const auto full_cpu = gp_cpu.predict_with_full_cov(test_input.data, result.first, result.second); auto end_pred_full_cov = std::chrono::high_resolution_clock::now(); pred_full_cov_time = end_pred_full_cov - start_pred_full_cov; auto start_pred = std::chrono::high_resolution_clock::now(); - std::vector pred_cpu = gp_cpu.predict(test_input.data, 
result.first, result.second); + const auto pred_cpu = gp_cpu.predict(test_input.data, result.first, result.second); auto end_pred = std::chrono::high_resolution_clock::now(); pred_time = end_pred - start_pred; } @@ -143,10 +172,10 @@ int main(int argc, char *argv[]) init_time = end_init - start_init; // Initialize HPX with the new arguments, don't run hpx_main - utils::start_hpx_runtime(new_argc, new_argv); + gprat::start_hpx_runtime(new_argc, new_argv); auto start_cholesky = std::chrono::high_resolution_clock::now(); - std::vector<std::vector<double>> choleksy_gpu = gp_gpu.cholesky(); + const auto choleksy_gpu = gp_gpu.cholesky(); auto end_cholesky = std::chrono::high_resolution_clock::now(); cholesky_time = end_cholesky - start_cholesky; @@ -154,31 +183,29 @@ opt_time = std::chrono::seconds(-1); auto start_pred_uncer = std::chrono::high_resolution_clock::now(); - std::vector<std::vector<double>> sum_gpu = - gp_gpu.predict_with_uncertainty(test_input.data, result.first, result.second); + const auto sum_gpu = gp_gpu.predict_with_uncertainty(test_input.data, result.first, result.second); auto end_pred_uncer = std::chrono::high_resolution_clock::now(); pred_uncer_time = end_pred_uncer - start_pred_uncer; auto start_pred_full_cov = std::chrono::high_resolution_clock::now(); - std::vector<std::vector<double>> full_gpu = - gp_gpu.predict_with_full_cov(test_input.data, result.first, result.second); + const auto full_gpu = gp_gpu.predict_with_full_cov(test_input.data, result.first, result.second); auto end_pred_full_cov = std::chrono::high_resolution_clock::now(); pred_full_cov_time = end_pred_full_cov - start_pred_full_cov; auto start_pred = std::chrono::high_resolution_clock::now(); - std::vector<double> pred_gpu = gp_gpu.predict(test_input.data, result.first, result.second); + const auto pred_gpu = gp_gpu.predict(test_input.data, result.first, result.second); auto end_pred = std::chrono::high_resolution_clock::now(); pred_time = end_pred - start_pred; } // Stop the HPX runtime - utils::stop_hpx_runtime(); + gprat::stop_hpx_runtime(); auto end_total = std::chrono::high_resolution_clock::now(); auto total_time = end_total - start_total; // Save parameters and times to a .txt file with a header - std::ofstream outfile("../output.csv", std::ios::app); // Append mode + std::ofstream outfile("output.csv", std::ios::app); // Append mode if (outfile.tellp() == 0) { // If file is empty, write the header diff --git a/external_ports/README.md b/external_ports/README.md new file mode 100644 index 00000000..993ec19c --- /dev/null +++ b/external_ports/README.md @@ -0,0 +1,3 @@ +# What is this? + +This contains custom vcpkg ports and forks of official ones. 
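A note on the option handling added above: with hpx::program_options, whose API mirrors boost::program_options, every vm["..."].as<T>() must name exactly the T given in the corresponding po::value<T>() declaration; a mismatch throws at runtime rather than converting. A minimal, self-contained sketch of the pattern, written against boost::program_options directly and not part of the patch:

#include <boost/program_options.hpp>
#include <cstddef>
#include <iostream>

namespace po = boost::program_options;

int main(int argc, char *argv[])
{
    po::options_description desc("Allowed options");
    // Declared as std::size_t, so it must be read back as std::size_t.
    desc.add_options()
        ("tiles", po::value<std::size_t>()->default_value(16), "tiles per dimension");

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    po::notify(vm);

    // vm["tiles"].as<int>() would throw boost::bad_any_cast here,
    // because the stored type is std::size_t, not int.
    std::cout << "tiles = " << vm["tiles"].as<std::size_t>() << '\n';
}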
diff --git a/external_ports/intel-mkl/copy-from-dmg.cmake b/external_ports/intel-mkl/copy-from-dmg.cmake new file mode 100644 index 00000000..a5aa67cd --- /dev/null +++ b/external_ports/intel-mkl/copy-from-dmg.cmake @@ -0,0 +1,53 @@ +find_program(HDIUTIL NAMES hdiutil REQUIRED) +set(dmg_path "NOTFOUND" CACHE FILEPATH "Where to find the DMG") +set(output_dir "output_dir" CACHE FILEPATH "Where to put the packages") + +if(NOT EXISTS "${dmg_path}") + message(FATAL_ERROR "'dmg_path' (${dmg_path}) does not exist.") +endif() +if(NOT IS_DIRECTORY "${output_dir}") + message(FATAL_ERROR "'output_dir' (${output_dir}) is not a directory.") +endif() + +execute_process( + COMMAND mktemp -d + RESULT_VARIABLE mktemp_result + OUTPUT_VARIABLE mount_point + OUTPUT_STRIP_TRAILING_WHITESPACE +) +if(NOT mktemp_result STREQUAL "0") + message(FATAL_ERROR "mktemp -d failed: ${mktemp_result}") +elseif(NOT IS_DIRECTORY "${mount_point}") + message(FATAL_ERROR "'mount_point' (${mount_point}) is not a directory.") +endif() + +execute_process( + COMMAND "${HDIUTIL}" attach "${dmg_path}" -mountpoint "${mount_point}" -readonly + RESULT_VARIABLE mount_result +) +if(mount_result STREQUAL "0") + set(dmg_packages_dir "${mount_point}/bootstrapper.app/Contents/Resources/packages") + file(GLOB packages + "${dmg_packages_dir}/intel.oneapi.mac.mkl.devel,*" + "${dmg_packages_dir}/intel.oneapi.mac.mkl.runtime,*" + "${dmg_packages_dir}/intel.oneapi.mac.mkl.product,*" + "${dmg_packages_dir}/intel.oneapi.mac.openmp,*" + ) + # Using execute_process to avoid direct errors + execute_process( + COMMAND cp -R ${packages} "${output_dir}/" + RESULT_VARIABLE copy_result + ) +endif() +execute_process( + COMMAND "${HDIUTIL}" detach "${mount_point}" + RESULT_VARIABLE unmount_result +) + +if(NOT mount_result STREQUAL "0") + message(FATAL_ERROR "Mounting ${dmg_path} failed: ${mount_result}") +elseif(NOT copy_result STREQUAL "0") + message(FATAL_ERROR "Copying packages failed: ${copy_result}") +elseif(NOT unmount_result STREQUAL "0") + message(FATAL_ERROR "Unmounting ${dmg_path} failed: ${unmount_result}") +endif() diff --git a/external_ports/intel-mkl/portfile.cmake b/external_ports/intel-mkl/portfile.cmake new file mode 100644 index 00000000..b07c79f1 --- /dev/null +++ b/external_ports/intel-mkl/portfile.cmake @@ -0,0 +1,256 @@ +# This package installs Intel MKL on Linux, macOS and Windows for x64. 
+# Configuration: +# - lp64 +# - sequential + +set(VCPKG_POLICY_EMPTY_PACKAGE enabled) + +# https://registrationcenter-download.intel.com/akdlm/IRC_NAS/19150/w_onemkl_p_2023.0.0.25930_offline.exe # windows +# https://registrationcenter-download.intel.com/akdlm/IRC_NAS/19116/m_onemkl_p_2023.0.0.25376_offline.dmg # macos +# https://registrationcenter-download.intel.com/akdlm/irc_nas/19138/l_onemkl_p_2023.0.0.25398_offline.sh # linux +set(sha "") +if(NOT VCPKG_TARGET_ARCHITECTURE STREQUAL "x64") + # nop +elseif(VCPKG_TARGET_IS_WINDOWS) + set(filename w_onemkl_p_2023.0.0.25930_offline.exe) + set(magic_number 19150) + set(sha a3eb6b75241a2eccb73ed73035ff111172c55d3fa51f545c7542277a155df84ff72fc826621711153e683f84058e64cb549c030968f9f964531db76ca8a3ed46) + set(package_infix "win") +elseif(VCPKG_TARGET_IS_OSX) + set(filename m_onemkl_p_2023.0.0.25376_offline.dmg) + set(magic_number 19116) + set(sha 7b9b8c004054603e6830fb9b9c049d5a4cfc0990c224cb182ac5262ab9f1863775a67491413040e3349c590e2cca58edcfc704db9f3b9f9faa8b5b09022cd2af) + set(package_infix "mac") + set(package_libdir "lib") + set(compiler_libdir "mac/compiler/lib") +elseif(VCPKG_TARGET_IS_LINUX) + set(filename l_onemkl_p_2023.0.0.25398_offline.sh) + set(magic_number 19138) + set(sha b5f2f464675f0fd969dde2faf2e622b834eb1cc406c4a867148116f6c24ba5c709d98b678840f4a89a1778e12cde0ff70ce2ef59faeef3d3f3aa1d0329c71af1) + set(package_infix "lin") + set(package_libdir "lib/intel64") + set(compiler_libdir "linux/compiler/lib/intel64_lin") +endif() + +if(NOT sha) + message(WARNING "${PORT} is empty for ${TARGET_TRIPLET}.") + return() +endif() + +vcpkg_download_distfile(installer_path + URLS "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/${magic_number}/${filename}" + FILENAME "${filename}" + SHA512 "${sha}" +) + +# Note: intel_thread and lp64 are the defaults. +set(interface "lp64") # or ilp64; ilp == 64 bit int api +#https://www.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/linking-your-application-with-onemkl/linking-in-detail/linking-with-interface-libraries/using-the-ilp64-interface-vs-lp64-interface.html +set(threading "sequential") +if(threading STREQUAL "intel_thread") + set(short_thread "iomp") +else() + string(SUBSTRING "${threading}" "0" "3" short_thread) +endif() +set(main_pc_file "mkl-${VCPKG_LIBRARY_LINKAGE}-${interface}-${short_thread}.pc") + +# First extraction level: packages (from offline installer) +set(extract_0_dir "${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-extract") +file(REMOVE_RECURSE "${extract_0_dir}") +file(MAKE_DIRECTORY "${extract_0_dir}") + +# Second extraction level: actual files (from packages) +set(extract_1_dir "${CURRENT_PACKAGES_DIR}/intel-extract") +file(REMOVE_RECURSE "${extract_1_dir}") +file(MAKE_DIRECTORY "${extract_1_dir}") + +file(MAKE_DIRECTORY "${CURRENT_PACKAGES_DIR}/lib/pkgconfig") + +if(VCPKG_TARGET_IS_WINDOWS) + vcpkg_find_acquire_program(7Z) + message(STATUS "Extracting offline installer") + vcpkg_execute_required_process( + COMMAND "${7Z}" x "${installer_path}" "-o${extract_0_dir}" "-y" "-bso0" "-bsp0" + WORKING_DIRECTORY "${extract_0_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-0" + ) + + set(packages + "intel.oneapi.win.mkl.devel,v=2023.0.0-25930/oneapi-mkl-devel-for-installer_p_2023.0.0.25930.msi" # has the required libs. 
+ "intel.oneapi.win.mkl.runtime,v=2023.0.0-25930/oneapi-mkl-for-installer_p_2023.0.0.25930.msi" # has the required DLLs + #"intel.oneapi.win.compilers-common-runtime,v=2023.0.0-25922" # SVML + "intel.oneapi.win.openmp,v=2023.0.0-25922/oneapi-comp-openmp-for-installer_p_2023.0.0.25922.msi" # OpenMP + #"intel.oneapi.win.tbb.runtime,v=2021.8.0-25874" #TBB + ) + + foreach(pack IN LISTS packages) + set(package_path "${extract_0_dir}/packages/${pack}") + cmake_path(GET pack STEM LAST_ONLY packstem) + cmake_path(NATIVE_PATH package_path package_path_native) + vcpkg_execute_required_process( + COMMAND "${LESSMSI}" x "${package_path_native}" + WORKING_DIRECTORY "${extract_1_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" + ) + file(COPY "${extract_1_dir}/${packstem}/SourceDir/" DESTINATION "${extract_1_dir}") + file(REMOVE_RECURSE "${extract_1_dir}/${packstem}") + endforeach() + + set(mkl_dir "${extract_1_dir}/Intel/Compiler/12.0/mkl/2023.0.0") + file(COPY "${mkl_dir}/include/" DESTINATION "${CURRENT_PACKAGES_DIR}/include") + # see https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-link-line-advisor.html for linking + if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") + set(files "mkl_core_dll.lib" "mkl_${threading}_dll.lib" "mkl_intel_${interface}_dll.lib" "mkl_blas95_${interface}.lib" "mkl_lapack95_${interface}.lib") # "mkl_rt.lib" single dynamic lib with dynamic dispatch + file(COPY "${mkl_dir}/redist/intel64/" DESTINATION "${CURRENT_PACKAGES_DIR}/bin") # Could probably be reduced instead of copying all + if(NOT VCPKG_BUILD_TYPE) + file(COPY "${mkl_dir}/redist/intel64/" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/bin") + endif() + else() + set(files "mkl_core.lib" "mkl_${threading}.lib" "mkl_intel_${interface}.lib" "mkl_blas95_${interface}.lib" "mkl_lapack95_${interface}.lib") + endif() + foreach(file IN LISTS files) + file(COPY "${mkl_dir}/lib/intel64/${file}" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") # instead of manual-link keep normal structure + if(NOT VCPKG_BUILD_TYPE) + file(COPY "${mkl_dir}/lib/intel64/${file}" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/lib/intel64") + endif() + endforeach() + file(COPY_FILE "${mkl_dir}/lib/pkgconfig/${main_pc_file}" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}") + + set(compiler_dir "${extract_1_dir}/Intel/Compiler/12.0/compiler/2023.0.0") + if(threading STREQUAL "intel_thread") + file(COPY "${compiler_dir}/windows/redist/intel64_win/compiler/" DESTINATION "${CURRENT_PACKAGES_DIR}/bin") + file(COPY "${compiler_dir}/windows/compiler/lib/intel64_win/" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") + file(COPY_FILE "${compiler_dir}/lib/pkgconfig/openmp.pc" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc" "/windows/compiler/lib/intel64_win/" "/lib/intel64/") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc" "-I \${includedir}" "-I\"\${includedir}\"") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "openmp" "libiomp5") + if(NOT VCPKG_BUILD_TYPE) + file(COPY "${compiler_dir}/windows/redist/intel64_win/compiler/" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/bin") + file(COPY "${compiler_dir}/windows/compiler/lib/intel64_win/" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/lib/intel64") + endif() + endif() +else() + message(STATUS "Warning: This port is still a work on progress. + E.g. it is not correctly filtering the libraries in accordance with + VCPKG_LIBRARY_LINKAGE. 
It is using the default threading (Intel OpenMP) + which is known to segfault when used together with GNU OpenMP. +") + + message(STATUS "Extracting offline installer") + if(VCPKG_TARGET_IS_LINUX) + vcpkg_execute_required_process( + COMMAND "bash" "--verbose" "--noprofile" "${installer_path}" "--extract-only" "--extract-folder" "${extract_0_dir}" + WORKING_DIRECTORY "${extract_0_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-0" + ) + file(RENAME "${extract_0_dir}/l_onemkl_p_2023.0.0.25398_offline/packages" "${extract_0_dir}/packages") + elseif(VCPKG_TARGET_IS_OSX) + find_program(HDIUTIL NAMES hdiutil REQUIRED) + file(MAKE_DIRECTORY "${extract_0_dir}/packages") + message(STATUS "... Don't interrupt.") + vcpkg_execute_required_process( + COMMAND "${CMAKE_COMMAND}" "-Ddmg_path=${installer_path}" + "-Doutput_dir=${extract_0_dir}/packages" + "-DHDIUTIL=${HDIUTIL}" + -P "${CMAKE_CURRENT_LIST_DIR}/copy-from-dmg.cmake" + WORKING_DIRECTORY "${extract_0_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-0" + ) + message(STATUS "... Done.") + endif() + + file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.mkl.runtime,v=2023.0.0-*") + cmake_path(GET package_path STEM LAST_ONLY packstem) + message(STATUS "Extracting ${packstem}") + vcpkg_execute_required_process( + COMMAND "${CMAKE_COMMAND}" "-E" "tar" "-xf" "${package_path}/cupPayload.cup" + "_installdir/mkl/2023.0.0/lib" + "_installdir/mkl/2023.0.0/licensing" + WORKING_DIRECTORY "${extract_1_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" + ) + file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.mkl.devel,v=2023.0.0-*") + cmake_path(GET package_path STEM LAST_ONLY packstem) + message(STATUS "Extracting ${packstem}") + vcpkg_execute_required_process( + COMMAND "${CMAKE_COMMAND}" "-E" "tar" "-xf" "${package_path}/cupPayload.cup" + "_installdir/mkl/2023.0.0/bin" + "_installdir/mkl/2023.0.0/include" + "_installdir/mkl/2023.0.0/lib" + WORKING_DIRECTORY "${extract_1_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" + ) + file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.openmp,v=2023.0.0-*") + cmake_path(GET package_path STEM LAST_ONLY packstem) + message(STATUS "Extracting ${packstem}") + vcpkg_execute_required_process( + COMMAND "${CMAKE_COMMAND}" "-E" "tar" "-xf" "${package_path}/cupPayload.cup" + "_installdir/compiler/2023.0.0" + WORKING_DIRECTORY "${extract_1_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" + ) + + set(mkl_dir "${extract_1_dir}/_installdir/mkl/2023.0.0") + file(COPY "${mkl_dir}/include/" DESTINATION "${CURRENT_PACKAGES_DIR}/include") + file(COPY "${mkl_dir}/${package_libdir}/" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") + if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") + set(to_remove_suffix .a) + elseif(VCPKG_TARGET_IS_OSX) + set(to_remove_suffix .dylib) + else() + set(to_remove_suffix .so) + endif() + file(GLOB_RECURSE files_to_remove + "${CURRENT_PACKAGES_DIR}/lib/intel64/*${to_remove_suffix}" + "${CURRENT_PACKAGES_DIR}/lib/intel64/*${to_remove_suffix}.?" 
+ ) + file(REMOVE ${files_to_remove}) + file(COPY_FILE "${mkl_dir}/lib/pkgconfig/${main_pc_file}" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "\${exec_prefix}/${package_libdir}" "\${exec_prefix}/lib/intel64" IGNORE_UNCHANGED) + + set(compiler_dir "${extract_1_dir}/_installdir/compiler/2023.0.0") + if(threading STREQUAL "intel_thread") + file(COPY "${compiler_dir}/${compiler_libdir}/" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") + file(COPY_FILE "${compiler_dir}/lib/pkgconfig/openmp.pc" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc" "/${compiler_libdir}/" "/lib/intel64/" IGNORE_UNCHANGED) + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "openmp" "libiomp5") + endif() +endif() + +file(COPY_FILE "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/mkl.pc") +if(NOT VCPKG_BUILD_TYPE) + file(MAKE_DIRECTORY "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig") + file(GLOB pc_files RELATIVE "${CURRENT_PACKAGES_DIR}/lib/pkgconfig" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/*.pc") + foreach(file IN LISTS pc_files) + file(COPY_FILE "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${file}" "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/${file}") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/${file}" "/include" "/../include") + if(NOT VCPKG_TARGET_IS_WINDOWS) + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/${file}" "/lib/intel64" "/../lib/intel64") + endif() + endforeach() +endif() + +file(COPY "${mkl_dir}/lib/cmake/" DESTINATION "${CURRENT_PACKAGES_DIR}/share/") +vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/share/mkl/MKLConfig.cmake" "MKL_CMAKE_PATH}/../../../" "MKL_CMAKE_PATH}/../../") +vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/share/mkl/MKLConfig.cmake" "redist/\${MKL_ARCH}" "bin") +if(${VCPKG_LIBRARY_LINKAGE} STREQUAL "static") +vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/share/mkl/MKLConfig.cmake" "define_param(MKL_LINK DEFAULT_MKL_LINK MKL_LINK_LIST)" +[[define_param(MKL_LINK DEFAULT_MKL_LINK MKL_LINK_LIST) + set(MKL_LINK "static") +]]) +endif() +#TODO: Hardcode settings from portfile in config.cmake +#TODO: Give lapack/blas information about the correct BLA_VENDOR depending on settings. 
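For context, the lp64/sequential layout selected at the top of this portfile is what consumers ultimately link against via the MKL::MKL target advertised in the usage file below. A minimal C++ consumer sketch, assuming only MKL's standard mkl.h header and the CBLAS entry point cblas_dgemm that MKL ships; it is not part of the port:

#include <mkl.h> // provided by the intel-mkl port via the MKL::MKL target

int main()
{
    // Row-major 2x2 GEMM: C = 1.0 * A * B + 0.0 * C.
    const double A[4] = { 1.0, 2.0, 3.0, 4.0 };
    const double B[4] = { 5.0, 6.0, 7.0, 8.0 };
    double C[4] = { 0.0, 0.0, 0.0, 0.0 };

    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2,    // m, n, k
                1.0, A, 2,  // alpha, A, lda
                B, 2,       // B, ldb
                0.0, C, 2); // beta, C, ldc

    // First element of A * B should be 1*5 + 2*7 = 19.
    return C[0] == 19.0 ? 0 : 1;
}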
+ +file(INSTALL "${mkl_dir}/licensing" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") +file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.mkl.product,v=2023.0.0-*") +vcpkg_install_copyright(FILE_LIST "${package_path}/licenses/license.htm") + +file(REMOVE_RECURSE + "${extract_0_dir}" + "${extract_1_dir}" + "${CURRENT_PACKAGES_DIR}/lib/intel64/cmake" + "${CURRENT_PACKAGES_DIR}/lib/intel64/pkgconfig" +) + +file(INSTALL "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") diff --git a/external_ports/intel-mkl/usage b/external_ports/intel-mkl/usage new file mode 100644 index 00000000..b8ee798f --- /dev/null +++ b/external_ports/intel-mkl/usage @@ -0,0 +1,4 @@ +intel-mkl provides CMake targets: + + find_package(MKL CONFIG REQUIRED) + target_link_libraries(main PRIVATE MKL::MKL) diff --git a/external_ports/intel-mkl/vcpkg.json b/external_ports/intel-mkl/vcpkg.json new file mode 100644 index 00000000..fc0a76ec --- /dev/null +++ b/external_ports/intel-mkl/vcpkg.json @@ -0,0 +1,16 @@ +{ + "name": "intel-mkl", + "version": "2023.0.0", + "port-version": 5, + "description": "Intel® Math Kernel Library (Intel® MKL) accelerates math processing routines, increases application performance, and reduces development time on Intel® processors.", + "homepage": "https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html", + "license": null, + "supports": "(windows | linux | osx) & x64", + "dependencies": [ + { + "name": "vcpkg-tool-lessmsi", + "host": true, + "platform": "windows" + } + ] +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d5378540..9618627f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -27,7 +27,7 @@ find_package(Boost REQUIRED) add_executable(GPRat_test_output_correctness src/output_correctness.cpp) target_link_libraries(GPRat_test_output_correctness - PRIVATE GPRat::core Catch2::Catch2WithMain) + PRIVATE GPRat::core Catch2::Catch2WithMain Boost::boost) target_compile_features(GPRat_test_output_correctness PRIVATE cxx_std_17) add_test( diff --git a/test/src/output_correctness.cpp b/test/src/output_correctness.cpp index 1fc73536..1e7ca8fc 100644 --- a/test/src/output_correctness.cpp +++ b/test/src/output_correctness.cpp @@ -1,5 +1,6 @@ -#include "gprat_c.hpp" -#include "utils_c.hpp" +#include "gprat/gprat.hpp" +#include "gprat/utils.hpp" + #include <boost/json.hpp> #include <catch2/catch_all.hpp> @@ -40,6 +41,36 @@ void tag_invoke(boost::json::value_from_tag, boost::json::value &jv, const gprat }; } +template <typename T> +std::vector<T> to_vector(const gprat::const_tile_data<T> &data) +{ + return { data.begin(), data.end() }; +} + +template <typename T> +std::vector<std::vector<T>> to_vector(const std::vector<gprat::const_tile_data<T>> &data) +{ + std::vector<std::vector<T>> out; + out.reserve(data.size()); + for (const auto &row : data) + { + out.emplace_back(to_vector(row)); + } + return out; +} + +template +std::vector> to_vector(const std::vector> &data) +{ + std::vector> out; + out.reserve(data.size()); + for (const auto &row : data) + { + out.emplace_back(to_vector(row)); + } + return out; +} + // This helper function deduces the type and assigns the value with the matching key template <typename T> inline void extract(const boost::json::object &obj, T &t, std::string_view key) @@ -73,11 +104,11 @@ gprat_results run_on_data_cpu(const std::string &train_path, const std::string & const std::size_t n_reg = 8; // Compute tile sizes and number of predict tiles - const int tile_size = utils::compute_train_tile_size(n_train, n_tiles); - const auto test_tiles = utils::compute_test_tiles(n_test, n_tiles, tile_size); + const 
auto tile_size = gprat::compute_train_tile_size(n_train, n_tiles); + const auto test_tiles = gprat::compute_test_tiles(n_test, n_tiles, tile_size); // hyperparams - gprat_hyper::AdamParams hpar = { 0.1, 0.9, 0.999, 1e-8, OPT_ITER }; + gprat::AdamParams hpar = { 0.1, 0.9, 0.999, 1e-8, OPT_ITER }; // data loading gprat::GP_data training_input(train_path, n_train, n_reg); @@ -90,22 +121,17 @@ gprat_results run_on_data_cpu(const std::string &train_path, const std::string & training_input.data, training_output.data, n_tiles, tile_size, n_reg, { 1.0, 1.0, 0.1 }, trainable); // Initialize HPX with no arguments, don't run hpx_main - utils::start_hpx_runtime(0, nullptr); + gprat::start_hpx_runtime(0, nullptr); gprat_results results_cpu; - - results_cpu.choleksy = gp_cpu.cholesky(); - + results_cpu.choleksy = to_vector(gp_cpu.cholesky()); results_cpu.losses = gp_cpu.optimize(hpar); - results_cpu.sum = gp_cpu.predict_with_uncertainty(test_input.data, test_tiles.first, test_tiles.second); - results_cpu.full = gp_cpu.predict_with_full_cov(test_input.data, test_tiles.first, test_tiles.second); - results_cpu.pred = gp_cpu.predict(test_input.data, test_tiles.first, test_tiles.second); // Stop the HPX runtime - utils::stop_hpx_runtime(); + gprat::stop_hpx_runtime(); return results_cpu; } @@ -120,8 +146,8 @@ gprat_results run_on_data_gpu(const std::string &train_path, const std::string & const int gpu_id = 0; const int n_streams = 1; - const int tile_size = utils::compute_train_tile_size(n_train, n_tiles); - const auto test_tiles = utils::compute_test_tiles(n_test, n_tiles, tile_size); + const auto tile_size = gprat::compute_train_tile_size(n_train, n_tiles); + const auto test_tiles = gprat::compute_test_tiles(n_test, n_tiles, tile_size); gprat::GP_data training_input(train_path, n_train, n_reg); gprat::GP_data training_output(out_path, n_train, n_reg); @@ -139,16 +165,16 @@ gprat_results run_on_data_gpu(const std::string &train_path, const std::string & gpu_id, n_streams); - utils::start_hpx_runtime(0, nullptr); + gprat::start_hpx_runtime(0, nullptr); gprat_results results_gpu; - results_gpu.choleksy = gp_gpu.cholesky(); + results_gpu.choleksy = to_vector(gp_gpu.cholesky()); // NOTE: optimize and optimize_step are currently not implemented for GPU results_gpu.sum_no_optimize = gp_gpu.predict_with_uncertainty(test_input.data, test_tiles.first, test_tiles.second); results_gpu.full_no_optimize = gp_gpu.predict_with_full_cov(test_input.data, test_tiles.first, test_tiles.second); results_gpu.pred_no_optimize = gp_gpu.predict(test_input.data, test_tiles.first, test_tiles.second); - utils::stop_hpx_runtime(); + gprat::stop_hpx_runtime(); return results_gpu; } @@ -256,7 +282,7 @@ TEST_CASE("GP CPU results match known-good values", "[integration][cpu]") // NOTE: using higher tolerance than for CPU TEST_CASE("GP GPU results match known-good values (no loss)", "[integration][gpu]") { - if (!utils::compiled_with_cuda()) + if (!gprat::compiled_with_cuda()) { WARN("CUDA not available — skipping GPU test."); return; diff --git a/vcpkg-configuration.json b/vcpkg-configuration.json new file mode 100644 index 00000000..3afcbd70 --- /dev/null +++ b/vcpkg-configuration.json @@ -0,0 +1,6 @@ +{ + "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg-configuration.schema.json", + "overlay-ports": [ + "./external_ports" + ] +} diff --git a/vcpkg.json b/vcpkg.json new file mode 100644 index 00000000..0b252332 --- /dev/null +++ b/vcpkg.json @@ -0,0 +1,31 @@ +{ + "$schema": 
"https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg.schema.json", + "name": "gprat", + "version-semver": "0.1.0", + "dependencies": [ + { + "name": "boost-json" + }, + { + "name": "intel-mkl" + }, + { + "name": "fmt" + }, + { + "name": "hpx", + "features": [ + "cuda", + "bzip2", + "mpi", + "snappy", + "zlib" + ] + }, + { + "name": "cuda" + } + ], + "default-features": [], + "builtin-baseline": "365f6444ab40ee87c73c947b475b3a267b3cb77c" +}